GenericPacketMath.h 21.7 KB
Newer Older
LM's avatar
LM committed
1 2 3 4 5 6
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
Don Gagne's avatar
Don Gagne committed
7 8 9
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
LM's avatar
LM committed
10 11 12 13

#ifndef EIGEN_GENERIC_PACKET_MATH_H
#define EIGEN_GENERIC_PACKET_MATH_H

Don Gagne's avatar
Don Gagne committed
14 15
namespace Eigen {

LM's avatar
LM committed
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
namespace internal {

/** \internal
  * \file GenericPacketMath.h
  *
  * Default implementation for types not supported by the vectorization.
  * In practice these functions are provided to make easier the writing
  * of generic vectorized code.
  */

#ifndef EIGEN_DEBUG_ALIGNED_LOAD
#define EIGEN_DEBUG_ALIGNED_LOAD
#endif

#ifndef EIGEN_DEBUG_UNALIGNED_LOAD
#define EIGEN_DEBUG_UNALIGNED_LOAD
#endif

#ifndef EIGEN_DEBUG_ALIGNED_STORE
#define EIGEN_DEBUG_ALIGNED_STORE
#endif

#ifndef EIGEN_DEBUG_UNALIGNED_STORE
#define EIGEN_DEBUG_UNALIGNED_STORE
#endif

struct default_packet_traits
{
  enum {
45 46
    HasHalfPacket = 0,

LM's avatar
LM committed
47 48 49 50 51
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasNegate = 1,
    HasAbs    = 1,
52
    HasArg    = 0,
LM's avatar
LM committed
53 54 55 56 57
    HasAbs2   = 1,
    HasMin    = 1,
    HasMax    = 1,
    HasConj   = 1,
    HasSetLinear = 1,
58
    HasBlend  = 0,
LM's avatar
LM committed
59 60 61

    HasDiv    = 0,
    HasSqrt   = 0,
62
    HasRsqrt  = 0,
LM's avatar
LM committed
63 64
    HasExp    = 0,
    HasLog    = 0,
65 66
    HasLog1p  = 0,
    HasLog10  = 0,
LM's avatar
LM committed
67 68 69 70 71 72 73
    HasPow    = 0,

    HasSin    = 0,
    HasCos    = 0,
    HasTan    = 0,
    HasASin   = 0,
    HasACos   = 0,
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
    HasATan   = 0,
    HasSinh   = 0,
    HasCosh   = 0,
    HasTanh   = 0,
    HasLGamma = 0,
    HasDiGamma = 0,
    HasZeta = 0,
    HasPolygamma = 0,
    HasErf = 0,
    HasErfc = 0,
    HasIGamma = 0,
    HasIGammac = 0,
    HasBetaInc = 0,

    HasRound  = 0,
    HasFloor  = 0,
    HasCeil   = 0,

    HasSign   = 0
LM's avatar
LM committed
93 94 95 96 97 98
  };
};

template<typename T> struct packet_traits : default_packet_traits
{
  typedef T type;
99
  typedef T half;
LM's avatar
LM committed
100 101 102
  enum {
    Vectorizable = 0,
    size = 1,
103 104
    AlignedOnScalar = 0,
    HasHalfPacket = 0
LM's avatar
LM committed
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
  };
  enum {
    HasAdd    = 0,
    HasSub    = 0,
    HasMul    = 0,
    HasNegate = 0,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasConj   = 0,
    HasSetLinear = 0
  };
};

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
template<typename T> struct packet_traits<const T> : packet_traits<T> { };

template <typename Src, typename Tgt> struct type_casting_traits {
  enum {
    VectorizedCast = 0,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};


/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
template <typename SrcPacket, typename TgtPacket>
EIGEN_DEVICE_FUNC inline TgtPacket
pcast(const SrcPacket& a) {
  return static_cast<TgtPacket>(a);
}
template <typename SrcPacket, typename TgtPacket>
EIGEN_DEVICE_FUNC inline TgtPacket
pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
  return static_cast<TgtPacket>(a);
}

template <typename SrcPacket, typename TgtPacket>
EIGEN_DEVICE_FUNC inline TgtPacket
pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
  return static_cast<TgtPacket>(a);
}

LM's avatar
LM committed
149
/** \internal \returns a + b (coeff-wise) */
150
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
151 152 153 154
padd(const Packet& a,
        const Packet& b) { return a+b; }

/** \internal \returns a - b (coeff-wise) */
155
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
156 157 158 159
psub(const Packet& a,
        const Packet& b) { return a-b; }

/** \internal \returns -a (coeff-wise) */
160
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
161 162 163
pnegate(const Packet& a) { return -a; }

/** \internal \returns conj(a) (coeff-wise) */
164 165

template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
Don Gagne's avatar
Don Gagne committed
166
pconj(const Packet& a) { return numext::conj(a); }
LM's avatar
LM committed
167 168

/** \internal \returns a * b (coeff-wise) */
169
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
170 171 172 173
pmul(const Packet& a,
        const Packet& b) { return a*b; }

/** \internal \returns a / b (coeff-wise) */
174
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
175 176 177 178
pdiv(const Packet& a,
        const Packet& b) { return a/b; }

/** \internal \returns the min of \a a and \a b  (coeff-wise) */
179
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
180
pmin(const Packet& a,
181
        const Packet& b) { return numext::mini(a, b); }
LM's avatar
LM committed
182 183

/** \internal \returns the max of \a a and \a b  (coeff-wise) */
184
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
185
pmax(const Packet& a,
186
        const Packet& b) { return numext::maxi(a, b); }
LM's avatar
LM committed
187 188

/** \internal \returns the absolute value of \a a */
189
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
Don Gagne's avatar
Don Gagne committed
190
pabs(const Packet& a) { using std::abs; return abs(a); }
LM's avatar
LM committed
191

192 193 194 195
/** \internal \returns the phase angle of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
parg(const Packet& a) { using numext::arg; return arg(a); }

LM's avatar
LM committed
196
/** \internal \returns the bitwise and of \a a and \a b */
197
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
198 199 200
pand(const Packet& a, const Packet& b) { return a & b; }

/** \internal \returns the bitwise or of \a a and \a b */
201
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
202 203 204
por(const Packet& a, const Packet& b) { return a | b; }

/** \internal \returns the bitwise xor of \a a and \a b */
205
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
206 207 208
pxor(const Packet& a, const Packet& b) { return a ^ b; }

/** \internal \returns the bitwise andnot of \a a and \a b */
209
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
210 211 212
pandnot(const Packet& a, const Packet& b) { return a & (!b); }

/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
213
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
214 215 216
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }

/** \internal \returns a packet version of \a *from, (un-aligned load) */
217
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
218 219
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }

220 221 222 223 224 225 226 227
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }

/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }

Don Gagne's avatar
Don Gagne committed
228
/** \internal \returns a packet with elements of \a *from duplicated.
229 230
  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
Don Gagne's avatar
Don Gagne committed
231
  * Currently, this function is only used for scalar * complex products.
232 233
  */
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
LM's avatar
LM committed
234 235
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
/** \internal \returns a packet with elements of \a *from quadrupled.
  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
  * Currently, this function is only used in matrix products.
  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
  */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadquad(const typename unpacket_traits<Packet>::type* from)
{ return pload1<Packet>(from); }

/** \internal equivalent to
  * \code
  * a0 = pload1(a+0);
  * a1 = pload1(a+1);
  * a2 = pload1(a+2);
  * a3 = pload1(a+3);
  * \endcode
  * \sa pset1, pload1, ploaddup, pbroadcast2
  */
template<typename Packet> EIGEN_DEVICE_FUNC
inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
  a0 = pload1<Packet>(a+0);
  a1 = pload1<Packet>(a+1);
  a2 = pload1<Packet>(a+2);
  a3 = pload1<Packet>(a+3);
}

/** \internal equivalent to
  * \code
  * a0 = pload1(a+0);
  * a1 = pload1(a+1);
  * \endcode
  * \sa pset1, pload1, ploaddup, pbroadcast4
  */
template<typename Packet> EIGEN_DEVICE_FUNC
inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
                        Packet& a0, Packet& a1)
{
  a0 = pload1<Packet>(a+0);
  a1 = pload1<Packet>(a+1);
}
LM's avatar
LM committed
279 280

/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
281 282
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; }
LM's avatar
LM committed
283 284

/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
285
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
LM's avatar
LM committed
286 287 288
{ (*to) = from; }

/** \internal copy the packet \a from to \a *to, (un-aligned store) */
289 290 291 292 293 294 295 296
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
{  (*to) = from; }

 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
 { return ploadu<Packet>(from); }

 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
 { pstore(to, from); }
LM's avatar
LM committed
297 298

/** \internal tries to do cache prefetching of \a addr */
299
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
LM's avatar
LM committed
300
{
301 302 303 304 305 306 307 308 309 310
#ifdef __CUDA_ARCH__
#if defined(__LP64__)
  // 64-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
#else
  // 32-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
#endif
#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
  __builtin_prefetch(addr);
LM's avatar
LM committed
311 312 313 314
#endif
}

/** \internal \returns the first element of a packet */
315
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
LM's avatar
LM committed
316 317 318
{ return a; }

/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
319
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
320 321 322
preduxp(const Packet* vecs) { return vecs[0]; }

/** \internal \returns the sum of the elements of \a a*/
323 324 325 326 327 328 329 330 331 332
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }

/** \internal \returns the sum of the elements of \a a by block of 4 elements.
  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
  * For packet-size smaller or equal to 4, this boils down to a noop.
  */
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
predux_downto4(const Packet& a)
LM's avatar
LM committed
333 334 335
{ return a; }

/** \internal \returns the product of the elements of \a a*/
336
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
LM's avatar
LM committed
337 338 339
{ return a; }

/** \internal \returns the min of the elements of \a a*/
340
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
LM's avatar
LM committed
341 342 343
{ return a; }

/** \internal \returns the max of the elements of \a a*/
344
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
LM's avatar
LM committed
345 346 347
{ return a; }

/** \internal \returns the reversed elements of \a a*/
348
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
LM's avatar
LM committed
349 350 351
{ return a; }

/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
352
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
Don Gagne's avatar
Don Gagne committed
353 354 355 356 357 358
{
  // FIXME: uncomment the following in case we drop the internal imag and real functions.
//   using std::imag;
//   using std::real;
  return Packet(imag(a),real(a));
}
LM's avatar
LM committed
359 360 361 362 363 364 365

/**************************
* Special math functions
***************************/

/** \internal \returns the sine of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
366
Packet psin(const Packet& a) { using std::sin; return sin(a); }
LM's avatar
LM committed
367 368 369

/** \internal \returns the cosine of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
370
Packet pcos(const Packet& a) { using std::cos; return cos(a); }
LM's avatar
LM committed
371 372 373

/** \internal \returns the tan of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
374
Packet ptan(const Packet& a) { using std::tan; return tan(a); }
LM's avatar
LM committed
375 376 377

/** \internal \returns the arc sine of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
378
Packet pasin(const Packet& a) { using std::asin; return asin(a); }
LM's avatar
LM committed
379 380 381

/** \internal \returns the arc cosine of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
382
Packet pacos(const Packet& a) { using std::acos; return acos(a); }
LM's avatar
LM committed
383

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399
/** \internal \returns the arc tangent of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet patan(const Packet& a) { using std::atan; return atan(a); }

/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }

/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }

/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }

LM's avatar
LM committed
400 401
/** \internal \returns the exp of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
402
Packet pexp(const Packet& a) { using std::exp; return exp(a); }
LM's avatar
LM committed
403 404 405

/** \internal \returns the log of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
406
Packet plog(const Packet& a) { using std::log; return log(a); }
LM's avatar
LM committed
407

408 409 410 411 412 413 414 415
/** \internal \returns the log1p of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet plog1p(const Packet& a) { return numext::log1p(a); }

/** \internal \returns the log10 of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet plog10(const Packet& a) { using std::log10; return log10(a); }

LM's avatar
LM committed
416 417
/** \internal \returns the square-root of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Don Gagne's avatar
Don Gagne committed
418
Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
LM's avatar
LM committed
419

420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet prsqrt(const Packet& a) {
  return pdiv(pset1<Packet>(1), psqrt(a));
}

/** \internal \returns the rounded value of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pround(const Packet& a) { using numext::round; return round(a); }

/** \internal \returns the floor of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }

/** \internal \returns the ceil of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }

LM's avatar
LM committed
438 439 440 441 442 443 444 445 446 447 448 449 450
/***************************************************************************
* The following functions might not have to be overwritten for vectorized types
***************************************************************************/

/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
template<typename Packet>
inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
{
  pstore(to, pset1<Packet>(a));
}

/** \internal \returns a * b + c (coeff-wise) */
451
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
LM's avatar
LM committed
452 453 454 455 456 457
pmadd(const Packet&  a,
         const Packet&  b,
         const Packet&  c)
{ return padd(pmul(a, b),c); }

/** \internal \returns a packet version of \a *from.
458 459 460
  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
template<typename Packet, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
LM's avatar
LM committed
461
{
462
  if(Alignment >= unpacket_traits<Packet>::alignment)
LM's avatar
LM committed
463 464 465 466 467 468
    return pload<Packet>(from);
  else
    return ploadu<Packet>(from);
}

/** \internal copy the packet \a from to \a *to.
469 470 471
  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
template<typename Scalar, typename Packet, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
LM's avatar
LM committed
472
{
473
  if(Alignment >= unpacket_traits<Packet>::alignment)
LM's avatar
LM committed
474 475 476 477 478
    pstore(to, from);
  else
    pstoreu(to, from);
}

479 480 481 482 483 484 485 486 487 488 489
/** \internal \returns a packet version of \a *from.
  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
  * hardware if available to speedup the loading of data that won't be modified
  * by the current computation.
  */
template<typename Packet, int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
  return ploadt<Packet, LoadMode>(from);
}

LM's avatar
LM committed
490 491 492 493 494
/** \internal default implementation of palign() allowing partial specialization */
template<int Offset,typename PacketType>
struct palign_impl
{
  // by default data are aligned, so there is nothing to be done :)
Don Gagne's avatar
Don Gagne committed
495
  static inline void run(PacketType&, const PacketType&) {}
LM's avatar
LM committed
496 497
};

Don Gagne's avatar
Don Gagne committed
498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
  * of \a first and \a Offset first elements of \a second.
  * 
  * This function is currently only used to optimize matrix-vector products on unligned matrices.
  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
  * at the position \a Offset. For instance, for packets of 4 elements, we have:
  *  Input:
  *  - first = {f0,f1,f2,f3}
  *  - second = {s0,s1,s2,s3}
  * Output: 
  *   - if Offset==0 then {f0,f1,f2,f3}
  *   - if Offset==1 then {f1,f2,f3,s0}
  *   - if Offset==2 then {f2,f3,s0,s1}
  *   - if Offset==3 then {f3,s0,s1,s3}
  */
LM's avatar
LM committed
513 514 515 516 517 518 519 520 521 522
template<int Offset,typename PacketType>
inline void palign(PacketType& first, const PacketType& second)
{
  palign_impl<Offset,PacketType>::run(first,second);
}

/***************************************************************************
* Fast complex products (GCC generates a function call which is very slow)
***************************************************************************/

523 524 525
// Eigen+CUDA does not support complexes.
#ifndef __CUDACC__

LM's avatar
LM committed
526 527 528 529 530 531
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588
#endif


/***************************************************************************
 * PacketBlock, that is a collection of N packets where the number of words
 * in the packet is a multiple of N.
***************************************************************************/
template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
  Packet packet[N];
};

template<typename Packet> EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
}

/***************************************************************************
 * Selector, i.e. vector of N boolean values used to select (i.e. blend)
 * words from 2 packets.
***************************************************************************/
template <size_t N> struct Selector {
  bool select[N];
};

template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
  return ifPacket.select[0] ? thenPacket : elsePacket;
}

/** \internal \returns \a a with the first coefficient replaced by the scalar b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
{
  // Default implementation based on pblend.
  // It must be specialized for higher performance.
  Selector<unpacket_traits<Packet>::size> mask;
  mask.select[0] = true;
  // This for loop should be optimized away by the compiler.
  for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
    mask.select[i] = false;
  return pblend(mask, pset1<Packet>(b), a);
}

/** \internal \returns \a a with the last coefficient replaced by the scalar b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
{
  // Default implementation based on pblend.
  // It must be specialized for higher performance.
  Selector<unpacket_traits<Packet>::size> mask;
  // This for loop should be optimized away by the compiler.
  for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
    mask.select[i] = false;
  mask.select[unpacket_traits<Packet>::size-1] = true;
  return pblend(mask, pset1<Packet>(b), a);
}

LM's avatar
LM committed
589 590
} // end namespace internal

Don Gagne's avatar
Don Gagne committed
591 592
} // end namespace Eigen

LM's avatar
LM committed
593
#endif // EIGEN_GENERIC_PACKET_MATH_H