PacketMath.h 28.1 KB
Newer Older
LM's avatar
LM committed
1 2 3 4
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5
// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
LM's avatar
LM committed
6 7
// Heavily based on Gael's SSE version.
//
Don Gagne's avatar
Don Gagne committed
8 9 10
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
LM's avatar
LM committed
11 12 13 14

#ifndef EIGEN_PACKET_MATH_NEON_H
#define EIGEN_PACKET_MATH_NEON_H

Don Gagne's avatar
Don Gagne committed
15 16
namespace Eigen {

LM's avatar
LM committed
17 18 19 20 21 22
namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

23 24 25 26 27 28 29 30
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif

LM's avatar
LM committed
31
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
32 33 34 35
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
LM's avatar
LM committed
36
#endif
37 38 39 40 41 42 43 44
#endif

#if EIGEN_COMP_MSVC

// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
LM's avatar
LM committed
45

46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  eigen_packet_wrapper() {}
  eigen_packet_wrapper(const T &v) : m_val(v) {}
  eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  T m_val;
};
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t  ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t  ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;

#else

typedef float32x2_t Packet2f;
LM's avatar
LM committed
69 70
typedef float32x4_t Packet4f;
typedef int32x4_t   Packet4i;
71
typedef int32x2_t   Packet2i;
LM's avatar
LM committed
72 73
typedef uint32x4_t  Packet4ui;

74 75
#endif // EIGEN_COMP_MSVC

LM's avatar
LM committed
76 77 78 79
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
80
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
LM's avatar
LM committed
81 82 83 84

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

85 86 87 88 89 90
#if EIGEN_ARCH_ARM64
  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
  // prefetch instructions there are too detailed for __builtin_prefetch to map
  // meaningfully to them.
  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
91 92 93
  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
94 95
#elif EIGEN_ARCH_ARM32
  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
96 97 98
#else
  // by default no explicit prefetching
  #define EIGEN_ARM_PREFETCH(ADDR)
LM's avatar
LM committed
99 100 101 102 103
#endif

template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet4f type;
104
  typedef Packet4f half; // Packet2f intrinsics not implemented yet
LM's avatar
LM committed
105 106 107 108
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
109
    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
LM's avatar
LM committed
110 111 112 113 114 115
   
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
116
    HasExp  = 1,
LM's avatar
LM committed
117 118 119
    HasSqrt = 0
  };
};
120
template<> struct packet_traits<int32_t>    : default_packet_traits
LM's avatar
LM committed
121 122
{
  typedef Packet4i type;
123
  typedef Packet4i half; // Packet2i intrinsics not implemented yet
LM's avatar
LM committed
124 125 126
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
127 128
    size=4,
    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
LM's avatar
LM committed
129 130 131 132
    // FIXME check the Has*
  };
};

133
#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
LM's avatar
LM committed
134 135 136
// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
137
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
LM's avatar
LM committed
138 139 140 141
EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif

142 143
template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
LM's avatar
LM committed
144 145

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
146
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t&    from)   { return vdupq_n_s32(from); }
LM's avatar
LM committed
147

148
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
LM's avatar
LM committed
149
{
150 151
  const float f[] = {0, 1, 2, 3};
  Packet4f countdown = vld1q_f32(f);
LM's avatar
LM committed
152 153
  return vaddq_f32(pset1<Packet4f>(a), countdown);
}
154
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
LM's avatar
LM committed
155
{
156 157
  const int32_t i[] = {0, 1, 2, 3};
  Packet4i countdown = vld1q_s32(i);
LM's avatar
LM committed
158 159 160 161 162 163 164 165 166 167 168 169
  return vaddq_s32(pset1<Packet4i>(a), countdown);
}

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }

Don Gagne's avatar
Don Gagne committed
170 171 172
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

LM's avatar
LM committed
173 174 175 176 177
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
178 179 180
#if EIGEN_ARCH_ARM64
  return vdivq_f32(a,b);
#else
LM's avatar
LM committed
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
  Packet4f inv, restep, div;

  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
  // Newton-Raphson and vrecpsq_f32()
  inv = vrecpeq_f32(b);

  // This returns a differential, by which we will have to multiply inv to get a better
  // approximation of 1/b.
  restep = vrecpsq_f32(b, inv);
  inv = vmulq_f32(restep, inv);

  // Finally, multiply a by 1/b and get the wanted result of the division.
  div = vmulq_f32(a, inv);

  return div;
199
#endif
LM's avatar
LM committed
200
}
201

LM's avatar
LM committed
202 203 204 205 206
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
}

207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
// then implements a slow software scalar fallback calling fmaf()!
// Filed LLVM bug:
//     https://llvm.org/bugs/show_bug.cgi?id=27216
#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
// See bug 936.
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
// MLA is not fused i.e. does 2 roundings.
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
  // -march=armv7-a, that is a very common case.
  // See e.g. this thread:
  //     http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
  // Filed LLVM bug:
  //     https://llvm.org/bugs/show_bug.cgi?id=27219
  Packet4f r = c;
  asm volatile(
    "vmla.f32 %q[r], %q[a], %q[b]"
    : [r] "+w" (r)
    : [a] "w" (a),
      [b] "w" (b)
    : );
  return r;
#else
  return vmlaq_f32(c,a,b);
#endif
}
#endif

// No FMA instruction for int, so use MLA unconditionally.
Don Gagne's avatar
Don Gagne committed
244
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
LM's avatar
LM committed
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }

// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

277 278
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t*  from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
LM's avatar
LM committed
279

280 281
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
LM's avatar
LM committed
282

283
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
LM's avatar
LM committed
284 285
{
  float32x2_t lo, hi;
Don Gagne's avatar
Don Gagne committed
286 287
  lo = vld1_dup_f32(from);
  hi = vld1_dup_f32(from+1);
LM's avatar
LM committed
288 289
  return vcombine_f32(lo, hi);
}
290
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
LM's avatar
LM committed
291 292
{
  int32x2_t lo, hi;
Don Gagne's avatar
Don Gagne committed
293 294
  lo = vld1_dup_s32(from);
  hi = vld1_dup_s32(from+1);
LM's avatar
LM committed
295 296 297
  return vcombine_s32(lo, hi);
}

298 299
template<> EIGEN_STRONG_INLINE void pstore<float>  (float*    to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t*  to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
LM's avatar
LM committed
300

301 302
template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
LM's avatar
LM committed
303

304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  Packet4f res = pset1<Packet4f>(0.f);
  res = vsetq_lane_f32(from[0*stride], res, 0);
  res = vsetq_lane_f32(from[1*stride], res, 1);
  res = vsetq_lane_f32(from[2*stride], res, 2);
  res = vsetq_lane_f32(from[3*stride], res, 3);
  return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
{
  Packet4i res = pset1<Packet4i>(0);
  res = vsetq_lane_s32(from[0*stride], res, 0);
  res = vsetq_lane_s32(from[1*stride], res, 1);
  res = vsetq_lane_s32(from[2*stride], res, 2);
  res = vsetq_lane_s32(from[3*stride], res, 3);
  return res;
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  to[stride*0] = vgetq_lane_f32(from, 0);
  to[stride*1] = vgetq_lane_f32(from, 1);
  to[stride*2] = vgetq_lane_f32(from, 2);
  to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{
  to[stride*0] = vgetq_lane_s32(from, 0);
  to[stride*1] = vgetq_lane_s32(from, 1);
  to[stride*2] = vgetq_lane_s32(from, 2);
  to[stride*3] = vgetq_lane_s32(from, 3);
}

template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*    addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t*  addr) { EIGEN_ARM_PREFETCH(addr); }
LM's avatar
LM committed
340 341

// FIXME only store the 2 first elements ?
342 343
template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
LM's avatar
LM committed
344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  float32x2_t a_lo, a_hi;
  Packet4f a_r64;

  a_r64 = vrev64q_f32(a);
  a_lo = vget_low_f32(a_r64);
  a_hi = vget_high_f32(a_r64);
  return vcombine_f32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  int32x2_t a_lo, a_hi;
  Packet4i a_r64;

  a_r64 = vrev64q_s32(a);
  a_lo = vget_low_s32(a_r64);
  a_hi = vget_high_s32(a_r64);
  return vcombine_s32(a_hi, a_lo);
}
363

LM's avatar
LM committed
364 365 366 367 368 369 370 371 372 373 374
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, sum;

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  sum = vpadd_f32(a_lo, a_hi);
  sum = vpadd_f32(sum, sum);
Don Gagne's avatar
Don Gagne committed
375
  return vget_lane_f32(sum, 0);
LM's avatar
LM committed
376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
}

template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  float32x4x2_t vtrn1, vtrn2, res1, res2;
  Packet4f sum1, sum2, sum;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);

  // Do the addition of the resulting vectors
  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
  sum = vaddq_f32(sum1, sum2);

  return sum;
}

398
template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
LM's avatar
LM committed
399 400 401 402 403 404 405
{
  int32x2_t a_lo, a_hi, sum;

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  sum = vpadd_s32(a_lo, a_hi);
  sum = vpadd_s32(sum, sum);
Don Gagne's avatar
Don Gagne committed
406
  return vget_lane_s32(sum, 0);
LM's avatar
LM committed
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  int32x4x2_t vtrn1, vtrn2, res1, res2;
  Packet4i sum1, sum2, sum;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);

  // Do the addition of the resulting vectors
  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
  sum = vaddq_s32(sum1, sum2);

  return sum;
}

// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, prod;

  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_f32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_f32(prod, vrev64_f32(prod));

Don Gagne's avatar
Don Gagne committed
443
  return vget_lane_f32(prod, 0);
LM's avatar
LM committed
444
}
445
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
LM's avatar
LM committed
446 447 448 449 450 451 452 453 454 455 456
{
  int32x2_t a_lo, a_hi, prod;

  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_s32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_s32(prod, vrev64_s32(prod));

Don Gagne's avatar
Don Gagne committed
457
  return vget_lane_s32(prod, 0);
LM's avatar
LM committed
458 459 460 461 462 463 464 465 466 467 468 469
}

// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, min;

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  min = vpmin_f32(a_lo, a_hi);
  min = vpmin_f32(min, min);

Don Gagne's avatar
Don Gagne committed
470
  return vget_lane_f32(min, 0);
LM's avatar
LM committed
471
}
Don Gagne's avatar
Don Gagne committed
472

473
template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
LM's avatar
LM committed
474 475 476 477 478 479 480
{
  int32x2_t a_lo, a_hi, min;

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  min = vpmin_s32(a_lo, a_hi);
  min = vpmin_s32(min, min);
Don Gagne's avatar
Don Gagne committed
481 482
  
  return vget_lane_s32(min, 0);
LM's avatar
LM committed
483 484 485 486 487 488 489 490 491 492 493 494
}

// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, max;

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  max = vpmax_f32(a_lo, a_hi);
  max = vpmax_f32(max, max);

Don Gagne's avatar
Don Gagne committed
495
  return vget_lane_f32(max, 0);
LM's avatar
LM committed
496
}
Don Gagne's avatar
Don Gagne committed
497

498
template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
LM's avatar
LM committed
499 500 501 502 503 504
{
  int32x2_t a_lo, a_hi, max;

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  max = vpmax_s32(a_lo, a_hi);
505
  max = vpmax_s32(max, max);
LM's avatar
LM committed
506

Don Gagne's avatar
Don Gagne committed
507
  return vget_lane_s32(max, 0);
LM's avatar
LM committed
508 509
}

Don Gagne's avatar
Don Gagne committed
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
    {\
        if (Offset!=0)\
            first = Command(first, second, Offset);\
    }\
};\

PALIGN_NEON(0,Packet4f,vextq_f32)
PALIGN_NEON(1,Packet4f,vextq_f32)
PALIGN_NEON(2,Packet4f,vextq_f32)
PALIGN_NEON(3,Packet4f,vextq_f32)
PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)
531

Don Gagne's avatar
Don Gagne committed
532
#undef PALIGN_NEON
LM's avatar
LM committed
533

534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}

//---------- double ----------

// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
// Confirmed at least with __apple_build_version__ = 6000054.
#ifdef __apple_build_version__
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
#else
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#endif

#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG

// Bug 907: workaround missing declarations of the following two functions in the ADK
// Defining these functions as templates ensures that if these intrinsics are
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
// and has lower priority in overload resolution.
template <typename T>
uint64x2_t vreinterpretq_u64_f64(T a)
{
  return (uint64x2_t) a;
}

template <typename T>
float64x2_t vreinterpretq_f64_u64(T a)
{
  return (float64x2_t) a;
}

typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;

template<> struct packet_traits<double>  : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket=0,
   
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 0,
    HasSqrt = 0
  };
};

template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };

template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
{
  const double countdown_raw[] = {0.0,1.0};
  const Packet2d countdown = vld1q_f64(countdown_raw);
  return vaddq_f64(pset1<Packet2d>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }

template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }

#ifdef __ARM_FEATURE_FMA
// See bug 936. See above comment about FMA for float.
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
#endif

template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }

// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
{
  return vld1q_dup_f64(from);
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  Packet2d res = pset1<Packet2d>(0.0);
  res = vsetq_lane_f64(from[0*stride], res, 0);
  res = vsetq_lane_f64(from[1*stride], res, 1);
  return res;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  to[stride*0] = vgetq_lane_f64(from, 0);
  to[stride*1] = vgetq_lane_f64(from, 1);
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }

// FIXME only store the 2 first elements ?
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }

template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }

template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }

#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
// workaround ICE, see bug 907
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
#endif

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  float64x2_t trn1, trn2;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  trn1 = vzip1q_f64(vecs[0], vecs[1]);
  trn2 = vzip2q_f64(vecs[0], vecs[1]);

  // Do the addition of the resulting vectors
  return vaddq_f64(trn1, trn2);
}
// Other reduction functions:
// mul
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
#endif

// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }

// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }

// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
    {\
        if (Offset!=0)\
            first = Command(first, second, Offset);\
    }\
};\

PALIGN_NEON(0,Packet2d,vextq_f64)
PALIGN_NEON(1,Packet2d,vextq_f64)
#undef PALIGN_NEON

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);

  kernel.packet[0] = trn1;
  kernel.packet[1] = trn2;
}
#endif // EIGEN_ARCH_ARM64 

LM's avatar
LM committed
756 757
} // end namespace internal

Don Gagne's avatar
Don Gagne committed
758 759
} // end namespace Eigen

LM's avatar
LM committed
760
#endif // EIGEN_PACKET_MATH_NEON_H