218 lines
12 KiB
C
218 lines
12 KiB
C
#ifndef COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_
|
|
#define COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_
|
|
#include "libc/bits/progn.internal.h"
|
|
#include "libc/bits/xmmintrin.internal.h"
|
|
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
|
|
|
/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2                                       ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/
|
|
|
|
/* Vectors of char-sized lanes: plain, unsigned and signed flavors.
   `_Vector_size` is supplied by a project header (presumably the
   vector_size attribute, with the argument in bytes). */
typedef char          __v16qi _Vector_size(16);
typedef unsigned char __v16qu _Vector_size(16);
typedef signed char   __v16qs _Vector_size(16);

/* Vectors of short-sized lanes: signed and unsigned. */
typedef short          __v8hi _Vector_size(16);
typedef unsigned short __v8hu _Vector_size(16);
|
|
|
|
/* Vectors of double lanes. `__v2df` is the internal working type, while
   `__m128d` and `__m128d_u` additionally carry forcealign(16) and
   forcealign(1) respectively (`forcealign` is a project macro). */
typedef double __v2df    _Vector_size(16);
typedef double __m128d   _Vector_size(16) forcealign(16);
typedef double __m128d_u _Vector_size(16) forcealign(1);
|
|
|
|
/* Vectors of long long lanes. `__m128i` is declared with forcealign(16)
   and `__m128i_u` with forcealign(1); `__v2di` / `__v2du` are the signed
   and unsigned internal working types. */
typedef long long          __v2di    _Vector_size(16);
typedef long long          __m128i   _Vector_size(16) forcealign(16);
typedef long long          __m128i_u _Vector_size(16) forcealign(1);
typedef unsigned long long __v2du    _Vector_size(16);
|
|
|
|
/* Single-member wrapper around an `__m128i_u`, used as the access type by
   _mm_loadu_si128() and _mm_storeu_si128(). `thatispacked` and `mayalias`
   are project attribute macros (presumably packed / may_alias). */
struct thatispacked mayalias __usi128ma {
  __m128i_u __v; /* the wrapped vector value */
};
|
|
|
|
/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » memory ops                          ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/
|
|
|
|
/* Read the vector stored at PTR by viewing the memory as a
   `struct __usi128ma` and taking its `__v` member. */
#define _mm_loadu_si128(PTR) (((struct __usi128ma *)(PTR))->__v)

/* Write VEC to the memory at PTR through the same `struct __usi128ma`
   view; the expression yields the assigned value. */
#define _mm_storeu_si128(PTR, VEC) \
  (((struct __usi128ma *)(PTR))->__v = (VEC))
|
|
|
|
/* Build a vector from scalars given highest lane first: the LAST argument
   becomes lane 0 of the initializer and the FIRST argument the top lane. */
#define _mm_set_epi8(B15, B14, B13, B12, B11, B10, B9, B8, B7, B6, B5, B4,   \
                     B3, B2, B1, B0)                                         \
  ((__m128i)(__v16qi){(B0), (B1), (B2), (B3), (B4), (B5), (B6), (B7), (B8), \
                      (B9), (B10), (B11), (B12), (B13), (B14), (B15)})

#define _mm_set_epi16(W7, W6, W5, W4, W3, W2, W1, W0) \
  ((__m128i)(__v8hi){(W0), (W1), (W2), (W3), (W4), (W5), (W6), (W7)})

/* `__v4si` is not declared in this file; it is expected to come from an
   included header. */
#define _mm_set_epi32(D3, D2, D1, D0) \
  ((__m128i)(__v4si){(D0), (D1), (D2), (D3)})

#define _mm_set_epi64x(Q1, Q0) ((__m128i)(__v2di){(Q0), (Q1)})
|
|
|
|
/* "Reversed" setters: arguments are given lowest lane first, so the FIRST
   argument ends up in lane 0. Each one forwards to the matching
   _mm_set_* macro with the argument list flipped. */
#define _mm_setr_epi8(B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, B10, B11, B12, \
                      B13, B14, B15)                                         \
  _mm_set_epi8(B15, B14, B13, B12, B11, B10, B9, B8, B7, B6, B5, B4, B3, B2, \
               B1, B0)

#define _mm_setr_epi16(W0, W1, W2, W3, W4, W5, W6, W7) \
  _mm_set_epi16(W7, W6, W5, W4, W3, W2, W1, W0)

#define _mm_setr_epi32(D0, D1, D2, D3) _mm_set_epi32(D3, D2, D1, D0)

#define _mm_setr_epi64x(Q0, Q1) _mm_set_epi64x(Q1, Q0)
|
|
|
|
/* Broadcast one scalar into every lane by repeating it in the matching
   _mm_set_* macro. The argument text is expanded once per lane, so an
   argument with side effects is evaluated more than once. */
#define _mm_set1_epi8(X) \
  _mm_set_epi8(X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X)

#define _mm_set1_epi16(X) _mm_set_epi16(X, X, X, X, X, X, X, X)

#define _mm_set1_epi32(X) _mm_set_epi32(X, X, X, X)

#define _mm_set1_epi64x(X) _mm_set_epi64x(X, X)
|
|
|
|
/* Lane 0 of VEC when viewed as `__v4si` (a type declared outside this
   file). */
#define _mm_cvtsi128_si32(VEC) (((__v4si)(VEC))[0])

/* Vector whose `__v4si` lane 0 is X and whose other three lanes are 0. */
#define _mm_cvtsi32_si128(X) ((__m128i)(__v4si){(X), 0, 0, 0})

/* All-zero integer vector. */
#define _mm_setzero_si128() ((__m128i)(__v2di){0LL, 0LL})

/* Vector casts between `__m128i` and `__m128` (the latter is declared
   outside this file). */
#define _mm_castsi128_ps(VEC) ((__m128)(VEC))
#define _mm_castps_si128(VEC) ((__m128i)(VEC))

/* Plain dereference of PTR; nothing here checks alignment. */
#define _mm_load_si128(PTR) (*(PTR))
|
|
|
|
/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » simd ops                            ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/
|
|
|
|
/* Bitwise logic on integer vectors. Both operands are viewed as `__v2du`,
   combined with the C operator, and the result is cast back to `__m128i`. */
#define _mm_and_si128(A, B) ((__m128i)((__v2du)(A) & (__v2du)(B)))
#define _mm_or_si128(A, B)  ((__m128i)((__v2du)(A) | (__v2du)(B)))
#define _mm_xor_si128(A, B) ((__m128i)((__v2du)(A) ^ (__v2du)(B)))

/* Note the operand roles: the FIRST operand is the one complemented. */
#define _mm_andnot_si128(A, B) ((__m128i)(~(__v2du)(A) & (__v2du)(B)))
|
|
|
|
/* Lane-wise arithmetic on vectors of doubles, using the C operators on the
   `__v2df` view of each operand.

   The whole expansion is parenthesized. Without the outer parentheses the
   expansion is a bare cast expression, so a postfix operator applied to the
   macro result (e.g. `_mm_mul_pd(a, b)[1]`) would bind to the inner
   expression instead of the cast result. */
#define _mm_add_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2df)(M128D_0) + (__v2df)(M128D_1)))
#define _mm_sub_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2df)(M128D_0) - (__v2df)(M128D_1)))
#define _mm_mul_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2df)(M128D_0) * (__v2df)(M128D_1)))
#define _mm_div_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2df)(M128D_0) / (__v2df)(M128D_1)))
|
|
/* Bitwise logic on vectors of doubles.

   The C operators &, |, ^ and ~ are not defined for floating-point vector
   types, so each operand is reinterpreted as `__v2du` (same size, integer
   lanes), combined, and the bits are cast back to `__m128d`. The whole
   expansion is parenthesized so it composes safely with postfix operators.

   As with _mm_andnot_si128, the FIRST operand of _mm_andnot_pd is the one
   that gets complemented. */
#define _mm_and_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2du)(M128D_0) & (__v2du)(M128D_1)))
#define _mm_or_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2du)(M128D_0) | (__v2du)(M128D_1)))
#define _mm_xor_pd(M128D_0, M128D_1) \
  ((__m128d)((__v2du)(M128D_0) ^ (__v2du)(M128D_1)))
#define _mm_andnot_pd(M128D_0, M128D_1) \
  ((__m128d)(~(__v2du)(M128D_0) & (__v2du)(M128D_1)))
|
|
/* Forwards the `__v2df` view of M128D to the compiler's
   __builtin_ia32_sqrtpd builtin. */
#define _mm_sqrt_pd(M128D) __builtin_ia32_sqrtpd((__v2df)(M128D))
|
|
|
|
/* Packed-double min/max and comparisons. Every macro forwards the `__v2df`
   view of its operands to the like-named __builtin_ia32_*pd builtin. */
#define _mm_min_pd(A, B) \
  (__builtin_ia32_minpd((__v2df)(A), (__v2df)(B)))
#define _mm_max_pd(A, B) \
  (__builtin_ia32_maxpd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpeq_pd(A, B) \
  (__builtin_ia32_cmpeqpd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpneq_pd(A, B) \
  (__builtin_ia32_cmpneqpd((__v2df)(A), (__v2df)(B)))
#define _mm_cmplt_pd(A, B) \
  (__builtin_ia32_cmpltpd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpnlt_pd(A, B) \
  (__builtin_ia32_cmpnltpd((__v2df)(A), (__v2df)(B)))
#define _mm_cmple_pd(A, B) \
  (__builtin_ia32_cmplepd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpnle_pd(A, B) \
  (__builtin_ia32_cmpnlepd((__v2df)(A), (__v2df)(B)))

/* The gt / ge family is expressed through the lt / le builtins with the
   two operands exchanged. */
#define _mm_cmpgt_pd(A, B) \
  (__builtin_ia32_cmpltpd((__v2df)(B), (__v2df)(A)))
#define _mm_cmpngt_pd(A, B) \
  (__builtin_ia32_cmpnltpd((__v2df)(B), (__v2df)(A)))
#define _mm_cmpge_pd(A, B) \
  (__builtin_ia32_cmplepd((__v2df)(B), (__v2df)(A)))
#define _mm_cmpnge_pd(A, B) \
  (__builtin_ia32_cmpnlepd((__v2df)(B), (__v2df)(A)))

#define _mm_cmpord_pd(A, B) \
  (__builtin_ia32_cmpordpd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpunord_pd(A, B) \
  (__builtin_ia32_cmpunordpd((__v2df)(A), (__v2df)(B)))
|
|
|
|
/* Forwards the `__v16qi` view of both operands to the compiler's
   __builtin_ia32_psadbw128 builtin; the builtin's result is returned
   without a further cast. */
#define _mm_sad_epu8(A, B) \
  (__builtin_ia32_psadbw128((__v16qi)(A), (__v16qi)(B)))
|
|
|
|
/* Saturating subtraction, forwarded to the compiler builtins. The builtin
   suffix selects the lane width and signedness:
     psubsb  - signed 8-bit      psubusb - unsigned 8-bit
     psubsw  - signed 16-bit     psubusw - unsigned 16-bit

   _mm_subs_epu8 must use the byte-wide unsigned builtin (psubusb128).
   Calling the word-wide psubusw128 on byte data lets borrows cross byte
   boundaries and saturates on 16-bit rather than 8-bit lanes. */
#define _mm_subs_epi8(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubsb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_subs_epu8(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubusb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_subs_epi16(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_subs_epu16(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubusw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
|
|
|
|
/* Lane-wise add / subtract done with the C operators on the `__v4su` view
   of each operand (`__v4su` is declared outside this file). */
#define _mm_add_epi32(A, B) ((__m128i)((__v4su)(A) + (__v4su)(B)))
#define _mm_sub_epi32(A, B) ((__m128i)((__v4su)(A) - (__v4su)(B)))

/* Forwards the `__v8hi` views to __builtin_ia32_pmaddwd128. */
#define _mm_madd_epi16(A, B) \
  ((__m128i)__builtin_ia32_pmaddwd128((__v8hi)(A), (__v8hi)(B)))

/* Forwards to __builtin_ia32_pshufd with SEL converted to int. */
#define _mm_shuffle_epi32(VEC, SEL) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(VEC), (int)(SEL)))
|
|
|
|
/* Forwards the `__v4si` view of VEC and the shift amount N to
   __builtin_ia32_pslldi128. */
#define _mm_slli_epi32(VEC, N) \
  ((__m128i)__builtin_ia32_pslldi128((__v4si)(VEC), (N)))
|
|
|
|
/* Whole-register shifts. NBYTES is converted to int and multiplied by 8
   before it is handed to the builtin. */
#define _mm_slli_si128(VEC, NBYTES)                          \
  ((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(VEC), \
                                      (int)(NBYTES) * 8))
#define _mm_srli_si128(VEC, NBYTES)                          \
  ((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(VEC), \
                                      (int)(NBYTES) * 8))
|
|
|
|
/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » scalar ops                          ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/
|
|
|
|
/* Result lane 0 is lane 0 of __builtin_ia32_sqrtsd applied to M128D_1;
   result lane 1 is copied from lane 1 of M128D_0. M128D_1 is evaluated
   before M128D_0. */
#define _mm_sqrt_sd(M128D_0, M128D_1)                                \
  ({                                                                 \
    __m128d Rooted = __builtin_ia32_sqrtsd((__v2df)(M128D_1));       \
    (__m128d){Rooted[0], (M128D_0)[1]};                              \
  })
|
|
|
|
/* Scalar double arithmetic: lane 0 of the result is
   `M128D_0[0] <op> M128D_1[0]`, lane 1 is carried over from M128D_0.

   The first operand is copied into a local before being updated, so the
   caller's variable is left untouched, rvalues are accepted, and each
   argument is evaluated exactly once. (Applying a compound assignment
   directly to `(M128D_0)[0]` would silently modify the caller's operand,
   which an intrinsic taking its operands by value must not do.) */
#define _mm_add_sd(M128D_0, M128D_1) \
  ({                                 \
    __m128d M128dAcc = (M128D_0);    \
    M128dAcc[0] += (M128D_1)[0];     \
    M128dAcc;                        \
  })
#define _mm_sub_sd(M128D_0, M128D_1) \
  ({                                 \
    __m128d M128dAcc = (M128D_0);    \
    M128dAcc[0] -= (M128D_1)[0];     \
    M128dAcc;                        \
  })
#define _mm_mul_sd(M128D_0, M128D_1) \
  ({                                 \
    __m128d M128dAcc = (M128D_0);    \
    M128dAcc[0] *= (M128D_1)[0];     \
    M128dAcc;                        \
  })
#define _mm_div_sd(M128D_0, M128D_1) \
  ({                                 \
    __m128d M128dAcc = (M128D_0);    \
    M128dAcc[0] /= (M128D_1)[0];     \
    M128dAcc;                        \
  })
|
|
|
|
/* Scalar-double min/max and comparisons. Every macro forwards the `__v2df`
   view of its operands, in the order given, to the like-named
   __builtin_ia32_*sd builtin. */
#define _mm_min_sd(A, B) \
  (__builtin_ia32_minsd((__v2df)(A), (__v2df)(B)))
#define _mm_max_sd(A, B) \
  (__builtin_ia32_maxsd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpeq_sd(A, B) \
  (__builtin_ia32_cmpeqsd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpneq_sd(A, B) \
  (__builtin_ia32_cmpneqsd((__v2df)(A), (__v2df)(B)))
#define _mm_cmplt_sd(A, B) \
  (__builtin_ia32_cmpltsd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpnlt_sd(A, B) \
  (__builtin_ia32_cmpnltsd((__v2df)(A), (__v2df)(B)))
#define _mm_cmple_sd(A, B) \
  (__builtin_ia32_cmplesd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpnle_sd(A, B) \
  (__builtin_ia32_cmpnlesd((__v2df)(A), (__v2df)(B)))
|
|
/* Scalar-double gt / ngt / ge / nge comparisons.

   They are built from the lt / nlt / le / nle builtins with the operands
   exchanged. The scalar builtins take the upper lane of their result from
   their FIRST operand, so after the exchange it would come from M128D_1.
   The SSE2 contract requires the upper lane to come from M128D_0, so the
   result is reassembled: lane 0 from the comparison, lane 1 from M128D_0.
   Each argument is evaluated exactly once. */
#define _mm_cmpgt_sd(M128D_0, M128D_1)                              \
  ({                                                                \
    __m128d M128dKeep = (M128D_0);                                  \
    __m128d M128dMask =                                             \
        __builtin_ia32_cmpltsd((__v2df)(M128D_1), (__v2df)M128dKeep); \
    (__m128d){M128dMask[0], M128dKeep[1]};                          \
  })
#define _mm_cmpngt_sd(M128D_0, M128D_1)                              \
  ({                                                                 \
    __m128d M128dKeep = (M128D_0);                                   \
    __m128d M128dMask =                                              \
        __builtin_ia32_cmpnltsd((__v2df)(M128D_1), (__v2df)M128dKeep); \
    (__m128d){M128dMask[0], M128dKeep[1]};                           \
  })
#define _mm_cmpge_sd(M128D_0, M128D_1)                              \
  ({                                                                \
    __m128d M128dKeep = (M128D_0);                                  \
    __m128d M128dMask =                                             \
        __builtin_ia32_cmplesd((__v2df)(M128D_1), (__v2df)M128dKeep); \
    (__m128d){M128dMask[0], M128dKeep[1]};                          \
  })
#define _mm_cmpnge_sd(M128D_0, M128D_1)                              \
  ({                                                                 \
    __m128d M128dKeep = (M128D_0);                                   \
    __m128d M128dMask =                                              \
        __builtin_ia32_cmpnlesd((__v2df)(M128D_1), (__v2df)M128dKeep); \
    (__m128d){M128dMask[0], M128dKeep[1]};                           \
  })
|
|
/* Ordered / unordered scalar-double comparisons, forwarded in argument
   order to __builtin_ia32_cmpordsd and __builtin_ia32_cmpunordsd. */
#define _mm_cmpord_sd(A, B) \
  (__builtin_ia32_cmpordsd((__v2df)(A), (__v2df)(B)))
#define _mm_cmpunord_sd(A, B) \
  (__builtin_ia32_cmpunordsd((__v2df)(A), (__v2df)(B)))
|
|
|
|
/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » miscellaneous                       ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/
|
|
|
|
/* Emits the instruction text `rep nop` through inline assembly (on x86
   this byte sequence is the PAUSE spin-wait hint). The asm statement
   declares no operands or clobbers. */
#define _mm_pause() asm("rep nop")
|
|
|
|
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
|
#endif /* COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ */
|