Инструкция	Описание
VBROADCASTSS, VBROADCASTSD, VBROADCASTF128	Копирует 32-, 64- или 128-битный операнд из памяти во все элементы векторного регистра XMM или YMM.
VINSERTF128	Замещает младшую или старшую половину 256-битного регистра YMM значением 128-битного операнда. Другая часть регистра-получателя не изменяется.
VEXTRACTF128	Извлекает младшую или старшую половину 256-битного регистра YMM и копирует в 128-битный операнд-назначение.
VMASKMOVPS, VMASKMOVPD	Условно считывает любое количество элементов из векторного операнда из памяти в регистр-получатель, оставляя остальные элементы несчитанными и обнуляя соответствующие им элементы регистра-получателя. Также может условно записывать любое количество элементов из векторного регистра в векторный операнд в памяти, оставляя остальные элементы операнда памяти неизменёнными.
VPERMILPS, VPERMILPD	Переставляет 32- или 64-битные элементы вектора согласно операнду-селектору (из памяти или из регистра).
VPERM2F128	Переставляет четыре 128-битных элемента двух 256-битных регистров в 256-битный операнд-назначение с использованием непосредственной константы (imm) в качестве селектора.
VZEROALL	Обнуляет все YMM-регистры и помечает их как неиспользуемые. Используется при переключении между 128-битным режимом и 256-битным.
VZEROUPPER	Обнуляет старшие половины всех регистров YMM. Используется при переключении между 128-битным режимом и 256-битным.

vpabsw

__m256i _mm256_abs_epi16 (__m256i a)

Synopsis

__m256i _mm256_abs_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpabsd

__m256i _mm256_abs_epi32 (__m256i a)

Synopsis

__m256i _mm256_abs_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpabsb

__m256i _mm256_abs_epi8 (__m256i a)

Synopsis

__m256i _mm256_abs_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpaddw

__m256i _mm256_add_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := a[i+15:i] + b[i+15:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpaddd

__m256i _mm256_add_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpaddq

__m256i _mm256_add_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpaddb

__m256i _mm256_add_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vaddpd

__m256d _mm256_add_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_add_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vaddps

__m256 _mm256_add_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_add_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps ymm, ymm, ymm
CPUID Flags: AVX

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vpaddsw

__m256i _mm256_adds_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpaddsb

__m256i _mm256_adds_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpaddusw

__m256i _mm256_adds_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpaddusb

__m256i _mm256_adds_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vaddsubpd

__m256d _mm256_addsub_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_addsub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddsubpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Alternately add and subtract packed double-precision (64-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    IF (j is even)
        dst[i+63:i] := a[i+63:i] - b[i+63:i]
    ELSE
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vaddsubps

__m256 _mm256_addsub_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_addsub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddsubps ymm, ymm, ymm
CPUID Flags: AVX

Description

Alternately add and subtract packed single-precision (32-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    IF (j is even)
        dst[i+31:i] := a[i+31:i] - b[i+31:i]
    ELSE
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vpalignr

__m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.

Operation

FOR j := 0 to 1
    i := j*128
    tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
    dst[i+127:i] := tmp[127:0]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vandpd

__m256d _mm256_and_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_and_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	1
Ivy Bridge	1	1
Sandy Bridge	1	1

vandps

__m256 _mm256_and_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_and_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	1
Ivy Bridge	1	1
Sandy Bridge	1	1

vpand

__m256i _mm256_and_si256 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_and_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpand ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the bitwise AND of 256 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[255:0] := (a[255:0] AND b[255:0]) dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vandnpd

__m256d _mm256_andnot_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_andnot_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in a and then AND with b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	1
Ivy Bridge	1	1
Sandy Bridge	1	1

vandnps

__m256 _mm256_andnot_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_andnot_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in a and then AND with b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	1
Ivy Bridge	1	1
Sandy Bridge	1	1

vpandn

__m256i _mm256_andnot_si256 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_andnot_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandn ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the bitwise NOT of 256 bits (representing integer data) in a and then AND with b, and store the result in dst.

Operation

dst[255:0] := ((NOT a[255:0]) AND b[255:0]) dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpavgw

__m256i _mm256_avg_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_avg_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpavgb

__m256i _mm256_avg_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_avg_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpblendw

__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendw ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*16
    IF imm8[j%8]
        dst[i+15:i] := b[i+15:i]
    ELSE
        dst[i+15:i] := a[i+15:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	1

vpblendd

__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)

Synopsis

__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendd xmm, xmm, xmm, imm
CPUID Flags: AVX2

Description

Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*32
    IF imm8[j%8]
        dst[i+31:i] := b[i+31:i]
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.33

vpblendd

__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendd ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    IF imm8[j%8]
        dst[i+31:i] := b[i+31:i]
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.33

vblendpd

__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vblendpd ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    IF imm8[j%8]
        dst[i+63:i] := b[i+63:i]
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.33
Ivy Bridge	1	0.5
Sandy Bridge	1	0.5

vblendps

__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vblendps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    IF imm8[j%8]
        dst[i+31:i] := b[i+31:i]
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.33
Ivy Bridge	1	0.5
Sandy Bridge	1	0.5

vpblendvb

__m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask)

Synopsis

__m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask)
#include "immintrin.h"
Instruction: vpblendvb ymm, ymm, ymm, ymm
CPUID Flags: AVX2

Description

Blend packed 8-bit integers from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 31
    i := j*8
    IF mask[i+7]
        dst[i+7:i] := b[i+7:i]
    ELSE
        dst[i+7:i] := a[i+7:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	2	2

vblendvpd

__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)

Synopsis

__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
#include "immintrin.h"
Instruction: vblendvpd ymm, ymm, ymm, ymm
CPUID Flags: AVX

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    IF mask[i+63]
        dst[i+63:i] := b[i+63:i]
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	2	2
Ivy Bridge	2	1
Sandy Bridge	2	1

vblendvps

__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)

Synopsis

__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)
#include "immintrin.h"
Instruction: vblendvps ymm, ymm, ymm, ymm
CPUID Flags: AVX

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    IF mask[i+31]
        dst[i+31:i] := b[i+31:i]
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	2	2
Ivy Bridge	2	1
Sandy Bridge	2	1

vbroadcastf128

__m256d _mm256_broadcast_pd (__m128d const * mem_addr)

Synopsis

__m256d _mm256_broadcast_pd (__m128d const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastf128 ymm, m128
CPUID Flags: AVX

Description

Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of dst.

Operation

tmp[127:0] := MEM[mem_addr+127:mem_addr]
dst[127:0] := tmp[127:0]
dst[255:128] := tmp[127:0]
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Ivy Bridge	1	—
Sandy Bridge	1	—

vbroadcastf128

__m256 _mm256_broadcast_ps (__m128 const * mem_addr)

Synopsis

__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastf128 ymm, m128
CPUID Flags: AVX

Description

Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of dst.

Operation

tmp[127:0] := MEM[mem_addr+127:mem_addr]
dst[127:0] := tmp[127:0]
dst[255:128] := tmp[127:0]
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Ivy Bridge	1	—
Sandy Bridge	1	—

vbroadcastsd

__m256d _mm256_broadcast_sd (double const * mem_addr)

Synopsis

__m256d _mm256_broadcast_sd (double const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastsd ymm, m64
CPUID Flags: AVX

Description

Broadcast a double-precision (64-bit) floating-point element from memory to all elements of dst.

Operation

tmp[63:0] := MEM[mem_addr+63:mem_addr]
FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Ivy Bridge	1	—
Sandy Bridge	1	—

vbroadcastss

vroundpd

__m256d _mm256_ceil_pd (__m256d a)

Synopsis

__m256d _mm256_ceil_pd (__m256d a)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	6	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vroundps

__m256 _mm256_ceil_ps (__m256 a)

Synopsis

__m256 _mm256_ceil_ps (__m256 a)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	6	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vcmppd

__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	3	—
Sandy Bridge	3	—

vcmppd

__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vcmpps

__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	3	—
Sandy Bridge	3	—

vcmpps

__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vcmpsd

__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	3	—
Sandy Bridge	3	—

vcmpss

__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	3	—
Sandy Bridge	3	—

vpcmpeqw

__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 16-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpcmpeqd

__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 32-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpcmpeqq

__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 64-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpcmpeqb

__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 8-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	0.5

vpcmpgtw

__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpcmpgtd

__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpcmpgtq

__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	5	1

vpcmpgtb

__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	1	—

vpmovsxwd

__m256i _mm256_cvtepi16_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepi16_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j:= 0 to 7 i := 32*j k := 16*j dst[i+31:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovsxwq

__m256i _mm256_cvtepi16_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepi16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j:= 0 to 3 i := 64*j k := 16*j dst[i+63:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovsxdq

__m256i _mm256_cvtepi32_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepi32_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j:= 0 to 3 i := 64*j k := 32*j dst[i+63:i] := SignExtend(a[k+31:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vcvtdq2pd

__m256d _mm256_cvtepi32_pd (__m128i a)

Synopsis

__m256d _mm256_cvtepi32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd ymm, xmm
CPUID Flags: AVX

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	4	1
Ivy Bridge	4	1
Sandy Bridge	4	1

vcvtdq2ps

__m256 _mm256_cvtepi32_ps (__m256i a)

Synopsis

__m256 _mm256_cvtepi32_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps ymm, ymm
CPUID Flags: AVX

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vpmovsxbw

__m256i _mm256_cvtepi8_epi16 (__m128i a)

Synopsis

__m256i _mm256_cvtepi8_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 l := j*16 dst[l+15:l] := SignExtend(a[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovsxbd

__m256i _mm256_cvtepi8_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepi8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 8*j dst[i+31:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovsxbq

__m256i _mm256_cvtepi8_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepi8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 8*j dst[i+63:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovzxwd

__m256i _mm256_cvtepu16_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepu16_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 16*j dst[i+31:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovzxwq

__m256i _mm256_cvtepu16_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepu16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j:= 0 to 3 i := 64*j k := 16*j dst[i+63:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovzxdq

__m256i _mm256_cvtepu32_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepu32_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j:= 0 to 3 i := 64*j k := 32*j dst[i+63:i] := ZeroExtend(a[k+31:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovzxbw

__m256i _mm256_cvtepu8_epi16 (__m128i a)

Synopsis

__m256i _mm256_cvtepu8_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 l := j*16 dst[l+15:l] := ZeroExtend(a[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovzxbd

__m256i _mm256_cvtepu8_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepu8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 8*j dst[i+31:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vpmovzxbq

__m256i _mm256_cvtepu8_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepu8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 8*j dst[i+63:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—

vcvtpd2dq

__m128i _mm256_cvtpd_epi32 (__m256d a)

Synopsis

__m128i _mm256_cvtpd_epi32 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq xmm, ymm
CPUID Flags: AVX

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	4	1
Ivy Bridge	4	1
Sandy Bridge	4	1

vcvtpd2ps

__m128 _mm256_cvtpd_ps (__m256d a)

Synopsis

__m128 _mm256_cvtpd_ps (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps xmm, ymm
CPUID Flags: AVX

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	4	1
Ivy Bridge	4	1
Sandy Bridge	4	1

vcvtps2dq

__m256i _mm256_cvtps_epi32 (__m256 a)

Synopsis

__m256i _mm256_cvtps_epi32 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq ymm, ymm
CPUID Flags: AVX

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vcvtps2pd

__m256d _mm256_cvtps_pd (__m128 a)

Synopsis

__m256d _mm256_cvtps_pd (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2pd ymm, xmm
CPUID Flags: AVX

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	2	1
Ivy Bridge	2	1
Sandy Bridge	2	1

vcvttpd2dq

__m128i _mm256_cvttpd_epi32 (__m256d a)

Synopsis

__m128i _mm256_cvttpd_epi32 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq xmm, ymm
CPUID Flags: AVX

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR dst[MAX:128] := 0

Performance

Architecture	Latency	Throughput
Haswell	4	1
Ivy Bridge	4	1
Sandy Bridge	4	1

vcvttps2dq

__m256i _mm256_cvttps_epi32 (__m256 a)

Synopsis

__m256i _mm256_cvttps_epi32 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq ymm, ymm
CPUID Flags: AVX

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vdivpd

__m256d _mm256_div_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_div_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	35	25
Ivy Bridge	35	28
Sandy Bridge	43	44

vdivps

__m256 _mm256_div_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_div_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps ymm, ymm, ymm
CPUID Flags: AVX

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	21	13
Ivy Bridge	21	14
Sandy Bridge	29	28

vdpps

__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vdpps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.

Operation

DP(a[127:0], b[127:0], imm8[7:0]) { FOR j := 0 to 3 i := j*32 IF imm8[(4+j)%8] temp[i+31:i] := a[i+31:i] * b[i+31:i] ELSE temp[i+31:i] := 0 FI ENDFOR sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) FOR j := 0 to 3 i := j*32 IF imm8[j%8] tmpdst[i+31:i] := sum[31:0] ELSE tmpdst[i+31:i] := 0 FI ENDFOR RETURN tmpdst[127:0] } dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	14	2
Ivy Bridge	12	2
Sandy Bridge	12	2

__m256d _mm256_round_pd (__m256d a, int rounding)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	6	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vroundps

__m256 _mm256_round_ps (__m256 a, int rounding)

Synopsis

__m256 _mm256_round_ps (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	6	1
Ivy Bridge	3	1
Sandy Bridge	3	1

vrsqrtps

__m256 _mm256_rsqrt_ps (__m256 a)

Synopsis

__m256 _mm256_rsqrt_ps (__m256 a)
#include "immintrin.h"
Instruction: vrsqrtps ymm, ymm
CPUID Flags: AVX

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	7	1
Ivy Bridge	7	1
Sandy Bridge	7	1

vpsadbw

__m256i _mm256_sad_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sad_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsadbw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.

Operation

FOR j := 0 to 31 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR FOR j := 0 to 3 i := j*64 dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] dst[i+63:i+16] := 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	5	1

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	1	—
Sandy Bridge	1	—

vinsertf128

__m256d _mm256_set_m128d (__m128d hi, __m128d lo)

Synopsis

__m256d _mm256_set_m128d (__m128d hi, __m128d lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256d vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	1	—
Sandy Bridge	1	—

vinsertf128

__m256i _mm256_set_m128i (__m128i hi, __m128i lo)

Synopsis

__m256i _mm256_set_m128i (__m128i hi, __m128i lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256i vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	1	—
Sandy Bridge	1	—

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	1	—
Sandy Bridge	1	—

vinsertf128

__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)

Synopsis

__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256d vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	1	—
Sandy Bridge	1	—

vinsertf128

__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)

Synopsis

__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256i vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput
Haswell	3	—
Ivy Bridge	1	—
Sandy Bridge	1	—

…

__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0)

int _mm256_testz_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX

Zero the contents of all XMM or YMM registers.

Operation

YMM0[MAX:0] := 0 YMM1[MAX:0] := 0 YMM2[MAX:0] := 0 YMM3[MAX:0] := 0 YMM4[MAX:0] := 0 YMM5[MAX:0] := 0 YMM6[MAX:0] := 0 YMM7[MAX:0] := 0 IF 64-bit mode YMM8[MAX:0] := 0 YMM9[MAX:0] := 0 YMM10[MAX:0] := 0 YMM11[MAX:0] := 0 YMM12[MAX:0] := 0 YMM13[MAX:0] := 0 YMM14[MAX:0] := 0 YMM15[MAX:0] := 0 FI

vzeroupper

void _mm256_zeroupper (void)

Synopsis

void _mm256_zeroupper (void)
#include "immintrin.h"
Instruction: vzeroupper
CPUID Flags: AVX

Description

Zero the upper 128 bits of all YMM registers; the lower 128 bits of the registers are unmodified.

Operation

YMM0[MAX:128] := 0 YMM1[MAX:128] := 0 YMM2[MAX:128] := 0 YMM3[MAX:128] := 0 YMM4[MAX:128] := 0 YMM5[MAX:128] := 0 YMM6[MAX:128] := 0 YMM7[MAX:128] := 0 IF 64-bit mode YMM8[MAX:128] := 0 YMM9[MAX:128] := 0 YMM10[MAX:128] := 0 YMM11[MAX:128] := 0 YMM12[MAX:128] := 0 YMM13[MAX:128] := 0 YMM14[MAX:128] := 0 YMM15[MAX:128] := 0 FI

Performance

Architecture	Latency	Throughput
Haswell	0	1
Ivy Bridge	0	1
Sandy Bridge	0	1

Улучшения

Новая схема кодирования

Новые инструкции

Применение

Инструкции и примеры

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation

Performance

Synopsis

Description

Operation