PROWAREtech
Intel IA-32 Assembly Tutorial - A Guide to the Basics of x86 Assembly - Page 13
This assembly code was compiled with MS Visual C++ Express Edition 2005 using the MACRO ASSEMBLER for MS Visual C++ Express Edition 2005.
SSE/SSE2: Streaming SIMD Extensions
Architecture
SIMD stands for Single Instruction, Multiple Data. In the modern CPU and operating system, SSE is used for floating-point operations.
There are eight new 128-bit registers introduced with SSE: xmm0
, xmm1
, xmm2
, xmm3
, xmm4
, xmm5
, xmm6
and xmm7
— for x64 processors there are sixteen 128-bit registers! (xmm8
, xmm9
, xmm10
, xmm11
, xmm12
, xmm13
, xmm14
and xmm15
)
Unlike the MMX registers, these registers are not aliases of the x87 floating-point unit registers: st(0)
, st(1)
, st(2)
, st(3)
, st(4)
, st(5)
, st(6)
and st(7)
Just like other registers, XMM registers use Little Endian Order.
SSE supports single precision floating point (4 bytes); SSE2 supports double precision floating point (8 bytes), and integers.
SSE includes a new control register called MXCSR (Multimedia eXtension Control and Status Register).
Microsoft Specific
Microsoft's x64 calling convention is to pass floating-point data via SSE registers. The first four floating-point operands are passed in xmm0
, xmm1
, xmm2
and xmm3
. Additional operands are passed via the stack. So this is similar to the __fastcall
calling convention that uses registers to pass operands.
Floating-point values are returned in the lowest 32- or 64-bits of xmm0
.
Detecting SSE/SSE2 with CPUID
Here are procedures that check a processor for SSE capabilities using cpuid
. If using a 32-bit processor, check for CPUID support first then SSE/SSE2 support. All x64 processors support CPUID.
Check for the CPUID instruction:
TITLE 'extern "C" int __cdecl isCPUID();'
; int isCPUID(void)
; Returns 1 in eax when the CPUID instruction is available, otherwise 0.
; CPUID is available when software is able to toggle the ID flag (bit 21) of EFLAGS.
; The caller's EFLAGS image is restored before returning; only eax and ecx are modified.
.686P
.model FLAT
PUBLIC _isCPUID
_TEXT SEGMENT
_isCPUID PROC NEAR
    pushfd                      ; keep an untouched copy of EFLAGS on the stack
    pushfd
    pop     eax                 ; eax = original EFLAGS
    mov     ecx, eax            ; ecx = original EFLAGS, kept for the comparison
    xor     eax, 200000h        ; toggle bit 21 (ID flag)
    push    eax
    popfd                       ; try to write the toggled value into EFLAGS
    pushfd
    pop     eax                 ; eax = EFLAGS as the processor actually stored them
    popfd                       ; restore the caller's original EFLAGS
    xor     eax, ecx            ; set bits mark the flags that changed
    shr     eax, 21
    and     eax, 1              ; eax = 1 only when bit 21 could be toggled
    ret     0
_isCPUID ENDP
_TEXT ENDS
END
Check for MMX (not to be confused with XMM):
TITLE 'extern "C" int __cdecl isMMX();'
; int isMMX(void)
; Returns 1 in eax when CPUID leaf 1 reports MMX (EDX bit 23), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isMMX
_TEXT SEGMENT
_isMMX PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      edx, 23             ; CF = bit 23 of edx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isMMX ENDP
_TEXT ENDS
END
Check for SSE:
TITLE 'extern "C" int __cdecl isSSE();'
; int isSSE(void)
; Returns 1 in eax when CPUID leaf 1 reports SSE (EDX bit 25), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isSSE
_TEXT SEGMENT
_isSSE PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      edx, 25             ; CF = bit 25 of edx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isSSE ENDP
_TEXT ENDS
END
Check for SSE2:
TITLE 'extern "C" int __cdecl isSSE2();'
; int isSSE2(void)
; Returns 1 in eax when CPUID leaf 1 reports SSE2 (EDX bit 26), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isSSE2
_TEXT SEGMENT
_isSSE2 PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      edx, 26             ; CF = bit 26 of edx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isSSE2 ENDP
_TEXT ENDS
END
Check for SSE3:
TITLE 'extern "C" int __cdecl isSSE3();'
; int isSSE3(void)
; Returns 1 in eax when CPUID leaf 1 reports SSE3 (ECX bit 0), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isSSE3
_TEXT SEGMENT
_isSSE3 PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      ecx, 0              ; CF = bit 0 of ecx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isSSE3 ENDP
_TEXT ENDS
END
Check for SSSE3:
TITLE 'extern "C" int __cdecl isSSSE3();'
; int isSSSE3(void)
; Returns 1 in eax when CPUID leaf 1 reports SSSE3 (ECX bit 9), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isSSSE3
_TEXT SEGMENT
_isSSSE3 PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      ecx, 9              ; CF = bit 9 of ecx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isSSSE3 ENDP
_TEXT ENDS
END
Check for SSE4.1:
TITLE 'extern "C" int __cdecl isSSE41();'
; int isSSE41(void)
; Returns 1 in eax when CPUID leaf 1 reports SSE4.1 (ECX bit 19), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isSSE41
_TEXT SEGMENT
_isSSE41 PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      ecx, 19             ; CF = bit 19 of ecx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isSSE41 ENDP
_TEXT ENDS
END
Check for SSE4.2:
TITLE 'extern "C" int __cdecl isSSE42();'
; int isSSE42(void)
; Returns 1 in eax when CPUID leaf 1 reports SSE4.2 (ECX bit 20), otherwise 0.
; CPUID overwrites ebx, so ebx is saved and restored for the caller.
.686P
.model FLAT
PUBLIC _isSSE42
_TEXT SEGMENT
_isSSE42 PROC NEAR
    push    ebx
    mov     eax, 1              ; request CPUID leaf 1 (feature flags)
    cpuid
    xor     eax, eax            ; clear the result before capturing the flag
    bt      ecx, 20             ; CF = bit 20 of ecx
    setc    al                  ; eax = 0 or 1
    pop     ebx
    ret     0
_isSSE42 ENDP
_TEXT ENDS
END
SSE/SSE2 Instructions
Instructions for Data Movement
Here are the ten SSE data movement instructions covered below: movd
, movq
, movdqa
, movdqu
, movaps
, movups
, movapd
, movupd
, movss
and movsd
.
MOVD
— Move DWORD
Although it suggests only 32-bits, QWORD (64-bit) values are also moved as determined by the source operand (and if a 64-bit operating system).
Parameters: [xmm/register/memory], [xmm/register/memory] — one must be an XMM register and both cannot be XMM registers.
If the destination operand is an XMM register and the source is an x86 register then the remaining upper bits of the XMM register are zeroed-out.
TITLE 'extern "C" int __cdecl sse_example();'
; int sse_example(void)
; Demonstrates MOVD: copies a DWORD from a general-purpose register into xmm0.
; Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_example
_TEXT SEGMENT
_sse_example PROC NEAR
    mov     eax, 1234
    movd    xmm0, eax           ; low DWORD of xmm0 = 1234, the rest of xmm0 is zeroed
    ; The two forms below are not valid MOVD operand combinations and are rejected
    ; by the assembler, so they are kept only as commented-out illustrations:
    ; movd  xmm2, xmm1          ; cannot do this - both operands are XMM registers
    ; movd  xmm3, 1234          ; cannot do this - the source is an immediate
    mov     eax, 1
    ret     0
_sse_example ENDP
_TEXT ENDS
END
MOVQ
— Move QWORD
Moves QWORD values from memory or another XMM register.
Parameters: [xmm/64-bits-memory], [xmm/64-bits-memory] — one or both must be an XMM register.
The upper 64 bits of a destination XMM register are always zeroed-out.
TITLE 'extern "C" int __cdecl sse_example();'
; int sse_example(void)
; Demonstrates MOVQ between XMM registers: the low QWORD is copied and the
; upper QWORD of the destination is cleared. Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_example
_TEXT SEGMENT
_sse_example PROC NEAR
    movq    xmm1, xmm1          ; keeps the low QWORD of xmm1, clears its upper 64 bits
    mov     ecx, 1234
    movd    xmm0, ecx           ; xmm0 = 1234 in the low DWORD
    movq    xmm2, xmm0          ; xmm2 = low QWORD of xmm0, upper 64 bits cleared
    mov     eax, 1
    ret     0
_sse_example ENDP
_TEXT ENDS
END
MOVDQA
— Move Aligned Double QWORD (SSE2)
Moves Double QWORD values from memory or another XMM register.
Parameters: [xmm/128-bits-memory], [xmm/128-bits-memory] — one or both must be an XMM register.
Use when the data are aligned to 16 bytes, or a protection fault will occur.
MOVDQU
— Move Unaligned Double QWORD (SSE2)
Moves Double QWORD values from memory or another XMM register even if they are unaligned.
Parameters: [xmm/128-bits-memory], [xmm/128-bits-memory] — one or both must be an XMM register.
Use only if unsure about the alignment of the data as there is a slight performance penalty when using this instruction.
XMMWORD
is a 16-byte value.
TITLE 'extern "C" int __cdecl sse_copy_example(char dest[], char src[]);'
; int sse_copy_example(char dest[], char src[])
; Copies exactly 16 bytes from src to dest through xmm0 using unaligned moves.
; Both buffers should be at least 16 bytes long. Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_copy_example
_TEXT SEGMENT
_sse_copy_example PROC NEAR
    mov     edx, [esp+8]                    ; edx = src  (second argument)
    mov     ecx, [esp+4]                    ; ecx = dest (first argument)
    movdqu  xmm0, XMMWORD PTR [edx]         ; load 16 bytes from src
    movdqu  XMMWORD PTR [ecx], xmm0         ; store them to dest
    mov     eax, 1
    ret     0
_sse_copy_example ENDP
_TEXT ENDS
END
Here is an example using movdqu
to copy memory in 128-bit segments using the SSE register xmm0
. There is no division operation.
TITLE 'extern "C" int __cdecl memcopy128(char dest[], const char *src, unsigned int length);'
; int memcopy128(char dest[], const char *src, unsigned int length)
; Copies length bytes from src to dest: first in 16-byte chunks through xmm0
; (MOVDQU, so neither buffer has to be aligned), then byte by byte for the
; remaining length mod 16 bytes.
; In:    [esp+4] = dest, [esp+8] = src, [esp+12] = length (offsets before the push below)
; Out:   eax = dest + length (the advanced destination pointer)
; Uses:  ecx, edx, xmm0, flags; ebx is saved and restored.
; The loops use dec/jnz rather than LOOPNZ: LOOPNZ also tests ZF, which here would
; come from the preceding pointer increment and is unrelated to the byte count.
.686P
.xmm
.model FLAT
PUBLIC _memcopy128
_TEXT SEGMENT
_memcopy128 PROC NEAR
    push    ebx                             ; ebx is callee-saved
    mov     ecx, DWORD PTR [esp+(3+1)*4]    ; ecx = length
    mov     edx, DWORD PTR [esp+(2+1)*4]    ; edx = src
    mov     eax, DWORD PTR [esp+(1+1)*4]    ; eax = dest
    mov     ebx, ecx
    and     ebx, 15                         ; ebx = length mod 16 (tail bytes)
    shr     ecx, 4                          ; ecx = number of whole 16-byte chunks
    test    ecx, ecx
    jz      copy_tail                       ; fewer than 16 bytes: only the tail loop runs
copy16bytes:
    movdqu  xmm0, XMMWORD PTR [edx]         ; load 16 bytes from src
    movdqu  XMMWORD PTR [eax], xmm0         ; store 16 bytes to dest
    add     edx, 16
    add     eax, 16
    dec     ecx
    jnz     copy16bytes                     ; repeat while chunks remain
copy_tail:
    test    ebx, ebx
    jz      done                            ; no remainder: finished
copybytes:
    mov     cl, BYTE PTR [edx]              ; ecx is free now, cl is the byte scratch
    mov     BYTE PTR [eax], cl
    inc     edx
    inc     eax
    dec     ebx
    jnz     copybytes                       ; repeat while tail bytes remain
done:
    pop     ebx
    ret     0
_memcopy128 ENDP
_TEXT ENDS
END
Instructions for Moving Floating Point Values
SSE introduces the XMMWORD, a 128-bit value.
MOVAPS
— Move Aligned Packed Singles
Parameters: [xmm/128-bits-memory], [xmm/128-bits-memory] — one or both must be an XMM register.
MOVAPD
— Move Aligned Packed Doubles (SSE2)
Parameters: [xmm/128-bits-memory], [xmm/128-bits-memory] — one or both must be an XMM register.
MOVUPS
— Move Unaligned Packed Singles
Parameters: [xmm/128-bits-memory], [xmm/128-bits-memory] — one or both must be an XMM register.
MOVUPD
— Move Unaligned Packed Doubles (SSE2)
Parameters: [xmm/128-bits-memory], [xmm/128-bits-memory] — one or both must be an XMM register.
Instructions for Moving Scalar
MOVSS
— Move Scalar Single
Parameters: [xmm/32-bits-memory], [xmm/32-bits-memory] — one or both must be an XMM register.
If both operands are SSE registers then the top is not changed, otherwise, the last 96 bits of the register are zeroed-out.
TITLE 'extern "C" int __cdecl sse_example(float *f);'
; int sse_example(float *f)
; Demonstrates MOVSS with a memory source: loads the single at *f into the low
; 32 bits of xmm0. Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_example
_TEXT SEGMENT
_sse_example PROC NEAR
    mov     edx, [esp+4]                ; edx = f (first argument)
    movss   xmm0, DWORD PTR [edx]       ; memory source: the upper 96 bits of xmm0 are zeroed
    mov     eax, 1
    ret     0
_sse_example ENDP
_TEXT ENDS
END
MOVSD
— Move Scalar Double
Parameters: [xmm/64-bits-memory], [xmm/64-bits-memory] — one or both must be an XMM register.
If both operands are SSE registers then the top is not changed, otherwise, the last 64 bits of the register are zeroed-out.
TITLE 'extern "C" int __cdecl sse_example(double *d);'
; int sse_example(double *d)
; Demonstrates MOVSD with a memory source: loads the double at *d into the low
; 64 bits of xmm0. Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_example
_TEXT SEGMENT
_sse_example PROC NEAR
    mov     edx, [esp+4]                ; edx = d (first argument)
    movsd   xmm0, QWORD PTR [edx]       ; memory source: the upper 64 bits of xmm0 are zeroed
    mov     eax, 1
    ret     0
_sse_example ENDP
_TEXT ENDS
END
Instructions for Packed Arithmetic
Instructions addps
, addpd
, subps
, subpd
, mulps
, mulpd
, divps
, divpd
, sqrtps
, sqrtpd
, maxps
, maxpd
, minps
, and minpd
are covered here.
All of these instructions take the same parameters: [xmm], [xmm/memory] — the first is always an SSE register plus it's where the result of the operation is stored.
The memory operands are 128-bit for these packed (which means all the bits in the register are affected) instructions.
ADDPS
— Add Packed Singles
ADDPD
— Add Packed Doubles (SSE2)
SUBPS
— Subtract Packed Singles
SUBPD
— Subtract Packed Doubles (SSE2)
MULPS
— Multiply Packed Singles
MULPD
— Multiply Packed Doubles (SSE2)
DIVPS
— Divide Packed Singles
DIVPD
— Divide Packed Doubles (SSE2)
SQRTPS
— Square Root Packed Singles
SQRTPD
— Square Root Packed Doubles (SSE2)
MAXPS
— Maximum of Packed Singles
MAXPD
— Maximum of Packed Doubles (SSE2)
MINPS
— Minimum of Packed Singles
MINPD
— Minimum of Packed Doubles (SSE2)
Example of mulps
:
TITLE 'extern "C" int __cdecl sse_multiply_example(float op1[], float op2[]);'
; int sse_multiply_example(float op1[], float op2[])
; Multiplies four packed singles element-wise: op1[i] = op1[i] * op2[i] for i = 0..3.
; Both arrays must hold at least four floats; neither has to be 16-byte aligned.
; Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_multiply_example
_TEXT SEGMENT
_sse_multiply_example PROC NEAR
    mov     eax, [esp+4]        ; eax = op1 (first argument)
    mov     edx, [esp+8]        ; edx = op2 (second argument)
    movups  xmm0, [eax]         ; xmm0 = op1[0..3]
    movups  xmm1, [edx]         ; unaligned load of op2: MULPS with a memory operand
                                ; faults unless that memory is 16-byte aligned
    mulps   xmm0, xmm1          ; xmm0 = op1 * op2, element by element
    movups  [eax], xmm0         ; write the products back to op1
    mov     eax, 1              ; defined return value for the int prototype
    ret     0
_sse_multiply_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE();
extern "C" int __cdecl sse_multiply_example(float op1[], float op2[]);

// Writes each of the count values followed by a tab, then ends the line.
static void print_row(const float values[], int count)
{
    int i = 0;
    while(i < count)
    {
        std::cout << values[i] << "\t";
        ++i;
    }
    std::cout << std::endl;
}

// Prints op1, multiplies it element-wise by op2 with the SSE routine, and prints
// op1 again. Does nothing when CPUID or SSE is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE())
        return 0;
    float op1[] = {4,-3,2,-1};
    float op2[] = {0.5,0.5,0.5,0.5};
    const int count = sizeof(op1) / sizeof(op1[0]);
    print_row(op1, count);
    sse_multiply_example(op1, op2);
    print_row(op1, count);
    return 0;
}
Instructions for Scalar Arithmetic
Instructions addss
, addsd
, subss
, subsd
, mulss
, mulsd
, divss
, divsd
, sqrtss
, sqrtsd
, maxss
, maxsd
, minss
and minsd
are covered here.
All of these instructions take the same parameters: [xmm], [xmm/memory] — the first is always an SSE register and it's also where the result of the operation is stored.
The memory operands are 32- (single) or 64-bit (double) for these scalar (which means only the first 32 or 64 bits are affected) instructions.
ADDSS
— Add Scalar Single
ADDSD
— Add Scalar Double (SSE2)
SUBSS
— Subtract Scalar Single
SUBSD
— Subtract Scalar Double (SSE2)
MULSS
— Multiply Scalar Single
MULSD
— Multiply Scalar Double (SSE2)
DIVSS
— Divide Scalar Single
DIVSD
— Divide Scalar Double (SSE2)
SQRTSS
— Square Root Scalar Single
SQRTSD
— Square Root Scalar Double (SSE2)
MAXSS
— Maximum of Scalar Single
MAXSD
— Maximum of Scalar Double (SSE2)
MINSS
— Minimum of Scalar Single
MINSD
— Minimum of Scalar Double (SSE2)
When are Instructions SSE2?
By now, it may be obvious that all the instructions dealing with doubles are SSE2 instructions. This is true. Also, going forward, all the instructions that deal with integers/DWORD values are SSE2 instructions.
Instructions for Data Conversion
First, about rounding. Rounding is done by setting bits 13 and 14 of the MXCSR register. Bits 00
round to the nearest integer, which is the default behavior. Bits 01
round down. Bits 10
round up. Bits 11
truncate which is the behavior of C/C++. Use the instruction stmxcsr
to copy the MXCSR register to a memory location and the instruction ldmxcsr
to load the MXCSR register from a memory location. MXCSR is covered in greater detail below.
The convert instructions all begin with CVT then the source type then a 2 then the destination type — CVTxx2xx. For example, CVTPS2PD converts packed singles to packed doubles.
-
Destination and Source Types:
- PS — Packed Singles
- PD — Packed Doubles
- SS — Scalar Single
- SD — Scalar Double
- SI — x86 registers (32- or 64-bit)
- PI — MMX register holding DWORD values
- DQ — SSE register with packed DWORD values
Converting Doubles and Singles in SSE Register and Memory
CVTPS2PD
— parameters: [xmm], [xmm/64-bits-memory] — converts the two lowest singles of the source to doubles and stores them in XMM destination register.
CVTPD2PS
— parameters: [xmm], [xmm/128-bits-memory] — converts the two doubles of the source to singles and stores in lowest 64-bits of XMM destination register while zeroing-out the last 64-bits of the destination register.
CVTSD2SS
— parameters: [xmm], [xmm/64-bits-memory] — converts the lowest double of the source to a single and stores it in lowest 32-bits of XMM destination register while leaving the last 96-bits of the destination register unchanged.
CVTSS2SD
— parameters: [xmm], [xmm/32-bits-memory] — converts the lowest single of the source to a double and stores it in lowest 64-bits of XMM destination register while leaving the last 64-bits of the destination register unchanged.
Note: DQ
is used for integer/DWORD values.
CVTDQ2PS
— parameters: [xmm], [xmm/128-bits-memory] — converts four DWORD values of the source to four singles and stores them in XMM destination register.
CVTDQ2PD
— parameters: [xmm], [xmm/64-bits-memory] — converts the two lowest DWORD values of the source to two doubles and stores them in XMM destination register.
CVTPS2DQ
— parameters: [xmm], [xmm/128-bits-memory] — converts four singles of the source to four DWORD values and stores them in XMM destination register while using the rounding function set in the MXCSR register.
Example: CVTPS2DQ
TITLE 'extern "C" int __cdecl sse_cvt_example(int dest[], float src[]);'
; int sse_cvt_example(int dest[], float src[])
; Converts the four singles in src to four DWORD integers in dest with CVTPS2DQ,
; which rounds according to the rounding mode in MXCSR.
; src and dest must each hold at least four elements. Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_cvt_example
_TEXT SEGMENT
_sse_cvt_example PROC NEAR
    mov     edx, [esp+8]        ; edx = src  (second argument)
    mov     ecx, [esp+4]        ; ecx = dest (first argument)
    movdqu  xmm0, [edx]         ; xmm0 = four singles from src
    cvtps2dq xmm1, xmm0         ; xmm1 = the four singles rounded to DWORD integers
    movdqu  [ecx], xmm1         ; store xmm1 (the integers) to dest
    mov     eax, 1
    ret     0
_sse_cvt_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_cvt_example(int dest[], float src[]);

// Converts four floats to integers with the SSE2 routine and prints each
// "source = converted" pair on its own line. Does nothing when CPUID or SSE2
// is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE2())
        return 0;
    float src[] = {1234.56f, 123.45f, 12.3f, 1.2f};
    int dest[] = {0, 0, 0, 0};
    sse_cvt_example(dest, src);
    const int count = sizeof(src) / sizeof(src[0]);
    int i = 0;
    while(i < count)
    {
        std::cout << src[i] << " = " << dest[i] << std::endl;
        ++i;
    }
    return 0;
}
The program output:
1234.56 = 1235 123.45 = 123 12.3 = 12 1.2 = 1
CVTTPS2DQ
— parameters: [xmm], [xmm/128-bits-memory] — truncates and converts four singles of the source to four DWORD values and stores them in XMM destination register.
CVTPD2DQ
— parameters: [xmm], [xmm/128-bits-memory] — converts two doubles of the source to two DWORD values and stores them in XMM destination register while zeroing-out the last 64-bits of the destination register and using the rounding function set in the MXCSR register.
CVTTPD2DQ
— parameters: [xmm], [xmm/128-bits-memory] — truncates and converts two doubles of the source to two DWORD values and stores them in XMM destination register while zeroing-out the last 64-bits of the destination register.
Converting Between SSE and MMX/Memory
Note: PI
means packed integers.
CVTPI2PS
— parameters: [xmm], [MMX/64-bits-memory] — converts two DWORD values of the source to singles and stores them in the lowest 64-bits of the XMM destination register while leaving the last 64-bits unchanged.
Example: CVTPI2PS
TITLE 'extern "C" int __cdecl sse_cvt_example(float dest[], int src[]);'
; int sse_cvt_example(float dest[], int src[])
; Converts the two DWORD integers in src to two singles in dest with CVTPI2PS,
; going through the MMX register mm0. EMMS is executed before returning.
; src and dest must each hold at least two elements. Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_cvt_example
_TEXT SEGMENT
_sse_cvt_example PROC NEAR
    mov     edx, [esp+8]            ; edx = src  (second argument)
    mov     ecx, [esp+4]            ; ecx = dest (first argument)
    movq    mm0, [edx]              ; mm0 = two integers from src
    cvtpi2ps xmm0, mm0              ; low 64 bits of xmm0 = two singles; upper 64 bits unchanged
    movsd   QWORD PTR [ecx], xmm0   ; store the low 64 bits (both singles) to dest
    emms                            ; leave the MMX state before returning
    mov     eax, 1
    ret     0
_sse_cvt_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_cvt_example(float dest[], int src[]);

// Converts two integers to floats with the CVTPI2PS routine and prints each
// "source = converted" pair on its own line. Does nothing when CPUID or SSE2
// is not reported as available.
int main(int argc, char *argv[])
{
    if(isCPUID() && isSSE2())
    {
        int src[] = {1234, 123};
        float dest[] = {0, 0};
        sse_cvt_example(dest, src);
        // Element count is taken from the array's own element type; src holds ints,
        // so dividing by sizeof(float) only worked because both types share a size.
        const int count = sizeof(src) / sizeof(src[0]);
        for(int i = 0; i < count; i++)
        {
            std::cout << src[i] << " = " << dest[i] << std::endl;
        }
    }
    return 0;
}
CVTPI2PD
— parameters: [xmm], [MMX/64-bits-memory] — converts two DWORD values of the source to doubles and stores them in the XMM destination register; the two doubles fill all 128 bits of the register.
CVTPS2PI
— parameters: [MMX], [xmm/64-bits-memory] — converts two lowest singles of the source to DWORD values and stores them in the MMX destination register while using the rounding function set in the MXCSR register.
CVTTPS2PI
— parameters: [MMX], [xmm/64-bits-memory] — truncates and converts two lowest singles of the source to DWORD values and stores them in the MMX destination register.
CVTPD2PI
— parameters: [MMX], [xmm/128-bits-memory] — converts two doubles of the source to DWORD values and stores them in the MMX destination register while using the rounding function set in the MXCSR register.
CVTTPD2PI
— parameters: [MMX], [xmm/128-bits-memory] — truncates and converts two doubles of the source to DWORD values and stores them in the MMX destination register.
Converting Between SSE and x86/Memory
Note: SI
means scalar integers.
CVTSI2SS
— parameters: [xmm], [register/32-bits-memory/64-bits-memory] — converts an x86 register or memory operand to a single and stores it in the lowest 32-bits of the XMM destination register while leaving the last 96-bits unchanged.
CVTSI2SD
— parameters: [xmm], [register/32-bits-memory/64-bits-memory] — converts an x86 register or memory operand to a double and stores it in the lowest 64-bits of the XMM destination register while leaving the last 64-bits unchanged.
CVTSS2SI
— parameters: [register (32- or 64-bit)], [xmm/32-bits-memory] — converts the lowest single of the source to an integer/DWORD value and stores it in the destination x86 register while using the rounding function set in the MXCSR register.
CVTTSS2SI
— parameters: [register (32- or 64-bit)], [xmm/32-bits-memory] — truncates and converts the lowest single of the source to an integer/DWORD value and stores it in the destination x86 register.
CVTSD2SI
— parameters: [register (32- or 64-bit)], [xmm/64-bits-memory] — converts the lowest double of the source to an integer/DWORD value and stores it in the destination x86 register while using the rounding function set in the MXCSR register.
CVTTSD2SI
— parameters: [register (32- or 64-bit)], [xmm/64-bits-memory] — truncates and converts the lowest double of the source to an integer/DWORD value and stores it in the destination x86 register.
Instructions for Shuffle
The shuffle instructions are used to swap singles and doubles around. These instructions take three operands with the third being called the immediate operand.
SHUFPD
— parameters: [xmm], [xmm/128-bits-memory], [imm8] — shuffles the doubles from the first and second operands into the first operand, and the order depends on the value of the immediate operand.
There are only two bits for the immediate operand so it has a maximum of four values: 0, 1, 2 or 3
This is among the simplest of the shuffle instructions.
Example: SHUFPD
TITLE 'extern "C" int __cdecl sse_shuffle_example(double d1[], double d2[], int shuffle);'
; int sse_shuffle_example(double d1[], double d2[], int shuffle)
; Loads two doubles from d1 (into xmm0) and two from d2 (into xmm1), executes
; SHUFPD xmm0, xmm1, imm with imm = shuffle when shuffle is 1, 2 or 3 and imm = 0
; for any other value, then stores xmm0 back to d1. Always returns 1 in eax.
; SHUFPD takes its selector as an immediate, hence one instruction per value.
.686P
.xmm
.model FLAT
PUBLIC _sse_shuffle_example
_TEXT SEGMENT
_sse_shuffle_example PROC NEAR
    mov     edx, [esp+8]            ; edx = d2 (second argument)
    movupd  xmm1, [edx]
    mov     edx, [esp+4]            ; edx = d1 (first argument), kept for the store below
    movupd  xmm0, [edx]
    mov     eax, [esp+12]           ; eax = shuffle (third argument)
    dec     eax
    jz      select1                 ; shuffle == 1
    dec     eax
    jz      select2                 ; shuffle == 2
    dec     eax
    jz      select3                 ; shuffle == 3
    shufpd  xmm0, xmm1, 0           ; every other value is treated as 0
    jmp     store
select1:
    shufpd  xmm0, xmm1, 1
    jmp     store
select2:
    shufpd  xmm0, xmm1, 2
    jmp     store
select3:
    shufpd  xmm0, xmm1, 3
store:
    movupd  [edx], xmm0             ; write the shuffled pair back to d1
    mov     eax, 1
    ret     0
_sse_shuffle_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_shuffle_example(double d1[], double d2[], int shuffle);

// Interactive SHUFPD demo: prints two pairs of doubles, asks for a selector
// (0-3), shuffles the pairs with the SSE2 routine and prints the result.
// The loop ends when -1 is entered or when reading the selector fails.
// Does nothing when CPUID or SSE2 is not reported as available.
int main(int argc, char *argv[])
{
    if(isCPUID() && isSSE2())
    {
        while(true)
        {
            double d1[] = {1.2, 9.9};
            double d2[] = {5.4, 3.5};
            std::cout << "[" << d1[0] << ", " << d1[1] << "]" << std::endl;
            std::cout << "[" << d2[0] << ", " << d2[1] << "]" << std::endl;
            int shuffle = -1;
            std::cout << "Enter 0-3 for the shuffle order (-1 to quite): ";
            // Also stop when extraction fails (non-numeric input or end of input):
            // the stream then stays in a failed state, every later read fails too,
            // and the loop would otherwise never terminate.
            if(!(std::cin >> shuffle) || shuffle == -1)
                break;
            switch(shuffle)
            {
            case 0:
                std::cout << "binary: 00 = ";
                break;
            case 1:
                std::cout << "binary: 01 = ";
                break;
            case 2:
                std::cout << "binary: 10 = ";
                break;
            case 3:
                std::cout << "binary: 11 = ";
                break;
            default:
                continue; // out of range: ask again
            }
            sse_shuffle_example(d1, d2, shuffle);
            std::cout << "[" << d1[0] << ", " << d1[1] << "]" << std::endl << std::endl;
        }
    }
    return 0;
}
Here is the result of the program:
[1.2, 9.9] [5.4, 3.5] Enter 0-3 for the shuffle order (-1 to quite): 0 binary: 00 = [1.2, 5.4] [1.2, 9.9] [5.4, 3.5] Enter 0-3 for the shuffle order (-1 to quite): 1 binary: 01 = [9.9, 5.4] [1.2, 9.9] [5.4, 3.5] Enter 0-3 for the shuffle order (-1 to quite): 2 binary: 10 = [1.2, 3.5] [1.2, 9.9] [5.4, 3.5] Enter 0-3 for the shuffle order (-1 to quite): 3 binary: 11 = [9.9, 3.5] [1.2, 9.9] [5.4, 3.5] Enter 0-3 for the shuffle order (-1 to quite): -1
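At first it doesn't seem to make a lot of sense because the results look reversed from what's expected based on the bits. The bits of the immediate operand are numbered from the right: bit 0 (the rightmost) selects which double of the first operand becomes the low double of the result, and bit 1 selects which double of the second operand becomes the high double of the result.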
SHUFPS
— parameters: [xmm], [xmm/128-bits-memory], [imm8] — shuffles the singles from the first and second operands into the first operand, and the order depends on the value of the immediate operand.
Because there are four singles per operand, this instruction is considerably more complex than shufpd
.
Consider this code:
shufps xmm0, xmm1, 01101100b
Consider this figure:
xmm0 | 1.0 | 2.0 | 3.0 | 4.0 |
xmm1 | 5.0 | 6.0 | 7.0 | 8.0 |
xmm0 | 7.0 | 6.0 | 1.0 | 4.0 |
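In these figures the singles are drawn in register order: index 3 is the leftmost column and index 0 is the rightmost column. The bits of the immediate operand are likewise counted from the right. The first two bits (bits 0 and 1) determine which of the singles (index 0 to 3) from the destination goes into the first single (index 0) of the destination. The second two bits (bits 2 and 3) determine which of the singles (index 0 to 3) from the destination goes into the second single (index 1) of the destination. The third two bits (bits 4 and 5) determine which of the singles (index 0 to 3) from the source goes into the third single (index 2) of the destination. The fourth two bits (bits 6 and 7) determine which of the singles (index 0 to 3) from the source goes into the fourth single (index 3) of the destination.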
Do not forget that binary 00 is decimal 0, binary 01 is decimal 1, binary 10 is decimal 2 and binary 11 is decimal 3!
Here is another example:
shufps xmm0, xmm1, 01111101b
xmm0 | 1.0 | 2.0 | 3.0 | 4.0 |
xmm1 | 5.0 | 6.0 | 7.0 | 8.0 |
xmm0 | 7.0 | 5.0 | 1.0 | 3.0 |
And yet another example:
shufps xmm0, xmm1, 00011011b
xmm0 | 1.0 | 2.0 | 3.0 | 4.0 |
xmm1 | 5.0 | 6.0 | 7.0 | 8.0 |
xmm0 | 8.0 | 7.0 | 2.0 | 1.0 |
SHUFPS
can be used to easily reverse an array of four singles by specifying the same XMM register for both the source and destination. Notice that only SSE is required for this nice instruction because it only deals with singles. Run this code:
TITLE 'extern "C" int __cdecl sse_shuffle_reverse_example(float f4[]);'
; int sse_shuffle_reverse_example(float f4[])
; Reverses the order of the four singles in f4 in place, using SHUFPS with the
; same register as both operands. f4 must hold at least four floats.
; Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_shuffle_reverse_example
_TEXT SEGMENT
_sse_shuffle_reverse_example PROC NEAR
    mov     edx, [esp+4]                ; edx = f4 (first argument)
    movups  xmm0, [edx]
    shufps  xmm0, xmm0, 00011011b       ; element 0 <- 3, 1 <- 2, 2 <- 1, 3 <- 0
    movups  [edx], xmm0
    mov     eax, 1
    ret     0
_sse_shuffle_reverse_example ENDP
_TEXT ENDS
END
Here is the driver code (notice that only a check for SSE instructions is done):
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE();
extern "C" int __cdecl sse_shuffle_reverse_example(float f4[]);

// Writes each of the count values followed by a space, then ends the line.
static void print_row(const float values[], int count)
{
    int i = 0;
    while(i < count)
    {
        std::cout << values[i] << " ";
        ++i;
    }
    std::cout << std::endl;
}

// Prints four floats, reverses them with the SSE routine and prints them again.
// Does nothing when CPUID or SSE is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE())
        return 0;
    float f4[] = {4.5, 3.5, 2.5, 1.5};
    const int count = sizeof(f4) / sizeof(f4[0]);
    print_row(f4, count);
    sse_shuffle_reverse_example(f4);
    print_row(f4, count);
    return 0;
}
PSHUFW
— deals with the 64-bit MMX registers and takes three operands — shuffles WORD values in the MMX registers, the bits in the immediate operand correspond to the WORD values in the source and where they go to in the destination; note: use emms
when done with the MMX registers.
PSHUFD
— parameters: [xmm], [xmm/128-bits-memory], [imm8] — shuffles integers/DWORD values in the SSE registers, the bits in the immediate operand correspond to the DWORD values in the source and where they go to in the destination.
This code will reverse the DWORD values:
TITLE 'extern "C" int __cdecl sse_shuffle_reverse_dwords_example(int i4[]);'
; int sse_shuffle_reverse_dwords_example(int i4[])
; Reverses the order of the four DWORD values in i4 in place using PSHUFD.
; i4 must be at least 16 bytes (4 dwords). Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_shuffle_reverse_dwords_example
_TEXT SEGMENT
_sse_shuffle_reverse_dwords_example PROC NEAR
    mov     edx, [esp+4]                ; edx = i4 (first argument)
    movdqu  xmm0, [edx]                 ; unaligned 16-byte load of the four dwords
    pshufd  xmm0, xmm0, 00011011b       ; dword 0 <- 3, 1 <- 2, 2 <- 1, 3 <- 0
    movdqu  [edx], xmm0
    mov     eax, 1
    ret     0
_sse_shuffle_reverse_dwords_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_shuffle_reverse_dwords_example(int i4[]);

// Writes each of the count values followed by a tab, then ends the line.
static void print_row(const int values[], int count)
{
    int i = 0;
    while(i < count)
    {
        std::cout << values[i] << "\t";
        ++i;
    }
    std::cout << std::endl;
}

// Prints four integers, reverses them with the SSE2 routine and prints them again.
// Does nothing when CPUID or SSE2 is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE2())
        return 0;
    int i4[] = {400,300,200,100};
    const int count = sizeof(i4) / sizeof(i4[0]);
    print_row(i4, count);
    sse_shuffle_reverse_dwords_example(i4);
    print_row(i4, count);
    return 0;
}
PSHUFLW
— parameters: [xmm], [xmm/128-bits-memory], [imm8] — shuffles the low WORD values (index values 0 to 3) in the SSE registers, the bits in the immediate operand correspond to the low WORD values in the source and where they go to in the destination.
PSHUFHW
— parameters: [xmm], [xmm/128-bits-memory], [imm8] — shuffles the high WORD values (index values 4 to 7) in the SSE registers, the bits in the immediate operand correspond to the high WORD values in the source and where they go to in the destination.
This code will reverse the WORD values:
TITLE 'extern "C" int __cdecl sse_shuffle_reverse_words_example(short s8[]);'
; int sse_shuffle_reverse_words_example(short s8[])
; Reverses the order of the eight WORD values in s8 in place.
; The words inside each 64-bit half are reversed first, then the two halves are
; exchanged; together this reverses all eight words.
; s8 must be at least 16 bytes (8 words). Always returns 1 in eax.
.686P
.xmm
.model FLAT
PUBLIC _sse_shuffle_reverse_words_example
_TEXT SEGMENT
_sse_shuffle_reverse_words_example PROC NEAR
    mov     edx, [esp+4]                ; edx = s8 (first argument)
    movdqu  xmm0, [edx]                 ; unaligned 16-byte load of the eight words
    pshuflw xmm0, xmm0, 00011011b       ; reverse words 0..3
    pshufhw xmm0, xmm0, 00011011b       ; reverse words 4..7
    pshufd  xmm0, xmm0, 01001110b       ; exchange the low and high 64-bit halves
    movdqu  [edx], xmm0
    mov     eax, 1
    ret     0
_sse_shuffle_reverse_words_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_shuffle_reverse_words_example(short s8[]);

// Writes each of the count values followed by a space, then ends the line.
static void print_row(const short values[], int count)
{
    int i = 0;
    while(i < count)
    {
        std::cout << values[i] << " ";
        ++i;
    }
    std::cout << std::endl;
}

// Prints eight shorts, reverses them with the SSE2 routine and prints them again.
// Does nothing when CPUID or SSE2 is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE2())
        return 0;
    short s8[] = {8,7,6,5,4,3,2,1};
    const int count = sizeof(s8) / sizeof(s8[0]);
    print_row(s8, count);
    sse_shuffle_reverse_words_example(s8);
    print_row(s8, count);
    return 0;
}
There are more shuffle instructions not dealt with here because they require SSSE3, but they are PSHUFB
for MMX and the same for SSE registers. Processors that support SSSE3 are relatively new so be sure to test that the processor is SSSE3 capable.
Instructions for Moving the Sign Bitmask from SSE to x86 Registers
There are only three instructions that help here:
MOVMSKPS
— parameters: [register], [xmm] — Extract sign bitmask of packed singles or integers; sets four bits: 1 for negative and 0 for positive.
Example usage:
TITLE 'extern "C" int __cdecl sse_sign_mask_example(int i4[]);'
; int sse_sign_mask_example(int i4[])
; Returns in eax a 4-bit mask built by MOVMSKPS from the four DWORD values in i4:
; bit n is the sign bit of i4[n] (1 = negative, 0 = non-negative).
; i4 must be at least 16 bytes (4 dwords).
.686P
.xmm
.model FLAT
PUBLIC _sse_sign_mask_example
_TEXT SEGMENT
_sse_sign_mask_example PROC NEAR
    mov     edx, [esp+4]        ; edx = i4 (first argument)
    movdqu  xmm0, [edx]         ; unaligned 16-byte load of the four dwords
    movmskps eax, xmm0          ; eax = sign bits of the four dwords
    ret     0
_sse_sign_mask_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_sign_mask_example(int i[]);

// Fetches the sign bitmask of four integers from the SSE routine and reports,
// one line per element, whether bit n of the mask marks element n as negative.
// Does nothing when CPUID or SSE2 is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE2())
        return 0;
    int i4[] = {4,-3,2,-1};
    const int mask = sse_sign_mask_example(i4);
    const int count = sizeof(i4) / sizeof(i4[0]);
    for(int i = 0; i < count; i++)
    {
        const bool negative = ((mask >> i) & 1) != 0;
        std::cout << i4[i] << " is " << (negative ? "negative" : "positive") << std::endl;
    }
    return 0;
}
MOVMSKPD
— parameters: [register], [xmm] — Extract sign bitmask of packed doubles; sets two bits: 1 for negative and 0 for positive.
PMOVMSKB
— parameters: [register], [xmm] — Extract sign bitmask of packed bytes; sets sixteen bits: 1 for negative and 0 for positive.
Example usage:
TITLE 'extern "C" int __cdecl sse_sign_mask_example(char c16[]);'
; int sse_sign_mask_example(char c16[])
; Returns in eax a 16-bit mask built by PMOVMSKB from the sixteen bytes in c16:
; bit n is the sign bit of c16[n] (1 = negative, 0 = non-negative).
; c16 must be at least 16 bytes.
.686P
.xmm
.model FLAT
PUBLIC _sse_sign_mask_example
_TEXT SEGMENT
_sse_sign_mask_example PROC NEAR
    mov     edx, [esp+4]        ; edx = c16 (first argument)
    movdqu  xmm0, [edx]         ; unaligned 16-byte load
    pmovmskb eax, xmm0          ; eax = sign bits of the sixteen bytes
    ret     0
_sse_sign_mask_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_sign_mask_example(char c16[]);

// Fetches the sign bitmask of sixteen bytes from the SSE2 routine and reports,
// one line per element, whether bit n of the mask marks element n as negative.
// Does nothing when CPUID or SSE2 is not reported as available.
int main(int argc, char *argv[])
{
    if(!isCPUID() || !isSSE2())
        return 0;
    char c16[] = {4,-3,2,-1,10,-20,-16,-32,5,3,11,-12,13,14,15,-16};
    const int mask = sse_sign_mask_example(c16);
    const int count = sizeof(c16);
    for(int i = 0; i < count; i++)
    {
        const bool negative = ((mask >> i) & 1) != 0;
        std::cout << (int)c16[i] << " is " << (negative ? "negative" : "positive") << std::endl;
    }
    return 0;
}
The program output:
4 is positive
-3 is negative
2 is positive
-1 is negative
10 is positive
-20 is negative
-16 is negative
-32 is negative
5 is positive
3 is positive
11 is positive
-12 is negative
13 is positive
14 is positive
15 is positive
-16 is negative
The MXCSR Register
The MXCSR register is a flags register that records exceptions occurring in SSE floating-point operations. Only the first 16 bits are used. The rest of this 32-bit register is reserved.
As has already been discussed, bits 13 and 14 are used for the rounding function.
There are six exceptions that can be generated by SSE instructions. Bits 0 to 5 are used for recording exceptions:
- Bit 0: Invalid Operation - bit 7 is the mask
- Bit 1: Denormal Operation - bit 8 is the mask
- Bit 2: Divide by Zero - bit 9 is the mask
- Bit 3: Overflow - bit 10 is the mask
- Bit 4: Underflow - bit 11 is the mask
- Bit 5: Precision Error - bit 12 is the mask
These six bits stay set after an exception and must be manually cleared. By default, no exception is thrown when one of these conditions occurs. Bits 7 to 12 hold a bitmask for bits 0 to 5 that tells the processor whether it should throw an exception when the condition occurs or just ignore it. The default is to ignore the exception, which is the case when the corresponding mask bit is set to 1. If bit 9 (the divide-by-zero mask) is set to 0 then the program will crash when a divide by zero occurs.
Use btr
(bit reset - sets a bit to 0) instruction, bts
(bit set - sets a bit to 1) instruction, and
instruction and or
instruction to modify the MXCSR register.
; Unmask divide by zero: store MXCSR to memory, clear mask bit 9, load it back.
stmxcsr temp ; temp is a 32-bit variable
btr temp, 9 ; clear the divide-by-zero mask bit so the processor throws an exception on divide by zero
ldmxcsr temp
; Mask divide by zero again: same sequence, but set mask bit 9.
stmxcsr temp ; temp is a 32-bit variable
bts temp, 9 ; set the divide-by-zero mask bit so the processor does NOT throw an exception on divide by zero (the default)
ldmxcsr temp
; Mask all six exceptions at once: 1111110000000b has bits 7 to 12 set.
stmxcsr temp ; temp is a 32-bit variable
or temp, 1111110000000b ; set all six mask bits so the processor never throws an exception (the default)
ldmxcsr temp
Instructions for Comparison
These instructions (comiss
, comisd
, ucomiss
and ucomisd
) set the eflags
(or rflags
) register and can be used for comparing floating-point values and jumping as if they were x86 integer values.
NaN means Not a Number.
ZF | PF | CF | result |
---|---|---|---|
1 | 1 | 1 | NaN |
0 | 0 | 0 | Op1 > Op2 |
0 | 0 | 1 | Op1 < Op2 |
1 | 0 | 0 | Op1 = Op2 |
ZF=Zero Flag PF=Parity Flag CF=Carry Flag |
Instructions jz
and je
may be used, but NaN sets the Zero Flag to 1 (ZF = 1) just like for equal results so be careful.
COMISS
— parameters: [xmm], [xmm/32-bits-memory] — ordered comparison of the first single value.
COMISD
— parameters: [xmm], [xmm/64-bits-memory] — ordered comparison of the first double value.
UCOMISS
— parameters: [xmm], [xmm/32-bits-memory] — unordered comparison of the first single value.
UCOMISD
— parameters: [xmm], [xmm/64-bits-memory] — unordered comparison of the first double value.
SNaN and QNaN
SNaN: Signalling NaN. An exception should be thrown when comparisons or operations are performed with these values. These are NaN values with the most significant bit of the mantissa (bit 22 of single precision and bit 51 of double precision) set to 0. No instruction produces SNaN. Manually move these values into the registers.
QNaN: Quiet NaN. No exception should be thrown. These are NaN values with the most significant bit of the mantissa (bit 22 of single precision and bit 51 of double precision) set to 1.
To Signal or Not to Signal an Exception
The difference between ordered and unordered comparisons is subtle. Several mechanisms together determine whether or not an exception is signaled when an operation is performed on NaN values: bit 7 of the MXCSR register is one mechanism, the values being operated on are another, and the comparison instruction itself is a third.
Results from Operations on NaN
When bit 7 of the MXCSR register is set to 0 and the first operand is a QNaN and the second operand is a QNaN single then COMISS
throws an exception.
When bit 7 of the MXCSR register is set to 0 and the first operand is a QNaN and the second operand is ordered then COMISD
throws an exception.
When bit 7 of the MXCSR register is set to 0 and the first operand is a QNaN and the second operand is a QNaN single then UCOMISS
sets ZF, PF and CF to 1 (ZF = PF = CF = 1).
When bit 7 of the MXCSR register is set to 0 and the first operand is a QNaN and the second operand is ordered then UCOMISD
sets ZF, PF and CF to 1 (ZF = PF = CF = 1).
Instructions for Inserting and Extracting Data
The SSE2 instructions that deal with inserting and extracting WORD values are pinsrw
and pextrw
. Both of these instructions have a third operand.
PINSRW
— [xmm], [32-bit-register/16-bits-memory], [imm8] — insert a WORD value into an SSE register.
PEXTRW
— [register], [xmm], [imm8] — extract a WORD value into a 32- or 64-bit register; note: SSE 4.1 allows extracting directly to a [16-bits-memory] location.
Example of pinsrw
and pextrw
to multiply each WORD value by 2:
TITLE 'extern "C" int __cdecl sse_multiply_words_example(short s8[]);'
.686P
.xmm
.model FLAT
PUBLIC _sse_multiply_words_example
_TEXT SEGMENT
;-----------------------------------------------------------------------
; int __cdecl sse_multiply_words_example(short s8[])
; Doubles each of the eight WORD values in s8, in place, using
; pextrw / pinsrw to move one WORD at a time through eax.
; ABI:   32-bit cdecl (caller cleans the stack, hence "ret 0")
; In:    [esp+4] = s8, pointer to 16 bytes (8 shorts); need not be aligned
; Out:   s8[0..7] are each multiplied by 2 (results wrap at 16 bits);
;        eax is left holding twice the zero-extended original s8[7],
;        which is incidental -- the driver ignores the return value
; Clobb: eax, ecx, xmm0, flags (all volatile under cdecl, so nothing is saved)
;-----------------------------------------------------------------------
_sse_multiply_words_example PROC NEAR
        mov     ecx, [esp+4]            ; ecx = s8 (no pushes, so the argument sits just above the return address)
        movdqu  xmm0, [ecx]             ; unaligned 16-byte integer load of all eight words

        ; The same extract / double / insert sequence is assembled once per word;
        ; "lane" is an assembly-time counter used as the imm8 word index.
        lane = 0
        REPEAT 8
        pextrw  eax, xmm0, lane         ; eax = word[lane], zero-extended
        add     eax, eax                ; multiply by 2; only the low 16 bits are kept below
        pinsrw  xmm0, eax, lane         ; word[lane] = low 16 bits of eax
        lane = lane + 1
        ENDM

        movdqu  [ecx], xmm0             ; write the eight doubled words back to s8
        ret     0
_sse_multiply_words_example ENDP
_TEXT ENDS
END
Here is the driver code:
#include <iostream>
extern "C" int __cdecl isCPUID();
extern "C" int __cdecl isSSE2();
extern "C" int __cdecl sse_multiply_words_example(short s8[]);
// Writes count shorts starting at row to std::cout, each followed by a tab,
// then ends the line.
static void print_row(const short *row, int count)
{
    for(const short *p = row; p != row + count; ++p)
        std::cout << *p << "\t";
    std::cout << std::endl;
}

// Driver: prints eight shorts, passes the array to sse_multiply_words_example,
// then prints the array again.
int main(int argc, char *argv[])
{
    // Nothing to demonstrate unless both CPUID and SSE2 are reported.
    if(!isCPUID() || !isSSE2())
        return 0;

    short words[] = {4,-3,2,-1,8,-7,9,10};
    const int count = sizeof(words) / sizeof(words[0]);

    print_row(words, count);            // values before the call
    sse_multiply_words_example(words);  // return value is not used
    print_row(words, count);            // values after the call
    return 0;
}
The program output:
4 -3 2 -1 8 -7 9 10
8 -6 4 -2 16 -14 18 20