x86-64 Assembly: Tutorial - A Quick Guide to the Changes in 64-bit Assembly - Page 3
Copy Memory Using 64-bit Registers, Copy Memory Using 128-bit Registers.
Copy Memory Using 64-bit Registers
This x64 assembly code copies memory from one place to another using 8-byte registers, then falls back to 1-byte registers for the remainder. For performance reasons, no division operations are performed. Notice that the stack is never touched here.
;-----------------------------------------------------------------------
; char *memcopyx64(char *dest, const char *src, unsigned long long length)
;
; Copies `length` bytes from src to dest: 8 bytes at a time first, then
; the remaining 0-7 bytes one at a time. The stack is never touched.
;
; In:    rcx = dest, rdx = src, r8 = length
;        (register assignment matches the Microsoft x64 convention)
; Out:   rax = dest + length (one past the last byte written)
; Clobb: rcx, rdx, r8, r9, flags
;-----------------------------------------------------------------------
memcopyx64 PROC
    mov     rax, rcx                ; rax = write cursor; frees rcx for the counter
    mov     rcx, r8
    shr     rcx, 3                  ; rcx = length / 8 (whole qwords to copy)
    and     r8, 7                   ; r8  = length % 8 (trailing bytes)
    test    rcx, rcx
    jz      compare_remainder       ; fewer than 8 bytes: skip the qword loop

copy8bytes:                         ; invariant: rcx = qwords remaining, > 0
    mov     r9, QWORD PTR [rdx]     ; load 8 bytes from src
    mov     QWORD PTR [rax], r9     ; store 8 bytes to dest
    add     rax, 8
    add     rdx, 8
    dec     rcx                     ; dec/jnz instead of loopnz: loopnz would also
    jnz     copy8bytes              ; stop if the preceding add happened to set ZF

compare_remainder:
    test    r8, r8
    jz      exit                    ; no trailing bytes: done

copybytes:                          ; invariant: r8 = bytes remaining, > 0
    movzx   r9d, BYTE PTR [rdx]     ; zero-extending load avoids a partial-register write
    mov     BYTE PTR [rax], r9b
    inc     rax
    inc     rdx
    dec     r8
    jnz     copybytes

exit:
    ret                             ; rax = one past the last byte written
memcopyx64 ENDP
The driver source code:
// Assembly routine: copies `length` bytes from src to dest and returns a
// pointer one past the last byte written.
extern "C" char * memcopyx64(char *dest, const char *src, unsigned long long length);

// Driver: appends the source string to `dest` three times in a row.
int main()
{
    char dest[256];
    const char* src = "abcdefghijklmnopqrstuvwxyz!ABCDEFGHIJKLMNOPQRSTUVWXYZ!";

    // Measure the string, then count the terminating NUL as well so that
    // every copy leaves `dest` properly terminated.
    unsigned int len = 0;
    while (src[len])
        len++;
    len++; // add the sentinel to the length

    char* end = dest;
    for (int i = 0; i < 3; i++)
        end = memcopyx64(end, src, len) - 1; // subtract the sentinel so the next copy overwrites it
    return 0;
}
Copy Memory Using 128-bit Registers
This x64 assembly code copies memory from one place to another using 16-byte registers, then falls back to 2-byte registers for the remainder, except for a final odd byte, which is copied on its own. For performance reasons, no division operations are performed, just as above, and the stack is never touched here. This code is a port of the x86 version.
;-----------------------------------------------------------------------
; char *memcopy128(char *dest, const char *src, unsigned long long length)
;
; Copies `length` bytes from src to dest: 16 bytes at a time through xmm0
; (unaligned moves), then the remaining 0-15 bytes as 2-byte words, and
; finally a single odd byte if one is left. The stack is never touched.
;
; In:    rcx = dest, rdx = src, r8 = length
;        (register assignment matches the Microsoft x64 convention)
; Out:   rax = dest + length (one past the last byte written)
; Clobb: rcx, rdx, r8, r9, xmm0, flags
;-----------------------------------------------------------------------
memcopy128 PROC
    mov     rax, rcx                ; rax = write cursor; frees rcx for the counter
    mov     rcx, r8
    shr     rcx, 4                  ; rcx = length / 16 (whole 16-byte blocks)
    and     r8, 15                  ; r8  = length % 16 (trailing bytes)
    test    rcx, rcx
    jz      compare_remainder       ; fewer than 16 bytes: skip the block loop

copy16bytes:                        ; invariant: rcx = blocks remaining, > 0
    movdqu  xmm0, XMMWORD PTR [rdx] ; unaligned 16-byte load from src
    movdqu  XMMWORD PTR [rax], xmm0 ; unaligned 16-byte store to dest
    add     rax, 16
    add     rdx, 16
    dec     rcx                     ; dec/jnz instead of loopnz: loopnz would also
    jnz     copy16bytes             ; stop if the preceding add happened to set ZF

compare_remainder:
    test    r8, r8
    jz      exit                    ; no trailing bytes: done
    mov     rcx, r8
    shr     rcx, 1                  ; rcx = trailing bytes / 2 (whole words)
    test    rcx, rcx
    jz      check_odd_byte          ; only a single byte is left

copy2bytes:                         ; invariant: rcx = words remaining, > 0
    movzx   r9d, WORD PTR [rdx]     ; zero-extending load avoids a partial-register write
    mov     WORD PTR [rax], r9w
    add     rax, 2
    add     rdx, 2
    dec     rcx
    jnz     copy2bytes

check_odd_byte:
    test    r8, 1                   ; low bit of the remainder set => one byte left over
    jz      exit
    movzx   r9d, BYTE PTR [rdx]
    mov     BYTE PTR [rax], r9b
    inc     rax

exit:
    ret                             ; rax = one past the last byte written
memcopy128 ENDP
The driver source code:
// Assembly routine: copies `length` bytes from src to dest and returns a
// pointer one past the last byte written.
extern "C" char * memcopy128(char *dest, const char *src, unsigned long long length);

// Driver: appends the source string to `dest` three times in a row.
int main()
{
    char dest[256];
    const char* src = "abcdefghijklmnopqrstuvwxyz!ABCDEFGHIJKLMNOPQRSTUVWXYZ!";

    // Measure the string, then count the terminating NUL as well so that
    // every copy leaves `dest` properly terminated.
    unsigned int len = 0;
    while (src[len])
        len++;
    len++; // add the sentinel to the length

    char* end = dest;
    for (int i = 0; i < 3; i++)
        end = memcopy128(end, src, len) - 1; // subtract the sentinel so the next copy overwrites it
    return 0;
}