From 94e3e3652f9f04810126ee754fa9a788289e2897 Mon Sep 17 00:00:00 2001 From: Prefetch Date: Mon, 24 Jul 2023 19:21:30 +0200 Subject: Reduce total code size by 53 bytes (big deal, right?) --- src/lock_acquire.asm | 26 +++++++++++++++----------- src/lock_release.asm | 32 +++++++++++++++++-------------- src/thread_create.asm | 52 +++++++++++++++++++++++++++++++-------------------- src/thread_finish.asm | 30 ++++++++++++++++++----------- 4 files changed, 84 insertions(+), 56 deletions(-) (limited to 'src') diff --git a/src/lock_acquire.asm b/src/lock_acquire.asm index f32ba6a..8415d7f 100644 --- a/src/lock_acquire.asm +++ b/src/lock_acquire.asm @@ -43,12 +43,15 @@ section .text ; Returns zero on success, or a standard error code. global linen_lock_acquire linen_lock_acquire: + ; It's handy to have a register that's 0 during most of this function + xor esi, esi + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Check validity of argument ;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Return EINVAL if rdi is NULL or otherwise invalid - mov eax, -22 ; (EINVAL = -22) + ; Return EINVAL (-22) if rdi is NULL or otherwise invalid + lea eax, [rsi - 22] ; mov eax, -22 test rdi, rdi jz acquire_return ; rdi is NULL @@ -57,8 +60,7 @@ linen_lock_acquire: ; if that assumption is wrong we'll get a segmentation fault. ; But we don't yet trust that [rdi] is a valid lock handle! ; To verify this we check the canary value stored at [rdi + 8]. - mov ecx, [rdi + 8] - cmp ecx, 0xCAFEBABE + cmp dword [rdi + 8], 0xCAFEBABE ; Oh CISC... jnz acquire_return ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -70,7 +72,8 @@ linen_lock_acquire: ; See: man 2 gettid ; gettid: rax = system call ID - mov eax, SYS_GETTID + xor eax, eax + mov al, SYS_GETTID ; gettid: rax = gettid() syscall @@ -119,7 +122,7 @@ linen_lock_acquire: ; so in most real-world cases you can delete this with no downside. ; Loop counter - mov ecx, 10 + mov sil, 10 acquire_spinloop: ; The "pause" instruction is specially designed for loops like this ; and conserves power. It causes a small delay (makes sense here). @@ -133,7 +136,7 @@ linen_lock_acquire: jz acquire_success ; Decrement loop counter until zero - dec ecx + dec esi jnz acquire_spinloop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -150,24 +153,25 @@ linen_lock_acquire: ; futex: rsi = futex_op: which futex operation we want: ; - FUTEX_LOCK_PI: block until lock's owner uses FUTEX_UNLOCK_PI ; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process - mov esi, (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) + mov sil, (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) ; futex: r10 = timeout: in case we had a deadline (we don't) xor r10, r10 ; futex: rdx = val: ignored when FUTEX_LOCK_PI is used ; futex: r8 = uaddr2: ignored when FUTEX_LOCK_PI is used ; futex: r9 = val3: ignored when FUTEX_LOCK_PI is used ; futex: rax = system call ID - mov eax, SYS_FUTEX + xor eax, eax + mov al, SYS_FUTEX ; futex: rax = futex(rdi, rsi, (rdx), r10, (r8), (r9)) syscall ; Sometimes the lock is released after the "lock cmpxchg" instruction ; but just before the futex call. In that case, futex returns EAGAIN. - cmp rax, -11 ; (-EAGAIN) + cmp eax, -11 ; (-EAGAIN) je acquire_futex ; Any other negative return value means failure - test rax, rax + test eax, eax jnz acquire_return ; Indicate that we made a futex call (see below for why) diff --git a/src/lock_release.asm b/src/lock_release.asm index f86caa2..2892cc3 100644 --- a/src/lock_release.asm +++ b/src/lock_release.asm @@ -43,12 +43,15 @@ section .text ; Returns zero on success, or a standard error code. global linen_lock_release linen_lock_release: + ; It's handy to have a register that's 0 during most of this function + xor esi, esi + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Check validity of argument ;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Return EINVAL if rdi is NULL or invalid - mov eax, -22 ; (EINVAL = -22) + ; Return EINVAL (-22) if rdi is NULL or invalid + lea eax, [rsi - 22] ; mov eax, -22 test rdi, rdi jz release_return ; rdi is NULL @@ -57,8 +60,7 @@ linen_lock_release: ; if that assumption is wrong we'll get a segmentation fault. ; But we don't yet trust that [rdi] is a valid lock handle! ; To verify this we check the canary value stored at [rdi + 8]. - mov ecx, [rdi + 8] - cmp ecx, 0xCAFEBABE + cmp dword [rdi + 8], 0xCAFEBABE ; Oh CISC... jnz release_return ; Lock owners are identified by their TID; let's find ours. @@ -66,17 +68,19 @@ linen_lock_release: ; See: man 2 gettid ; gettid: rax = system call ID - mov eax, SYS_GETTID + xor eax, eax + mov al, SYS_GETTID ; gettid: rax = gettid() syscall ; Save a copy of our TID (no need for an error check) mov edx, eax - ; Return EPERM if this lock currently doesn't belong to us - mov eax, -1 ; (EPERM = -1) + ; Return EPERM (-1) if this lock currently doesn't belong to us + or eax, -1 ; mov eax, -1 - ; Read the futex dword at [rdi] and keep its lowest 30 bits + ; Read the futex dword at [rdi] and keep its lowest 30 bits. + ; No need to use atomics, since we currently own this lock. mov ecx, [rdi] and ecx, FUTEX_TID_MASK ; Those bits contain the owner's TID; it should be our TID @@ -96,9 +100,8 @@ linen_lock_release: mov eax, edx ; Atomically try to set the dword at [rdi] to 0 if it was equal to our TID. - ; if ([rdi] == eax]) { [rdi] = 0; goto release_success; } else { eax = [rdi]; } - xor ecx, ecx - lock cmpxchg [rdi], ecx + ; if ([rdi] == eax) { [rdi] = 0; goto release_success; } else { eax = [rdi]; } + lock cmpxchg [rdi], esi ; esi = 0 je release_success ; We failed because [rdi] wasn't equal to our TID. In theory, @@ -111,18 +114,19 @@ linen_lock_release: ; futex: rsi = futex_op: which futex operation we want: ; - FUTEX_UNLOCK_PI: wake up one thread sleeping via FUTEX_LOCK_PI ; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process - mov esi, (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) ; futex: futex_op + mov sil, (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) ; futex: rdx = val: ignored when FUTEX_UNLOCK_PI is used ; futex: r10 = timeout: ignored when FUTEX_UNLOCK_PI is used ; futex: r8 = uaddr2: ignored when FUTEX_UNLOCK_PI is used ; futex: r9 = val3: ignored when FUTEX_UNLOCK_PI is used ; futex: rax = system call ID - mov eax, SYS_FUTEX + xor eax, eax + mov al, SYS_FUTEX ; futex: rax = futex(rdi, rsi, (rdx), (r10), (r8), (r9)) syscall ; Check result of futex: nonzero means failure - test rax, rax + test eax, eax jnz release_return release_success: diff --git a/src/thread_create.asm b/src/thread_create.asm index 9a6fe78..8dc8813 100644 --- a/src/thread_create.asm +++ b/src/thread_create.asm @@ -72,12 +72,16 @@ linen_thread_create: ; Callee-save registers push rbx + ; It's handy to have a register that's 0 for a while + xor ecx, ecx + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Check validity of arguments ;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Return EINVAL if any argument is NULL - mov eax, -22 ; (EINVAL = -22) + ; Return EINVAL (-22) if any argument is NULL + lea eax, [rcx - 22] ; mov eax, -22 + test rdi, rdi jz create_return ; Nowhere to store the thread handle test rsi, rsi @@ -91,9 +95,9 @@ linen_thread_create: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Save these registers: we'll clobber them for the mmap call - mov rbx, rdi - push rdx - push rsi + push rdx ; argument + push rsi ; function + push rdi ; thread handle destination ; The mmap system call does many things, in this case allocate memory. ; See: man 2 mmap @@ -103,25 +107,26 @@ linen_thread_create: ; mmap: rsi = length: size of buffer to allocate mov esi, (STACK_SIZE + GUARD_PAGE) ; mmap: rdx = prot: mprotect-style access permissions - mov edx, (PROT_WRITE | PROT_READ) + lea edx, [rcx + 3] ; mov edx, (PROT_READ | PROT_WRITE) ; mmap: r10 = flags: configuration flags for mapping: ; - MAP_ANONYMOUS: there is no file backing this buffer ; - MAP_PRIVATE: only this process can see thread's stack ; - MAP_STACK: no-op; inform kernel that this is a stack - mov r10, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK) + mov r10d, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK) ; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1 - mov r8, -1 + lea r8, [rcx - 1] ; mov r8, -1 ; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used xor r9, r9 ; mmap: rax = system call ID - mov eax, SYS_MMAP - ; mmap: rax = mmap(rdi, rsi, rdx, r10, r8, 9) + lea eax, [rcx + SYS_MMAP] ; mov eax, SYS_MMAP + ; mmap: rax = mmap(rdi, rsi, rdx, r10, (r8), (r9)) syscall ; Pop these now before we start branching. Those registers ; won't be used by the next system calls, so they're safe. - pop r8 ; function - pop r9 ; argument + pop rbx ; thread handle destination + pop r8 ; function + pop r9 ; argument ; Check result of mmap: negative means failure, ; otherwise rax is the address of the new mapping. @@ -145,12 +150,13 @@ linen_thread_create: ; mprotect: rdx = prot: access permissions; zero for none xor edx, edx ; mprotect: rax = system call ID - mov eax, SYS_MPROTECT + xor eax, eax + mov al, SYS_MPROTECT ; mprotect: rax = mprotect(rdi, rsi, rdx) syscall ; Check result of mprotect: nonzero means failure - test rax, rax + test eax, eax jnz create_return ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -162,7 +168,7 @@ linen_thread_create: ; once in the parent (0 if success) and once in the child (the TID). ; See: man 2 clone - ; clone: rsi = stack + ; clone: rsi = stack: pointer for child's initial rsp ; Currently rdi points to the lowest byte of the stack area. ; Again, stacks grow downward, so we calculate the address of ; the top qword to use as the child thread's starting point. @@ -194,14 +200,14 @@ linen_thread_create: ; clone: r8 = tls: ignored unless CLONE_SETTLS is used ; clone: rax = system call ID - mov eax, SYS_CLONE + mov al, SYS_CLONE ; clone: rax = clone(rdi, rsi, (rdx), r10, (r8)); syscall ; Ideally, both parent and new-born child are executing this code now. ; Check result of clone: - test rax, rax + test eax, eax js create_return ; Negative means failure jnz create_success ; Positive means we're in the parent thread ; Zero means we're in the child thread @@ -220,10 +226,16 @@ linen_thread_create: ; Once done, leave function's return value lying around push rax - ; Exit the thread with return value 0 + ; Exit the thread with status 0 using the exit system call. + ; See: man 2 exit + + ; exit: rdi = status to report xor edi, edi - mov rax, SYS_EXIT - syscall ; (never returns) + ; exit: rax = system call ID + xor eax, eax + mov al, SYS_EXIT + ; exit: call never returns + syscall ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Clean up in parent thread ;;;; diff --git a/src/thread_finish.asm b/src/thread_finish.asm index 860b0a4..c453d40 100644 --- a/src/thread_finish.asm +++ b/src/thread_finish.asm @@ -43,12 +43,15 @@ section .text ; Returns zero on success, or a standard error code. global linen_thread_finish linen_thread_finish: + ; It's handy to have a register that's 0 for a while + xor ecx, ecx + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Check validity of arguments ;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Return EINVAL if rdi is NULL or otherwise invalid - mov eax, -22 ; (EINVAL = -22) + ; Return EINVAL (-22) if rdi is NULL or otherwise invalid + lea eax, [rcx - 22] ; mov eax, -22 test rdi, rdi jz finish_return ; rdi is NULL @@ -57,8 +60,7 @@ linen_thread_finish: ; if that assumption is wrong we'll get a segmentation fault. ; But we don't yet trust that [rdi] is a valid thread handle! ; To verify this we check the canary value stored at [rdi + 4]. - mov ecx, [rdi + 4] - cmp ecx, 0xDEADBEEF + cmp dword [rdi + 4], 0xDEADBEEF ; Oh CISC... jnz finish_return ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -86,9 +88,9 @@ linen_thread_finish: ; futex: rdi = uaddr: address of the dword to watch ; futex: rsi = futex_op: which futex operation we want: - ; - FUTEX_WAIT: block until the value at [rdi] changes + ; - FUTEX_WAIT = 0: block until the value at [rdi] changes ; - FUTEX_PRIVATE_FLAG: FIXME waits forever, I don't understand why - mov esi, FUTEX_WAIT + xor esi, esi ; mov esi, FUTEX_WAIT ; futex: rdx = val: the expected value at [rdi] before it changes mov edx, eax ; futex: r10 = timeout: in case we had a deadline (we don't) @@ -96,17 +98,18 @@ linen_thread_finish: ; futex: r8 = uaddr2: ignored when FUTEX_WAIT is used ; futex: r9 = val3: ignored when FUTEX_WAIT is used ; futex: rax = system call ID - mov eax, SYS_FUTEX + xor eax, eax + mov al, SYS_FUTEX ; futex: rax = futex(rdi, rsi, rdx, r10, (r8), (r9)) syscall ; Sometimes the thread exits after the "lock cmpxchg" instruction ; but before the futex call. In that case, futex returns EAGAIN. - cmp rax, -11 ; (EAGAIN = -11) + cmp eax, -11 ; (EAGAIN = -11) je finish_retry ; Any other nonzero return value means failure - test rax, rax + test eax, eax jnz finish_return ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -117,6 +120,11 @@ linen_thread_finish: ; The thread left its function return value on the stack, read it mov rdx, [rdi - 8] + ; Remove the canary value so the thread handle becomes invalid. We're about + ; to deallocate this memory anyway, so it's optional, but maybe the address + ; becomes valid again later in the program's life, who knows? Play it safe. + mov [rdi + 4], eax ; eax = 0 for all paths + ; The munmap system call destroys mappings created by mmap. ; In this case that means deallocating the stack buffer. ; See: man 2 munmap @@ -127,12 +135,12 @@ linen_thread_finish: ; munmap: rsi = length: size of region starting from rdi mov esi, (STACK_SIZE + GUARD_PAGE) ; munmap: rax = system call ID - mov eax, SYS_MUNMAP + mov al, SYS_MUNMAP ; munmap: rax = munmap(rdi, rsi) syscall ; Check result of munmap: nonzero means failure - test rax, rax + test eax, eax jnz finish_return ; Check if caller gave a location (r8) to save the return value (rdx) -- cgit v1.2.3