4 files changed, 84 insertions, 56 deletions
diff --git a/src/lock_acquire.asm b/src/lock_acquire.asm
index f32ba6a..8415d7f 100644
--- a/src/lock_acquire.asm
+++ b/src/lock_acquire.asm
@@ -43,12 +43,15 @@ section .text
 ; Returns zero on success, or a standard error code.
 global linen_lock_acquire
 linen_lock_acquire:
+		; It's handy to have a register that's 0 during most of this function
+		xor esi, esi
+
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 		;;;; Check validity of argument ;;;;
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-		; Return EINVAL if rdi is NULL or otherwise invalid
-		mov eax, -22 ; (EINVAL = -22)
+		; Return EINVAL (-22) if rdi is NULL or otherwise invalid
+		lea eax, [rsi - 22] ; mov eax, -22
 
 		test rdi, rdi
 		jz acquire_return ; rdi is NULL
@@ -57,8 +60,7 @@ linen_lock_acquire:
 		; if that assumption is wrong we'll get a segmentation fault.
 		; But we don't yet trust that [rdi] is a valid lock handle!
 		; To verify this we check the canary value stored at [rdi + 8].
-		mov ecx, [rdi + 8]
-		cmp ecx, 0xCAFEBABE
+		cmp dword [rdi + 8], 0xCAFEBABE ; Oh CISC...
 		jnz acquire_return
 
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -70,7 +72,8 @@ linen_lock_acquire:
 		; See: man 2 gettid
 
 		; gettid: rax = system call ID
-		mov eax, SYS_GETTID
+		xor eax, eax
+		mov al, SYS_GETTID
 		; gettid: rax = gettid()
 		syscall
 
@@ -119,7 +122,7 @@ linen_lock_acquire:
 		; so in most real-world cases you can delete this with no downside.
 
 		; Loop counter
-		mov ecx, 10
+		mov sil, 10 ; esi = 10 only if esi is still 0 from function entry (TODO confirm)
 	acquire_spinloop:
 		; The "pause" instruction is specially designed for loops like this
 		; and conserves power. It causes a small delay (makes sense here).
@@ -133,7 +136,7 @@ linen_lock_acquire:
 		jz acquire_success
 
 		; Decrement loop counter until zero
-		dec ecx
+		dec esi
 		jnz acquire_spinloop
 
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -150,24 +153,25 @@ linen_lock_acquire:
 		; futex: rsi = futex_op: which futex operation we want:
 		; - FUTEX_LOCK_PI:      block until lock's owner uses FUTEX_UNLOCK_PI
 		; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process
-		mov esi, (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
+		mov sil, (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) ; NOTE(review): byte write; needs esi bits 8-31 = 0, flags < 256
 		; futex: r10 = timeout: in case we had a deadline (we don't)
 		xor r10, r10
 		; futex: rdx = val:   ignored when FUTEX_LOCK_PI is used
 		; futex: r8 = uaddr2: ignored when FUTEX_LOCK_PI is used
 		; futex: r9 = val3:   ignored when FUTEX_LOCK_PI is used
 		; futex: rax = system call ID
-		mov eax, SYS_FUTEX
+		xor eax, eax
+		mov al, SYS_FUTEX
 		; futex: rax = futex(rdi, rsi, (rdx), r10, (r8), (r9))
 		syscall
  
 		; Sometimes the lock is released after the "lock cmpxchg" instruction
 		; but just before the futex call. In that case, futex returns EAGAIN.
-		cmp rax, -11 ; (-EAGAIN)
+		cmp eax, -11 ; (-EAGAIN)
 		je acquire_futex
 
 		; Any other negative return value means failure
-		test rax, rax
+		test eax, eax
 		jnz acquire_return
 
 		; Indicate that we made a futex call (see below for why)
diff --git a/src/lock_release.asm b/src/lock_release.asm
index f86caa2..2892cc3 100644
--- a/src/lock_release.asm
+++ b/src/lock_release.asm
@@ -43,12 +43,15 @@ section .text
 ; Returns zero on success, or a standard error code.
 global linen_lock_release
 linen_lock_release:
+		; It's handy to have a register that's 0 during most of this function
+		xor esi, esi
+
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 		;;;; Check validity of argument ;;;;
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-		; Return EINVAL if rdi is NULL or invalid
-		mov eax, -22 ; (EINVAL = -22)
+		; Return EINVAL (-22) if rdi is NULL or invalid
+		lea eax, [rsi - 22] ; mov eax, -22
 
 		test rdi, rdi
 		jz release_return ; rdi is NULL
@@ -57,8 +60,7 @@ linen_lock_release:
 		; if that assumption is wrong we'll get a segmentation fault.
 		; But we don't yet trust that [rdi] is a valid lock handle!
 		; To verify this we check the canary value stored at [rdi + 8].
-		mov ecx, [rdi + 8]
-		cmp ecx, 0xCAFEBABE
+		cmp dword [rdi + 8], 0xCAFEBABE ; Oh CISC...
 		jnz release_return
 
 		; Lock owners are identified by their TID; let's find ours.
@@ -66,17 +68,19 @@ linen_lock_release:
 		; See: man 2 gettid
 
 		; gettid: rax = system call ID
-		mov eax, SYS_GETTID
+		xor eax, eax
+		mov al, SYS_GETTID
 		; gettid: rax = gettid()
 		syscall
 
 		; Save a copy of our TID (no need for an error check)
 		mov edx, eax
 
-		; Return EPERM if this lock currently doesn't belong to us
-		mov eax, -1 ; (EPERM = -1)
+		; Return EPERM (-1) if this lock currently doesn't belong to us
+		or eax, -1 ; mov eax, -1
 
-		; Read the futex dword at [rdi] and keep its lowest 30 bits
+		; Read the futex dword at [rdi] and keep its lowest 30 bits.
+		; No atomics needed: if the TID bits are ours, we own the lock and they can't change.
 		mov ecx, [rdi]
 		and ecx, FUTEX_TID_MASK
 		; Those bits contain the owner's TID; it should be our TID
@@ -96,9 +100,8 @@ linen_lock_release:
 		mov eax, edx
 
 		; Atomically try to set the dword at [rdi] to 0 if it was equal to our TID.
-		; if ([rdi] == eax]) { [rdi] = 0; goto release_success; } else { eax = [rdi]; }
-		xor ecx, ecx
-		lock cmpxchg [rdi], ecx
+		; if ([rdi] == eax) { [rdi] = 0; goto release_success; } else { eax = [rdi]; }
+		lock cmpxchg [rdi], esi ; esi = 0
 		je release_success
 
 		; We failed because [rdi] wasn't equal to our TID. In theory,
@@ -111,18 +114,19 @@ linen_lock_release:
 		; futex: rsi = futex_op: which futex operation we want:
 		; - FUTEX_UNLOCK_PI:    wake up one thread sleeping via FUTEX_LOCK_PI
 		; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process
-		mov esi, (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) ; futex: futex_op
+		mov sil, (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) ; byte write: esi was the 0 used by cmpxchg; flags must be < 256
 		; futex: rdx = val:     ignored when FUTEX_UNLOCK_PI is used
 		; futex: r10 = timeout: ignored when FUTEX_UNLOCK_PI is used
 		; futex: r8 = uaddr2:   ignored when FUTEX_UNLOCK_PI is used
 		; futex: r9 = val3:     ignored when FUTEX_UNLOCK_PI is used
 		; futex: rax = system call ID
-		mov eax, SYS_FUTEX
+		xor eax, eax
+		mov al, SYS_FUTEX
 		; futex: rax = futex(rdi, rsi, (rdx), (r10), (r8), (r9))
 		syscall
 
 		; Check result of futex: nonzero means failure
-		test rax, rax
+		test eax, eax
 		jnz release_return
 
 	release_success:
diff --git a/src/thread_create.asm b/src/thread_create.asm
index 9a6fe78..8dc8813 100644
--- a/src/thread_create.asm
+++ b/src/thread_create.asm
@@ -72,12 +72,16 @@ linen_thread_create:
 		; Callee-save registers
 		push rbx
 
+		; It's handy to have a register that's 0 for a while
+		xor ecx, ecx
+
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 		;;;; Check validity of arguments ;;;;
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-		; Return EINVAL if any argument is NULL
-		mov eax, -22 ; (EINVAL = -22)
+		; Return EINVAL (-22) if any argument is NULL
+		lea eax, [rcx - 22] ; mov eax, -22
+
 		test rdi, rdi
 		jz create_return ; Nowhere to store the thread handle
 		test rsi, rsi
@@ -91,9 +95,9 @@ linen_thread_create:
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 		; Save these registers: we'll clobber them for the mmap call
-		mov rbx, rdi
-		push rdx
-		push rsi
+		push rdx ; argument
+		push rsi ; function
+		push rdi ; thread handle destination
 
 		; The mmap system call does many things, in this case allocate memory.
 		; See: man 2 mmap
@@ -103,25 +107,26 @@ linen_thread_create:
 		; mmap: rsi = length: size of buffer to allocate
 		mov esi, (STACK_SIZE + GUARD_PAGE)
 		; mmap: rdx = prot: mprotect-style access permissions
-		mov edx, (PROT_WRITE | PROT_READ)
+		lea edx, [rcx + 3] ; mov edx, (PROT_READ | PROT_WRITE)
 		; mmap: r10 = flags: configuration flags for mapping:
 		; - MAP_ANONYMOUS: there is no file backing this buffer
 		; - MAP_PRIVATE:   only this process can see thread's stack
 		; - MAP_STACK:     no-op; inform kernel that this is a stack
-		mov r10, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
+		mov r10d, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
 		; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1
-		mov r8, -1
+		lea r8, [rcx - 1] ; mov r8, -1
 		; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used
 		xor r9, r9
 		; mmap: rax = system call ID
-		mov eax, SYS_MMAP
-		; mmap: rax = mmap(rdi, rsi, rdx, r10, r8, 9)
+		lea eax, [rcx + SYS_MMAP] ; mov eax, SYS_MMAP
+		; mmap: rax = mmap(rdi, rsi, rdx, r10, (r8), (r9))
 		syscall
 
 		; Pop these now before we start branching. Those registers
 		; won't be used by the next system calls, so they're safe.
-		pop r8 ; function
-		pop r9 ; argument
+		pop rbx ; thread handle destination
+		pop r8  ; function
+		pop r9  ; argument
 
 		; Check result of mmap: negative means failure,
 		; otherwise rax is the address of the new mapping.
@@ -145,12 +150,13 @@ linen_thread_create:
 		; mprotect: rdx = prot: access permissions; zero for none
 		xor edx, edx
 		; mprotect: rax = system call ID
-		mov eax, SYS_MPROTECT
+		xor eax, eax
+		mov al, SYS_MPROTECT
 		; mprotect: rax = mprotect(rdi, rsi, rdx)
 		syscall
 
 		; Check result of mprotect: nonzero means failure
-		test rax, rax
+		test eax, eax
 		jnz create_return
 
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -162,7 +168,7 @@ linen_thread_create:
 		; once in the parent (0 if success) and once in the child (the TID).
 		; See: man 2 clone
 
-		; clone: rsi = stack
+		; clone: rsi = stack: pointer for child's initial rsp
 		; Currently rdi points to the lowest byte of the stack area.
 		; Again, stacks grow downward, so we calculate the address of
 		; the top qword to use as the child thread's starting point.
@@ -194,14 +200,14 @@ linen_thread_create:
 		; clone: r8 = tls: ignored unless CLONE_SETTLS is used
 
 		; clone: rax = system call ID
-		mov eax, SYS_CLONE
+		mov al, SYS_CLONE ; NOTE(review): no xor; relies on rax still being 0 since the mprotect check
 		; clone: rax = clone(rdi, rsi, (rdx), r10, (r8));
 		syscall
 
 		; Ideally, both parent and new-born child are executing this code now.
 
 		; Check result of clone:
-		test rax, rax
+		test eax, eax
 		js create_return   ; Negative means failure
 		jnz create_success ; Positive means we're in the parent thread
 		; Zero means we're in the child thread
@@ -220,10 +226,16 @@ linen_thread_create:
 		; Once done, leave function's return value lying around
 		push rax
 
-		; Exit the thread with return value 0
+		; Exit the thread with status 0 using the exit system call.
+		; See: man 2 exit
+
+		; exit: rdi = status to report
 		xor edi, edi
-		mov rax, SYS_EXIT
-		syscall ; (never returns)
+		; exit: rax = system call ID
+		xor eax, eax
+		mov al, SYS_EXIT
+		; exit: call never returns
+		syscall
 
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 		;;;; Clean up in parent thread ;;;;
diff --git a/src/thread_finish.asm b/src/thread_finish.asm
index 860b0a4..c453d40 100644
--- a/src/thread_finish.asm
+++ b/src/thread_finish.asm
@@ -43,12 +43,15 @@ section .text
 ; Returns zero on success, or a standard error code.
 global linen_thread_finish
 linen_thread_finish:
+		; It's handy to have a register that's 0 for a while
+		xor ecx, ecx
+
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 		;;;; Check validity of arguments ;;;;
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-		; Return EINVAL if rdi is NULL or otherwise invalid
-		mov eax, -22 ; (EINVAL = -22)
+		; Return EINVAL (-22) if rdi is NULL or otherwise invalid
+		lea eax, [rcx - 22] ; mov eax, -22
 
 		test rdi, rdi
 		jz finish_return ; rdi is NULL
@@ -57,8 +60,7 @@ linen_thread_finish:
 		; if that assumption is wrong we'll get a segmentation fault.
 		; But we don't yet trust that [rdi] is a valid thread handle!
 		; To verify this we check the canary value stored at [rdi + 4].
-		mov ecx, [rdi + 4]
-		cmp ecx, 0xDEADBEEF
+		cmp dword [rdi + 4], 0xDEADBEEF ; Oh CISC...
 		jnz finish_return
 
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -86,9 +88,9 @@ linen_thread_finish:
 
 		; futex: rdi = uaddr: address of the dword to watch
 		; futex: rsi = futex_op: which futex operation we want:
-		; - FUTEX_WAIT:         block until the value at [rdi] changes
+		; - FUTEX_WAIT = 0:     block until the value at [rdi] changes
 		; - FUTEX_PRIVATE_FLAG: FIXME waits forever, I don't understand why
-		mov esi, FUTEX_WAIT
+		xor esi, esi ; mov esi, FUTEX_WAIT
 		; futex: rdx = val: the expected value at [rdi] before it changes
 		mov edx, eax
 		; futex: r10 = timeout: in case we had a deadline (we don't)
@@ -96,17 +98,18 @@ linen_thread_finish:
 		; futex: r8 = uaddr2: ignored when FUTEX_WAIT is used
 		; futex: r9 = val3:   ignored when FUTEX_WAIT is used
 		; futex: rax = system call ID
-		mov eax, SYS_FUTEX
+		xor eax, eax
+		mov al, SYS_FUTEX
 		; futex: rax = futex(rdi, rsi, rdx, r10, (r8), (r9))
 		syscall
 
 		; Sometimes the thread exits after the "lock cmpxchg" instruction
 		; but before the futex call. In that case, futex returns EAGAIN.
-		cmp rax, -11 ; (EAGAIN = -11)
+		cmp eax, -11 ; (-EAGAIN)
 		je finish_retry
 
 		; Any other nonzero return value means failure
-		test rax, rax
+		test eax, eax
 		jnz finish_return
 
 		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -117,6 +120,11 @@ linen_thread_finish:
 		; The thread left its function return value on the stack, read it
 		mov rdx, [rdi - 8]
 
+		; Remove the canary value so the thread handle becomes invalid. We're about
+		; to deallocate this memory anyway, so it's optional, but maybe the address
+		; becomes valid again later in the program's life, who knows? Play it safe.
+		mov [rdi + 4], eax ; eax = 0 for all paths
+
 		; The munmap system call destroys mappings created by mmap.
 		; In this case that means deallocating the stack buffer.
 		; See: man 2 munmap
@@ -127,12 +135,12 @@ linen_thread_finish:
 		; munmap: rsi = length: size of region starting from rdi
 		mov esi, (STACK_SIZE + GUARD_PAGE)
 		; munmap: rax = system call ID
-		mov eax, SYS_MUNMAP
+		mov al, SYS_MUNMAP ; no xor: relies on rax = 0 here, as at the canary store (TODO confirm)
 		; munmap: rax = munmap(rdi, rsi)
 		syscall
 
 		; Check result of munmap: nonzero means failure
-		test rax, rax
+		test eax, eax
 		jnz finish_return
 
 		; Check if caller gave a location (r8) to save the return value (rdx)