From a211da8cfe9b0565881537cc81b09ae55c722111 Mon Sep 17 00:00:00 2001 From: Prefetch Date: Mon, 24 Jul 2023 16:23:27 +0200 Subject: Rename lib/ to src/ (better for Tab-completion) --- .gitignore | 2 +- Makefile | 6 +- lib/lock_acquire.asm | 204 -------------------------------------- lib/lock_release.asm | 132 ------------------------- lib/thread_create.asm | 264 -------------------------------------------------- lib/thread_finish.asm | 147 ---------------------------- src/lock_acquire.asm | 204 ++++++++++++++++++++++++++++++++++++++ src/lock_release.asm | 132 +++++++++++++++++++++++++ src/thread_create.asm | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/thread_finish.asm | 147 ++++++++++++++++++++++++++++ 10 files changed, 751 insertions(+), 751 deletions(-) delete mode 100644 lib/lock_acquire.asm delete mode 100644 lib/lock_release.asm delete mode 100644 lib/thread_create.asm delete mode 100644 lib/thread_finish.asm create mode 100644 src/lock_acquire.asm create mode 100644 src/lock_release.asm create mode 100644 src/thread_create.asm create mode 100644 src/thread_finish.asm diff --git a/.gitignore b/.gitignore index fab3e1d..422c49d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -/lib/*.o +/src/*.o /liblinen.so /tests/*.run diff --git a/Makefile b/Makefile index 33fe35b..6b1b0a5 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ -liblinen.so: lib/thread_create.o lib/thread_finish.o lib/lock_acquire.o lib/lock_release.o +liblinen.so: src/thread_create.o src/thread_finish.o src/lock_acquire.o src/lock_release.o gcc -shared -o $@ $^ -lib/%.o: lib/%.asm +src/%.o: src/%.asm nasm -f elf64 $< tests/%.run: tests/%.c liblinen.so linen.h @@ -14,4 +14,4 @@ tests: tests/test01.run clean: rm -f tests/*.run rm -f liblinen.so - rm -f lib/*.o + rm -f src/*.o diff --git a/lib/lock_acquire.asm b/lib/lock_acquire.asm deleted file mode 100644 index f32ba6a..0000000 --- a/lib/lock_acquire.asm +++ /dev/null @@ -1,204 +0,0 @@ -; Under MIT license, see 
/LICENSE.txt - - -; Cheat sheet for Linux' x86_64 calling convention: -; -; - free to overwrite (caller should save them): -; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15 -; - caller expects be kept (callee should save them): -; rbx, rbp, r12-r15 -; -; - for passing paramters to functions: -; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7 -; - for getting return values from functions: -; rax, rdx, xmm0 -; -; - for passing parameters to syscalls: -; rax, rdi, rsi, rdx, r10, r8, r9 -; - for getting return values from syscalls: -; rax, rdx -; - overwritten by syscalls (all others preserved): -; rcx, r11 - - -section .text - - -; Relevant system call IDs -%define SYS_GETTID 186 -%define SYS_FUTEX 202 - -; Relevant operations for futex -%define FUTEX_LOCK_PI 6 -%define FUTEX_PRIVATE_FLAG 0x80 - -; Relevant bits for futex dword -%define FUTEX_TID_MASK 0x3fffffff -%define FUTEX_OWNER_DIED 0x40000000 -%define FUTEX_WAITERS 0x80000000 - - -; Acquire a lock if possible, or wait until it gets released. Argument: -; rdi: struct{u32,u32,u32}* = handle of lock to acquire -; Returns zero on success, or a standard error code. -global linen_lock_acquire -linen_lock_acquire: - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Check validity of argument ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Return EINVAL if rdi is NULL or otherwise invalid - mov eax, -22 ; (EINVAL = -22) - - test rdi, rdi - jz acquire_return ; rdi is NULL - - ; rdi is nonzero, so let's just assume it's a valid pointer; - ; if that assumption is wrong we'll get a segmentation fault. - ; But we don't yet trust that [rdi] is a valid lock handle! - ; To verify this we check the canary value stored at [rdi + 8]. - mov ecx, [rdi + 8] - cmp ecx, 0xCAFEBABE - jnz acquire_return - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Check ownership of lock ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Lock owners are identified by their TID; let's find ours. - ; The gettid system call simply returns our Linux thread ID. 
- ; See: man 2 gettid - - ; gettid: rax = system call ID - mov eax, SYS_GETTID - ; gettid: rax = gettid() - syscall - - ; Save a copy of our TID (no need for an error check) - mov edx, eax - - ; There are four possible ownership situations for the lock, - ; which we can distinguish based on the dword value at [rdi]: - ; - Case 1: if [rdi] contains zero, then the lock is available. - ; - Case 2: if [rdi] has any of its highest 2 bits set, then the - ; lock isn't free, and kernel intervention is required. - ; - Case 3: if the lower 30 bits of [rdi] contain our TID, - ; then we already own it (recursive acquisition). - ; - Case 4: if the lower 30 bits of [rdi] contain another TID - ; and the high-bit flags aren't set, then we just wait - ; until we can acquire the lock using atomic operations - ; or, optionally, a futex call (usually more efficient). - - ; Atomically check whether the lock is owned by another thread, - ; and if not, try to take ownership by writing our TID to [rdi]. - ; if ([rdi] == 0) { [rdi] = edx; goto acquire_success; } else { eax = [rdi]; } - xor eax, eax - lock cmpxchg [rdi], edx - jz acquire_success ; case 1 - - ; The lock isn't free, so let's check how "clean" its state is. - ; The following flags are set by the kernel (see futex below): - ; - FUTEX_OWNER_DIED: the lock's owner died, so it's actually free - ; (but first the kernel needs to clean up) - ; - FUTEX_WAITERS: we aren't the only one waiting for this lock - ; (so let's sleep until the kernel wakes us up) - ; Either way, we need the kernel's help, so jump to the futex call. - test eax, (FUTEX_OWNER_DIED | FUTEX_WAITERS) - jnz acquire_futex ; case 2 - - ; It seems someone has the lock, check who: it may already be us. - ; If so, this is a recursive acquisition, good, let's continue. - and eax, FUTEX_TID_MASK - cmp eax, edx - je acquire_success ; case 3 - - ; Someone else has the lock, but we're the only one waiting for it. 
- ; System calls are expensive, so let's try a short spin loop first, - ; hoping it'll get released soon. This is arguably unnecessary, as - ; it's only beneficial when two threads are more or less "in sync", - ; so in most real-world cases you can delete this with no downside. - - ; Loop counter - mov ecx, 10 - acquire_spinloop: - ; The "pause" instruction is specially designed for loops like this - ; and conserves power. It causes a small delay (makes sense here). - pause - - ; Atomically check whether the lock is owned by another thread, - ; and if not, try to take ownership by writing our TID to [rdi]. - ; if ([rdi] == 0) { [rdi] = edx; goto acquire_success; } else { eax = [rdi]; } - xor eax, eax - lock cmpxchg [rdi], edx - jz acquire_success - - ; Decrement loop counter until zero - dec ecx - jnz acquire_spinloop - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Let the kernel handle it ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - acquire_futex: - ; The futex system call waits for the dword at an address (rdi) - ; changes in a certain way, as described above and in the futex - ; manual's section on so-called "priority-inheritance futexes". - ; See: man 2 futex - - ; futex: rdi = uaddr: address of the dword to watch - ; futex: rsi = futex_op: which futex operation we want: - ; - FUTEX_LOCK_PI: block until lock's owner uses FUTEX_UNLOCK_PI - ; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process - mov esi, (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) - ; futex: r10 = timeout: in case we had a deadline (we don't) - xor r10, r10 - ; futex: rdx = val: ignored when FUTEX_LOCK_PI is used - ; futex: r8 = uaddr2: ignored when FUTEX_LOCK_PI is used - ; futex: r9 = val3: ignored when FUTEX_LOCK_PI is used - ; futex: rax = system call ID - mov eax, SYS_FUTEX - ; futex: rax = futex(rdi, rsi, (rdx), r10, (r8), (r9)) - syscall - - ; Sometimes the lock is released after the "lock cmpxchg" instruction - ; but just before the futex call. In that case, futex returns EAGAIN. 
- cmp rax, -11 ; (-EAGAIN) - je acquire_futex - - ; Any other negative return value means failure - test rax, rax - jnz acquire_return - - ; Indicate that we made a futex call (see below for why) - xor edx, edx - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Update the recursion counter ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - acquire_success: - ; Read the recursion counter (we have the lock: no need for atomics) - mov ecx, [rdi + 4] - - ; The value in edx depends on how we came to the acquire_success label: - ; 1) We jumped here after a successful "lock cmpxchg": edx has our TID - ; 2) We finished a successful futex call: edx was set to 0 (see above) - test edx, edx - ; Why do we care? Well, in the latter case, the futex call may have been - ; necessary because there was a problem (i.e. FUTEX_OWNER_DIED was set), - ; in which case the recursion counter is stale and hence must be reset. - ; In any other case, whoever released the lock should've reset it already. - cmovz ecx, edx ; ecx = 0 - - ; Increment the recursion counter and write it back to memory - ; (if the lock is being used non-recursively, it should be 1) - inc ecx - mov [rdi + 4], ecx - - ; Lock acquisition was successful, so we'll return 0. In most cases - ; eax is already 0; we only need this if the recursion counter > 1. 
- xor eax, eax - - acquire_return: - ret diff --git a/lib/lock_release.asm b/lib/lock_release.asm deleted file mode 100644 index f86caa2..0000000 --- a/lib/lock_release.asm +++ /dev/null @@ -1,132 +0,0 @@ -; Under MIT license, see /LICENSE.txt - - -; Cheat sheet for Linux' x86_64 calling convention: -; -; - free to overwrite (caller should save them): -; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15 -; - caller expects be kept (callee should save them): -; rbx, rbp, r12-r15 -; -; - for passing paramters to functions: -; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7 -; - for getting return values from functions: -; rax, rdx, xmm0 -; -; - for passing parameters to syscalls: -; rax, rdi, rsi, rdx, r10, r8, r9 -; - for getting return values from syscalls: -; rax, rdx -; - overwritten by syscalls (all others preserved): -; rcx, r11 - - -section .text - - -; Relevant system call IDs -%define SYS_GETTID 186 -%define SYS_FUTEX 202 - -; Relevant operations for futex -%define FUTEX_UNLOCK_PI 7 -%define FUTEX_PRIVATE_FLAG 0x80 - -; Relevant bits for futex dword -%define FUTEX_TID_MASK 0x3fffffff -%define FUTEX_OWNER_DIED 0x40000000 -%define FUTEX_WAITERS 0x80000000 - - -; Release an acquired lock if we're who acquired it. Argument: -; rdi: struct{u32,u32,u32}* = handle of lock to release -; Returns zero on success, or a standard error code. -global linen_lock_release -linen_lock_release: - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Check validity of argument ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Return EINVAL if rdi is NULL or invalid - mov eax, -22 ; (EINVAL = -22) - - test rdi, rdi - jz release_return ; rdi is NULL - - ; rdi is nonzero, so let's just assume it's a valid pointer; - ; if that assumption is wrong we'll get a segmentation fault. - ; But we don't yet trust that [rdi] is a valid lock handle! - ; To verify this we check the canary value stored at [rdi + 8]. 
- mov ecx, [rdi + 8] - cmp ecx, 0xCAFEBABE - jnz release_return - - ; Lock owners are identified by their TID; let's find ours. - ; The gettid system call simply returns our Linux thread ID. - ; See: man 2 gettid - - ; gettid: rax = system call ID - mov eax, SYS_GETTID - ; gettid: rax = gettid() - syscall - - ; Save a copy of our TID (no need for an error check) - mov edx, eax - - ; Return EPERM if this lock currently doesn't belong to us - mov eax, -1 ; (EPERM = -1) - - ; Read the futex dword at [rdi] and keep its lowest 30 bits - mov ecx, [rdi] - and ecx, FUTEX_TID_MASK - ; Those bits contain the owner's TID; it should be our TID - cmp ecx, edx - jne release_return - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; (Partially) release our lock ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Decrement the recursion counter. If it's still > 1, we're done here. - dec dword [rdi + 4] - jnz release_success - ; If it reaches 0, it's time for a full release by setting [rdi] to 0. - - ; Restore our saved TID to eax for "lock cmpxchg" below - mov eax, edx - - ; Atomically try to set the dword at [rdi] to 0 if it was equal to our TID. - ; if ([rdi] == eax]) { [rdi] = 0; goto release_success; } else { eax = [rdi]; } - xor ecx, ecx - lock cmpxchg [rdi], ecx - je release_success - - ; We failed because [rdi] wasn't equal to our TID. In theory, - ; that can mean only one thing: [rdi] = (edx | FUTEX_WAITERS). - ; In that case we need to ask the kernel to wake up the threads - ; who are waiting (via a futex system call) for [rdi] to change. 
- ; See: man 2 futex - - ; futex: rdi = uaddr: address of the dword to announce for - ; futex: rsi = futex_op: which futex operation we want: - ; - FUTEX_UNLOCK_PI: wake up one thread sleeping via FUTEX_LOCK_PI - ; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process - mov esi, (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) ; futex: futex_op - ; futex: rdx = val: ignored when FUTEX_UNLOCK_PI is used - ; futex: r10 = timeout: ignored when FUTEX_UNLOCK_PI is used - ; futex: r8 = uaddr2: ignored when FUTEX_UNLOCK_PI is used - ; futex: r9 = val3: ignored when FUTEX_UNLOCK_PI is used - ; futex: rax = system call ID - mov eax, SYS_FUTEX - ; futex: rax = futex(rdi, rsi, (rdx), (r10), (r8), (r9)) - syscall - - ; Check result of futex: nonzero means failure - test rax, rax - jnz release_return - - release_success: - xor eax, eax - - release_return: - ret diff --git a/lib/thread_create.asm b/lib/thread_create.asm deleted file mode 100644 index 9a6fe78..0000000 --- a/lib/thread_create.asm +++ /dev/null @@ -1,264 +0,0 @@ -; Under MIT license, see /LICENSE.txt - - -; Cheat sheet for Linux' x86_64 calling convention: -; -; - free to overwrite (caller should save them): -; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15 -; - caller expects be kept (callee should save them): -; rbx, rbp, r12-r15 -; -; - for passing paramters to functions: -; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7 -; - for getting return values from functions: -; rax, rdx, xmm0 -; -; - for passing parameters to syscalls: -; rax, rdi, rsi, rdx, r10, r8, r9 -; - for getting return values from syscalls: -; rax, rdx -; - overwritten by syscalls (all others preserved): -; rcx, r11 - - -section .text - - -; Relevant system call IDs -%define SYS_MMAP 9 -%define SYS_MPROTECT 10 -%define SYS_CLONE 56 -%define SYS_EXIT 60 - -; Relevant flags for mmap -%define MAP_SHARED 0x00001 -%define MAP_PRIVATE 0x00002 -%define MAP_ANONYMOUS 0x00020 -;%define MAP_GROWSDOWN 0x00100 ; Insecure, segfaults anyway -%define MAP_LOCKED 0x02000 
-%define MAP_POPULATE 0x08000 -%define MAP_STACK 0x20000 - -; Relevant flags for mprotect -%define PROT_READ 0x1 -%define PROT_WRITE 0x2 - -; Relevant flags for clone -%define CLONE_VM 0x00000100 -%define CLONE_FS 0x00000200 -%define CLONE_FILES 0x00000400 -%define CLONE_SIGHAND 0x00000800 -%define CLONE_PARENT 0x00008000 -%define CLONE_THREAD 0x00010000 -%define CLONE_SYSVSEM 0x00040000 -%define CLONE_SETTLS 0x00080000 -%define CLONE_PARENT_SETTID 0x00100000 -%define CLONE_CHILD_CLEARTID 0x00200000 -%define CLONE_CHILD_SETTID 0x01000000 -%define CLONE_IO 0x80000000 - - -%define STACK_SIZE 2097152 ; 2 MiB stack -%define GUARD_PAGE 4096 ; 4 KiB guard page - - -; Create a new thread executing a given function. Arguments: -; rdi: struct{u32,u32}** = where to put the thread handle -; rsi: void* (*)(void*) = function to make the child run -; rdx: void* = single argument for function -; Returns zero on success, or a standard error code. -global linen_thread_create -linen_thread_create: - ; Callee-save registers - push rbx - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Check validity of arguments ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Return EINVAL if any argument is NULL - mov eax, -22 ; (EINVAL = -22) - test rdi, rdi - jz create_return ; Nowhere to store the thread handle - test rsi, rsi - jz create_return ; No function for the thread to run - - ; Note: we allow rdx to be NULL; in that case the worst that can happen - ; is a segmentation fault in the user's code (not really our problem). - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Allocate a stack and guard page ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Save these registers: we'll clobber them for the mmap call - mov rbx, rdi - push rdx - push rsi - - ; The mmap system call does many things, in this case allocate memory. 
- ; See: man 2 mmap - - ; mmap: rdi = addr: address for mapping; 0 lets kernel choose - xor edi, edi - ; mmap: rsi = length: size of buffer to allocate - mov esi, (STACK_SIZE + GUARD_PAGE) - ; mmap: rdx = prot: mprotect-style access permissions - mov edx, (PROT_WRITE | PROT_READ) - ; mmap: r10 = flags: configuration flags for mapping: - ; - MAP_ANONYMOUS: there is no file backing this buffer - ; - MAP_PRIVATE: only this process can see thread's stack - ; - MAP_STACK: no-op; inform kernel that this is a stack - mov r10, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK) - ; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1 - mov r8, -1 - ; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used - xor r9, r9 - ; mmap: rax = system call ID - mov eax, SYS_MMAP - ; mmap: rax = mmap(rdi, rsi, rdx, r10, r8, 9) - syscall - - ; Pop these now before we start branching. Those registers - ; won't be used by the next system calls, so they're safe. - pop r8 ; function - pop r9 ; argument - - ; Check result of mmap: negative means failure, - ; otherwise rax is the address of the new mapping. - test rax, rax - js create_return - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Revoke guard page's R/W permissions ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Keep in mind that stacks grow downward, so the guard page is at - ; the lowest address of the newly-allocated buffer, i.e. at [rax]. - - ; The mprotect system call changes the permissions of a memory region. 
- ; See: man 2 mprotect - - ; mprotect: rdi = addr: lower address of region to control - mov rdi, rax - ; mprotect: rsi = len: size of region, one page in this case - mov esi, GUARD_PAGE - ; mprotect: rdx = prot: access permissions; zero for none - xor edx, edx - ; mprotect: rax = system call ID - mov eax, SYS_MPROTECT - ; mprotect: rax = mprotect(rdi, rsi, rdx) - syscall - - ; Check result of mprotect: nonzero means failure - test rax, rax - jnz create_return - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Spawn a thread with the new stack ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; The clone system call spawns a new thread, cloned from a parent. - ; Both threads end up running the same code, i.e. it returns "twice", - ; once in the parent (0 if success) and once in the child (the TID). - ; See: man 2 clone - - ; clone: rsi = stack - ; Currently rdi points to the lowest byte of the stack area. - ; Again, stacks grow downward, so we calculate the address of - ; the top qword to use as the child thread's starting point. - lea rsi, [rdi + (STACK_SIZE + GUARD_PAGE - 8)] - - ; clone: rdi = flags: settings for cloned thread - ; These flags make the parent and child share resources: - ; - CLONE_VM: memory address space - ; - CLONE_FS: filesystem information, e.g. working directory - ; - CLONE_FILES: file descriptor table - ; - CLONE_IO: I/O scheduler context - ; - CLONE_SIGHAND: signal handlers - ; - CLONE_PARENT: parent process (implied by CLONE_THREAD?) - ; - CLONE_THREAD: shared PID, distinguish by TID instead (I think?) 
- ; These flags are relevant for a threading API: - ; - CLONE_CHILD_SETTID: store child's TID at supplied address (in r10) - ; - CLONE_CHILD_CLEARTID: set stored TID to zero when child finishes - ; (this will be used for joining threads) - mov edi, (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO \ - | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD \ - | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID) - - ; clone: rdx = parent_tid: ignored unless CLONE_PARENT_SETTID is used - - ; clone: r10 = child_tid: address to store new thread's TID - ; We use "bottom" of stack (rsi), i.e. where child will start. - mov r10, rsi - - ; clone: r8 = tls: ignored unless CLONE_SETTLS is used - - ; clone: rax = system call ID - mov eax, SYS_CLONE - ; clone: rax = clone(rdi, rsi, (rdx), r10, (r8)); - syscall - - ; Ideally, both parent and new-born child are executing this code now. - - ; Check result of clone: - test rax, rax - js create_return ; Negative means failure - jnz create_success ; Positive means we're in the parent thread - ; Zero means we're in the child thread - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Initialization in child thread ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Best practice is to clear the frame pointer - xor ebp, ebp - - ; Move argument into place and call supplied function - mov rdi, r9 - call r8 - - ; Once done, leave function's return value lying around - push rax - - ; Exit the thread with return value 0 - xor edi, edi - mov rax, SYS_EXIT - syscall ; (never returns) - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Clean up in parent thread ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - create_success: - ; We use the highest dword of the child's stack buffer as a futex - ; to detect when it has finished (see CLONE_CHILD_CLEARTID above). - ; That dword's address also acts as a thread handle for our API, - ; so we store it at the address the caller supplied (now in rbx). 
- mov [rbx], rsi - - ; We place a canary value in the unused dword at the top: - ; checking this value tells us if a thread handle is valid. - mov dword [rsi + 4], 0xDEADBEEF - - ; "Sketch" of child's stack buffer's layout: - ; - ; (bottom of range allocated by mmap) - ; 4 KiB: guard page, unused - ; (bottom of usable buffer) - ; ... - ; ... Child is currently doing work here ... - ; ... - ; qword: return address of function called by child (from r8) - ; dword: futex to detect when child has returned (address: rsi) - ; dword: canary value to know if handle is valid (address: rsi + 4) - ; (top of range allocated by mmap = top of usable buffer) - - ; Return 0 for success - xor eax, eax - - create_return: - ; Restore callee-save registers - pop rbx - - ret - diff --git a/lib/thread_finish.asm b/lib/thread_finish.asm deleted file mode 100644 index 860b0a4..0000000 --- a/lib/thread_finish.asm +++ /dev/null @@ -1,147 +0,0 @@ -; Under MIT license, see /LICENSE.txt - - -; Cheat sheet for Linux' x86_64 calling convention: -; -; - free to overwrite (caller should save them): -; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15 -; - caller expects be kept (callee should save them): -; rbx, rbp, r12-r15 -; -; - for passing paramters to functions: -; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7 -; - for getting return values from functions: -; rax, rdx, xmm0 -; -; - for passing parameters to syscalls: -; rax, rdi, rsi, rdx, r10, r8, r9 -; - for getting return values from syscalls: -; rax, rdx -; - overwritten by syscalls (all others preserved): -; rcx, r11 - - -section .text - - -; Relevant system call IDs -%define SYS_MUNMAP 11 -%define SYS_FUTEX 202 - -; Relevant operations for futex -%define FUTEX_WAIT 0 -%define FUTEX_PRIVATE_FLAG 0x80 - - -%define STACK_SIZE 2097152 ; 2 MiB stack -%define GUARD_PAGE 4096 ; 4 KiB guard page - - -; Wait for thread to exit, save its return value, and clean up. 
Arguments: -; rdi: struct{u32,u32}* = handle of the thread to wait for -; rsi: void** = where to put void* returned by thread -; Returns zero on success, or a standard error code. -global linen_thread_finish -linen_thread_finish: - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Check validity of arguments ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Return EINVAL if rdi is NULL or otherwise invalid - mov eax, -22 ; (EINVAL = -22) - - test rdi, rdi - jz finish_return ; rdi is NULL - - ; rdi is nonzero, so let's just assume it's a valid pointer; - ; if that assumption is wrong we'll get a segmentation fault. - ; But we don't yet trust that [rdi] is a valid thread handle! - ; To verify this we check the canary value stored at [rdi + 4]. - mov ecx, [rdi + 4] - cmp ecx, 0xDEADBEEF - jnz finish_return - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Wait until thread is finished ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; We'll clobber rsi if we need to set up a futex call - mov r8, rsi - - finish_retry: - ; When spawning, we set CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID: - ; [rdi] contains the child thread's TID, and will get automatically - ; cleared (to 0) when the child exits; this is what we'll watch for. - - ; Atomically check whether the target thread is still running. - ; if ([rdi] == 0) { goto finish_success; } else { eax = [rdi]; } - xor eax, eax - lock cmpxchg [rdi], eax - jz finish_success - - ; The thread is still busy, so block until it's done. - ; The futex system call waits until the dword at an - ; address (rdi) deviates from an expected value (eax). 
- ; See: man 2 futex - - ; futex: rdi = uaddr: address of the dword to watch - ; futex: rsi = futex_op: which futex operation we want: - ; - FUTEX_WAIT: block until the value at [rdi] changes - ; - FUTEX_PRIVATE_FLAG: FIXME waits forever, I don't understand why - mov esi, FUTEX_WAIT - ; futex: rdx = val: the expected value at [rdi] before it changes - mov edx, eax - ; futex: r10 = timeout: in case we had a deadline (we don't) - xor r10, r10 - ; futex: r8 = uaddr2: ignored when FUTEX_WAIT is used - ; futex: r9 = val3: ignored when FUTEX_WAIT is used - ; futex: rax = system call ID - mov eax, SYS_FUTEX - ; futex: rax = futex(rdi, rsi, rdx, r10, (r8), (r9)) - syscall - - ; Sometimes the thread exits after the "lock cmpxchg" instruction - ; but before the futex call. In that case, futex returns EAGAIN. - cmp rax, -11 ; (EAGAIN = -11) - je finish_retry - - ; Any other nonzero return value means failure - test rax, rax - jnz finish_return - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;;;; Clean up after thread's exit ;;;; - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - finish_success: - ; The thread left its function return value on the stack, read it - mov rdx, [rdi - 8] - - ; The munmap system call destroys mappings created by mmap. - ; In this case that means deallocating the stack buffer. 
- ; See: man 2 munmap - - ; munmap: rdi = addr: lowest address of region to unmap - ; Our rdi is near the buffer's top, so we must subtract - sub rdi, (STACK_SIZE + GUARD_PAGE - 8) - ; munmap: rsi = length: size of region starting from rdi - mov esi, (STACK_SIZE + GUARD_PAGE) - ; munmap: rax = system call ID - mov eax, SYS_MUNMAP - ; munmap: rax = munmap(rdi, rsi) - syscall - - ; Check result of munmap: nonzero means failure - test rax, rax - jnz finish_return - - ; Check if caller gave a location (r8) to save the return value (rdx) - test r8, r8 - jz finish_return ; caller doesn't care: gave NULL pointer - mov [r8], rdx - ; Note: if munmap failed, the buffer is still there, so we - ; can safely return an error without losing the return value. - - finish_return: - ret - diff --git a/src/lock_acquire.asm b/src/lock_acquire.asm new file mode 100644 index 0000000..f32ba6a --- /dev/null +++ b/src/lock_acquire.asm @@ -0,0 +1,204 @@ +; Under MIT license, see /LICENSE.txt + + +; Cheat sheet for Linux' x86_64 calling convention: +; +; - free to overwrite (caller should save them): +; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15 +; - caller expects to be kept (callee should save them): +; rbx, rbp, r12-r15 +; +; - for passing parameters to functions: +; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7 +; - for getting return values from functions: +; rax, rdx, xmm0 +; +; - for passing parameters to syscalls: +; rax, rdi, rsi, rdx, r10, r8, r9 +; - for getting return values from syscalls: +; rax, rdx +; - overwritten by syscalls (all others preserved): +; rcx, r11 + + +section .text + + +; Relevant system call IDs +%define SYS_GETTID 186 +%define SYS_FUTEX 202 + +; Relevant operations for futex +%define FUTEX_LOCK_PI 6 +%define FUTEX_PRIVATE_FLAG 0x80 + +; Relevant bits for futex dword +%define FUTEX_TID_MASK 0x3fffffff +%define FUTEX_OWNER_DIED 0x40000000 +%define FUTEX_WAITERS 0x80000000 + + +; Acquire a lock if possible, or wait until it gets released.
Argument: +; rdi: struct{u32,u32,u32}* = handle of lock to acquire +; Returns zero on success, or a standard error code. +global linen_lock_acquire +linen_lock_acquire: + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;;; Check validity of argument ;;;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ; Return EINVAL if rdi is NULL or otherwise invalid + mov eax, -22 ; (EINVAL = -22) + + test rdi, rdi + jz acquire_return ; rdi is NULL + + ; rdi is nonzero, so let's just assume it's a valid pointer; + ; if that assumption is wrong we'll get a segmentation fault. + ; But we don't yet trust that [rdi] is a valid lock handle! + ; To verify this we check the canary value stored at [rdi + 8]. + mov ecx, [rdi + 8] + cmp ecx, 0xCAFEBABE + jnz acquire_return + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;;; Check ownership of lock ;;;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ; Lock owners are identified by their TID; let's find ours. + ; The gettid system call simply returns our Linux thread ID. + ; See: man 2 gettid + + ; gettid: rax = system call ID + mov eax, SYS_GETTID + ; gettid: rax = gettid() + syscall + + ; Save a copy of our TID (no need for an error check) + mov edx, eax + + ; There are four possible ownership situations for the lock, + ; which we can distinguish based on the dword value at [rdi]: + ; - Case 1: if [rdi] contains zero, then the lock is available. + ; - Case 2: if [rdi] has any of its highest 2 bits set, then the + ; lock isn't free, and kernel intervention is required. + ; - Case 3: if the lower 30 bits of [rdi] contain our TID, + ; then we already own it (recursive acquisition). + ; - Case 4: if the lower 30 bits of [rdi] contain another TID + ; and the high-bit flags aren't set, then we just wait + ; until we can acquire the lock using atomic operations + ; or, optionally, a futex call (usually more efficient). + + ; Atomically check whether the lock is owned by another thread, + ; and if not, try to take ownership by writing our TID to [rdi]. 
+ ; if ([rdi] == 0) { [rdi] = edx; goto acquire_success; } else { eax = [rdi]; } + xor eax, eax + lock cmpxchg [rdi], edx + jz acquire_success ; case 1 + + ; The lock isn't free, so let's check how "clean" its state is. + ; The following flags are set by the kernel (see futex below): + ; - FUTEX_OWNER_DIED: the lock's owner died, so it's actually free + ; (but first the kernel needs to clean up) + ; - FUTEX_WAITERS: we aren't the only one waiting for this lock + ; (so let's sleep until the kernel wakes us up) + ; Either way, we need the kernel's help, so jump to the futex call. + test eax, (FUTEX_OWNER_DIED | FUTEX_WAITERS) + jnz acquire_futex ; case 2 + + ; Someone has the lock and no flag is set; check who: it may already be us. + ; If so, this is a recursive acquisition, good, let's continue. NOTE(review): an owner re-acquiring while FUTEX_WAITERS is set takes the futex path above instead of reaching this check; confirm that is intended. + and eax, FUTEX_TID_MASK + cmp eax, edx + je acquire_success ; case 3 + + ; Someone else has the lock, but we're the only one waiting for it. + ; System calls are expensive, so let's try a short spin loop first, + ; hoping it'll get released soon. This is arguably unnecessary, as + ; it's only beneficial when two threads are more or less "in sync", + ; so in most real-world cases you can delete this with no downside. + + ; Loop counter + mov ecx, 10 + acquire_spinloop: + ; The "pause" instruction is specially designed for loops like this + ; and conserves power. It causes a small delay (makes sense here). + pause + + ; Atomically check whether the lock is owned by another thread, + ; and if not, try to take ownership by writing our TID to [rdi].
+ ; if ([rdi] == 0) { [rdi] = edx; goto acquire_success; } else { eax = [rdi]; } + xor eax, eax + lock cmpxchg [rdi], edx + jz acquire_success + + ; Decrement loop counter until zero + dec ecx + jnz acquire_spinloop + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;;; Let the kernel handle it ;;;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + acquire_futex: + ; The futex system call waits until the dword at an address (rdi) + ; changes in a certain way, as described above and in the futex + ; manual's section on so-called "priority-inheritance futexes". + ; See: man 2 futex + + ; futex: rdi = uaddr: address of the dword to watch + ; futex: rsi = futex_op: which futex operation we want: + ; - FUTEX_LOCK_PI: block until lock's owner uses FUTEX_UNLOCK_PI + ; - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process + mov esi, (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) + ; futex: r10 = timeout: in case we had a deadline (we don't) + xor r10, r10 + ; futex: rdx = val: ignored when FUTEX_LOCK_PI is used + ; futex: r8 = uaddr2: ignored when FUTEX_LOCK_PI is used + ; futex: r9 = val3: ignored when FUTEX_LOCK_PI is used + ; futex: rax = system call ID + mov eax, SYS_FUTEX + ; futex: rax = futex(rdi, rsi, (rdx), r10, (r8), (r9)) + syscall + + ; Sometimes the lock is released after the "lock cmpxchg" instruction + ; but just before the futex call. In that case, futex returns EAGAIN.
+ cmp rax, -11 ; (-EAGAIN) + je acquire_futex + + ; Any other negative return value means failure + test rax, rax + jnz acquire_return + + ; Indicate that we made a futex call (see below for why) + xor edx, edx + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;;; Update the recursion counter ;;;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + acquire_success: + ; Read the recursion counter (we have the lock: no need for atomics) + mov ecx, [rdi + 4] + + ; The value in edx depends on how we came to the acquire_success label: + ; 1) We jumped here after a successful "lock cmpxchg": edx has our TID + ; 2) We finished a successful futex call: edx was set to 0 (see above) + test edx, edx + ; Why do we care? Well, in the latter case, the futex call may have been + ; necessary because there was a problem (i.e. FUTEX_OWNER_DIED was set), + ; in which case the recursion counter is stale and hence must be reset. + ; In any other case, whoever released the lock should've reset it already. + cmovz ecx, edx ; ecx = 0 + + ; Increment the recursion counter and write it back to memory + ; (if the lock is being used non-recursively, it should be 1) + inc ecx + mov [rdi + 4], ecx + + ; Lock acquisition was successful, so we'll return 0. In most cases + ; eax is already 0; we only need this if the recursion counter > 1. 
+    xor eax, eax
+
+  acquire_return:
+    ret
diff --git a/src/lock_release.asm b/src/lock_release.asm
new file mode 100644
index 0000000..f86caa2
--- /dev/null
+++ b/src/lock_release.asm
@@ -0,0 +1,132 @@
+; Under MIT license, see /LICENSE.txt
+
+
+; Cheat sheet for Linux' x86_64 calling convention:
+;
+; - free to overwrite (caller should save them):
+;     rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
+; - caller expects to be kept (callee should save them):
+;     rbx, rbp, r12-r15
+;
+; - for passing parameters to functions:
+;     rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
+; - for getting return values from functions:
+;     rax, rdx, xmm0
+;
+; - for passing parameters to syscalls:
+;     rax, rdi, rsi, rdx, r10, r8, r9
+; - for getting return values from syscalls:
+;     rax, rdx
+; - overwritten by syscalls (all others preserved):
+;     rcx, r11
+
+
+section .text
+
+
+; Relevant system call IDs
+%define SYS_GETTID 186
+%define SYS_FUTEX 202
+
+; Relevant operations for futex
+%define FUTEX_UNLOCK_PI 7
+%define FUTEX_PRIVATE_FLAG 0x80
+
+; Relevant bits for futex dword
+%define FUTEX_TID_MASK 0x3fffffff
+%define FUTEX_OWNER_DIED 0x40000000
+%define FUTEX_WAITERS 0x80000000
+
+
+; Release one level of a lock held by the calling thread. Argument:
+;   rdi: struct{u32 futex, u32 recursion counter, u32 canary}* = lock handle
+; Returns 0, or -22 (bad handle), -1 (not the owner), or futex's error code.
+global linen_lock_release
+linen_lock_release:
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Check validity of argument ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Preload EINVAL as the return value for the validity checks below
+    mov eax, -22 ; (EINVAL = -22)
+
+    test rdi, rdi
+    jz release_return ; rdi is NULL
+
+    ; rdi is nonzero, so let's just assume it's a valid pointer;
+    ; if that assumption is wrong we'll get a segmentation fault.
+    ; But we don't yet trust that [rdi] is a valid lock handle!
+    ; To verify this we check the canary value stored at [rdi + 8].
+    mov ecx, [rdi + 8]
+    cmp ecx, 0xCAFEBABE
+    jnz release_return
+
+    ; Lock owners are identified by their TID; let's find ours.
+    ; The gettid system call simply returns our Linux thread ID.
+    ; See: man 2 gettid
+
+    ; gettid: rax = system call ID
+    mov eax, SYS_GETTID
+    ; gettid: rax = gettid() (rdi survives: syscalls only clobber rcx, r11)
+    syscall
+
+    ; Save a copy of our TID (no need for an error check)
+    mov edx, eax
+
+    ; Preload EPERM: returned if this lock currently doesn't belong to us
+    mov eax, -1 ; (EPERM = -1)
+
+    ; Read the futex dword at [rdi] and keep its lowest 30 bits
+    mov ecx, [rdi]
+    and ecx, FUTEX_TID_MASK
+    ; Those bits contain the owner's TID; it should be our TID
+    cmp ecx, edx
+    jne release_return
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; (Partially) release our lock ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Decrement the recursion counter. If it's still > 0, we're done here.
+    dec dword [rdi + 4]
+    jnz release_success
+    ; If it reaches 0, it's time for a full release by setting [rdi] to 0.
+
+    ; Restore our saved TID to eax for "lock cmpxchg" below
+    mov eax, edx
+
+    ; Atomically try to set the dword at [rdi] to 0 if it was equal to our TID.
+    ; if ([rdi] == eax) { [rdi] = 0; goto release_success; } else { eax = [rdi]; }
+    xor ecx, ecx
+    lock cmpxchg [rdi], ecx
+    je release_success
+
+    ; We failed because [rdi] wasn't equal to our TID. In theory,
+    ; that can mean only one thing: [rdi] = (edx | FUTEX_WAITERS).
+    ; In that case we need to ask the kernel to wake up a thread
+    ; that is waiting (via a futex system call) for [rdi] to change.
+    ; See: man 2 futex
+
+    ; futex: rdi = uaddr: address of the futex dword to unlock
+    ; futex: rsi = futex_op: which futex operation we want:
+    ;   - FUTEX_UNLOCK_PI: wake up one thread sleeping via FUTEX_LOCK_PI
+    ;   - FUTEX_PRIVATE_FLAG: this lock isn't shared with another process
+    mov esi, (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) ; futex: futex_op
+    ; futex: rdx = val: ignored when FUTEX_UNLOCK_PI is used
+    ; futex: r10 = timeout: ignored when FUTEX_UNLOCK_PI is used
+    ; futex: r8 = uaddr2: ignored when FUTEX_UNLOCK_PI is used
+    ; futex: r9 = val3: ignored when FUTEX_UNLOCK_PI is used
+    ; futex: rax = system call ID
+    mov eax, SYS_FUTEX
+    ; futex: rax = futex(rdi, rsi, (rdx), (r10), (r8), (r9))
+    syscall
+
+    ; Nonzero futex result means failure (the counter was already set to 0)
+    test rax, rax
+    jnz release_return
+
+  release_success:
+    xor eax, eax
+
+  release_return:
+    ret
diff --git a/src/thread_create.asm b/src/thread_create.asm
new file mode 100644
index 0000000..9a6fe78
--- /dev/null
+++ b/src/thread_create.asm
@@ -0,0 +1,264 @@
+; Under MIT license, see /LICENSE.txt
+
+
+; Cheat sheet for Linux' x86_64 calling convention:
+;
+; - free to overwrite (caller should save them):
+;     rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
+; - caller expects to be kept (callee should save them):
+;     rbx, rbp, r12-r15
+;
+; - for passing parameters to functions:
+;     rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
+; - for getting return values from functions:
+;     rax, rdx, xmm0
+;
+; - for passing parameters to syscalls:
+;     rax, rdi, rsi, rdx, r10, r8, r9
+; - for getting return values from syscalls:
+;     rax, rdx
+; - overwritten by syscalls (all others preserved):
+;     rcx, r11
+
+
+section .text
+
+
+; Relevant system call IDs
+%define SYS_MMAP 9
+%define SYS_MPROTECT 10
+%define SYS_CLONE 56
+%define SYS_EXIT 60
+
+; Relevant flags for mmap
+%define MAP_SHARED 0x00001
+%define MAP_PRIVATE 0x00002
+%define MAP_ANONYMOUS 0x00020
+;%define MAP_GROWSDOWN 0x00100 ; Insecure, segfaults anyway
+%define MAP_LOCKED 0x02000
+%define MAP_POPULATE 0x08000
+%define MAP_STACK 0x20000
+
+; Relevant flags for mprotect
+%define PROT_READ 0x1
+%define PROT_WRITE 0x2
+
+; Relevant flags for clone
+%define CLONE_VM 0x00000100
+%define CLONE_FS 0x00000200
+%define CLONE_FILES 0x00000400
+%define CLONE_SIGHAND 0x00000800
+%define CLONE_PARENT 0x00008000
+%define CLONE_THREAD 0x00010000
+%define CLONE_SYSVSEM 0x00040000
+%define CLONE_SETTLS 0x00080000
+%define CLONE_PARENT_SETTID 0x00100000
+%define CLONE_CHILD_CLEARTID 0x00200000
+%define CLONE_CHILD_SETTID 0x01000000
+%define CLONE_IO 0x80000000
+
+
+%define STACK_SIZE 2097152 ; 2 MiB stack
+%define GUARD_PAGE 4096 ; 4 KiB guard page
+
+
+; Create a new thread executing a given function. Arguments:
+;   rdi: struct{u32,u32}** = where to put the thread handle
+;   rsi: void* (*)(void*) = function to make the child run
+;   rdx: void* = single argument for function
+; Returns 0, or -22 (EINVAL), or the error code of mmap/mprotect/clone.
+global linen_thread_create
+linen_thread_create:
+    ; Callee-save registers (rbx will hold the caller's handle pointer)
+    push rbx
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Check validity of arguments ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Return EINVAL if rdi or rsi is NULL
+    mov eax, -22 ; (EINVAL = -22)
+    test rdi, rdi
+    jz create_return ; Nowhere to store the thread handle
+    test rsi, rsi
+    jz create_return ; No function for the thread to run
+
+    ; Note: we allow rdx to be NULL; in that case the worst that can happen
+    ; is a segmentation fault in the user's code (not really our problem).
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Allocate a stack and guard page ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Save these registers: we'll clobber them for the mmap call
+    mov rbx, rdi
+    push rdx
+    push rsi
+
+    ; The mmap system call does many things, in this case allocate memory.
+    ; See: man 2 mmap
+
+    ; mmap: rdi = addr: address for mapping; 0 lets kernel choose
+    xor edi, edi
+    ; mmap: rsi = length: size of buffer to allocate
+    mov esi, (STACK_SIZE + GUARD_PAGE)
+    ; mmap: rdx = prot: mprotect-style access permissions
+    mov edx, (PROT_WRITE | PROT_READ)
+    ; mmap: r10 = flags: configuration flags for mapping:
+    ;   - MAP_ANONYMOUS: there is no file backing this buffer
+    ;   - MAP_PRIVATE: only this process can see thread's stack
+    ;   - MAP_STACK: no-op; inform kernel that this is a stack
+    mov r10, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
+    ; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1
+    mov r8, -1
+    ; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used
+    xor r9, r9
+    ; mmap: rax = system call ID
+    mov eax, SYS_MMAP
+    ; mmap: rax = mmap(rdi, rsi, rdx, r10, r8, r9)
+    syscall
+
+    ; Pop these now before we start branching. Those registers
+    ; won't be used by the next system calls, so they're safe.
+    pop r8 ; function
+    pop r9 ; argument
+
+    ; Check result of mmap: negative means failure,
+    ; otherwise rax is the address of the new mapping.
+    test rax, rax
+    js create_return
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Revoke guard page's R/W permissions ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Keep in mind that stacks grow downward, so the guard page is at
+    ; the lowest address of the newly-allocated buffer, i.e. at [rax].
+
+    ; The mprotect system call changes the permissions of a memory region.
+    ; See: man 2 mprotect
+
+    ; mprotect: rdi = addr: lower address of region to control
+    mov rdi, rax
+    ; mprotect: rsi = len: size of region, one page in this case
+    mov esi, GUARD_PAGE
+    ; mprotect: rdx = prot: access permissions; zero for none
+    xor edx, edx
+    ; mprotect: rax = system call ID
+    mov eax, SYS_MPROTECT
+    ; mprotect: rax = mprotect(rdi, rsi, rdx)
+    syscall
+
+    ; Nonzero mprotect result means failure (NOTE: the mapping is not freed)
+    test rax, rax
+    jnz create_return
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Spawn a thread with the new stack ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; The clone system call spawns a new thread, cloned from a parent.
+    ; Both threads end up running the same code, i.e. it returns "twice",
+    ; once in the parent (the child's TID) and once in the child (zero).
+    ; See: man 2 clone
+
+    ; clone: rsi = stack
+    ; Currently rdi points to the lowest byte of the stack area.
+    ; Again, stacks grow downward, so we calculate the address of
+    ; the top qword to use as the child thread's starting point.
+    lea rsi, [rdi + (STACK_SIZE + GUARD_PAGE - 8)]
+
+    ; clone: rdi = flags: settings for cloned thread
+    ; These flags make the parent and child share resources:
+    ;   - CLONE_VM: memory address space
+    ;   - CLONE_FS: filesystem information, e.g. working directory
+    ;   - CLONE_FILES: file descriptor table
+    ;   - CLONE_IO: I/O scheduler context
+    ;   - CLONE_SIGHAND: signal handlers
+    ;   - CLONE_PARENT: parent process (implied by CLONE_THREAD?)
+    ;   - CLONE_THREAD: shared PID, distinguish by TID instead (I think?)
+    ; These flags are relevant for a threading API:
+    ;   - CLONE_PARENT_SETTID: store child's TID at [rdx] before we return
+    ;   - CLONE_CHILD_SETTID: store child's TID at supplied address (in r10)
+    ;   - CLONE_CHILD_CLEARTID: set stored TID to zero when child finishes
+    ;     (this will be used for joining threads)
+    mov edi, (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO \
+        | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD | CLONE_PARENT_SETTID \
+        | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
+
+    ; clone: rdx = parent_tid: same dword; CHILD_SETTID alone may store late
+    mov rdx, rsi
+    ; clone: r10 = child_tid: address to store new thread's TID
+    ; We use "bottom" of stack (rsi), i.e. where child will start.
+    mov r10, rsi
+
+    ; clone: r8 = tls: ignored unless CLONE_SETTLS is used
+    ; clone: rax = system call ID
+    mov eax, SYS_CLONE
+    ; clone: rax = clone(rdi, rsi, rdx, r10, (r8));
+    syscall
+
+    ; Ideally, both parent and new-born child are executing this code now.
+
+    ; Check result of clone:
+    test rax, rax
+    js create_return ; Negative means failure (NOTE: mapping is not freed)
+    jnz create_success ; Positive means we're in the parent thread
+    ; Zero means we're in the child thread
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Initialization in child thread ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Best practice is to clear the frame pointer
+    xor ebp, ebp
+    push rbp ; zero pad: rsp was 8 (mod 16), must be 16-aligned at "call"
+    ; Move argument into place and call supplied function
+    mov rdi, r9
+    call r8
+
+    ; Store function's return value in the pad slot, i.e. at [rsi - 8]
+    mov [rsp], rax
+
+    ; Exit the thread with return value 0
+    xor edi, edi
+    mov rax, SYS_EXIT
+    syscall ; (never returns)
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Clean up in parent thread ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  create_success:
+    ; We use the highest dword of the child's stack buffer as a futex
+    ; to detect when it has finished (see CLONE_CHILD_CLEARTID above).
+    ; That dword's address also acts as a thread handle for our API,
+    ; so we store it at the address the caller supplied (now in rbx).
+    mov [rbx], rsi
+
+    ; We place a canary value in the unused dword at the top:
+    ; checking this value tells us if a thread handle is valid.
+    mov dword [rsi + 4], 0xDEADBEEF
+
+    ; "Sketch" of child's stack buffer's layout:
+    ;
+    ; (bottom of range allocated by mmap)
+    ;   4 KiB: guard page, unused
+    ; (bottom of usable buffer)
+    ;   ...
+    ;   ... Child is currently doing work here ...
+    ;   ...
+    ;   qword: where child leaves its function's return value (rsi - 8)
+    ;   dword: futex to detect when child has returned (address: rsi)
+    ;   dword: canary value to know if handle is valid (address: rsi + 4)
+    ; (top of range allocated by mmap = top of usable buffer)
+
+    ; Return 0 for success
+    xor eax, eax
+
+  create_return:
+    ; Restore callee-save registers
+    pop rbx
+
+    ret
+
diff --git a/src/thread_finish.asm b/src/thread_finish.asm
new file mode 100644
index 0000000..860b0a4
--- /dev/null
+++ b/src/thread_finish.asm
@@ -0,0 +1,154 @@
+; Under MIT license, see /LICENSE.txt
+
+
+; Cheat sheet for Linux' x86_64 calling convention:
+;
+; - free to overwrite (caller should save them):
+;     rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
+; - caller expects to be kept (callee should save them):
+;     rbx, rbp, r12-r15
+;
+; - for passing parameters to functions:
+;     rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
+; - for getting return values from functions:
+;     rax, rdx, xmm0
+;
+; - for passing parameters to syscalls:
+;     rax, rdi, rsi, rdx, r10, r8, r9
+; - for getting return values from syscalls:
+;     rax, rdx
+; - overwritten by syscalls (all others preserved):
+;     rcx, r11
+
+
+section .text
+
+
+; Relevant system call IDs
+%define SYS_MUNMAP 11
+%define SYS_FUTEX 202
+
+; Relevant operations for futex
+%define FUTEX_WAIT 0
+%define FUTEX_PRIVATE_FLAG 0x80
+
+
+%define STACK_SIZE 2097152 ; 2 MiB stack
+%define GUARD_PAGE 4096 ; 4 KiB guard page
+
+
+; Wait for thread to exit, save its return value, and clean up. Arguments:
+;   rdi: struct{u32,u32}* = handle of the thread to wait for
+;   rsi: void** = where to put void* returned by thread (may be NULL)
+; Returns 0, or -22 (EINVAL), or the error code of futex/munmap.
+global linen_thread_finish
+linen_thread_finish:
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Check validity of arguments ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Return EINVAL if rdi is NULL or otherwise invalid
+    mov eax, -22 ; (EINVAL = -22)
+
+    test rdi, rdi
+    jz finish_return ; rdi is NULL
+
+    ; rdi is nonzero, so let's just assume it's a valid pointer;
+    ; if that assumption is wrong we'll get a segmentation fault.
+    ; But we don't yet trust that [rdi] is a valid thread handle!
+    ; To verify this we check the canary value stored at [rdi + 4].
+    mov ecx, [rdi + 4]
+    cmp ecx, 0xDEADBEEF
+    jnz finish_return
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Wait until thread is finished ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; We'll clobber rsi if we need to set up a futex call
+    mov r8, rsi
+
+  finish_retry:
+    ; When spawning, we set CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID:
+    ; [rdi] contains the child thread's TID, and will get automatically
+    ; cleared (to 0) when the child exits; this is what we'll watch for.
+
+    ; Atomically check whether the target thread is still running.
+    ; if ([rdi] == 0) { goto finish_success; } else { eax = [rdi]; }
+    xor eax, eax
+    lock cmpxchg [rdi], eax
+    jz finish_success
+
+    ; The thread is still busy, so block until it's done.
+    ; The futex system call waits until the dword at an
+    ; address (rdi) deviates from an expected value (eax).
+    ; See: man 2 futex
+
+    ; futex: rdi = uaddr: address of the dword to watch
+    ; futex: rsi = futex_op: which futex operation we want:
+    ;   - FUTEX_WAIT: block until the value at [rdi] changes
+    ;   - no FUTEX_PRIVATE_FLAG: a private wait hangs forever, presumably
+    ;     because the kernel's CLONE_CHILD_CLEARTID wake-up is non-private
+    mov esi, FUTEX_WAIT
+    ; futex: rdx = val: the expected value at [rdi] before it changes
+    mov edx, eax
+    ; futex: r10 = timeout: in case we had a deadline (we don't)
+    xor r10, r10
+    ; futex: r8 = uaddr2: ignored when FUTEX_WAIT is used
+    ; futex: r9 = val3: ignored when FUTEX_WAIT is used
+    ; futex: rax = system call ID
+    mov eax, SYS_FUTEX
+    ; futex: rax = futex(rdi, rsi, rdx, r10, (r8), (r9))
+    syscall
+
+    ; None of futex's "normal" results prove that the thread has exited:
+    ;   - 0: we were woken up, but wake-ups may be spurious
+    ;   - EAGAIN: [rdi] had already changed before we could sleep
+    ;   - EINTR: a signal interrupted our sleep
+    ; So in all three cases, go back and re-check [rdi] atomically.
+    test rax, rax
+    jz finish_retry
+    cmp rax, -11 ; (EAGAIN = -11)
+    je finish_retry
+    cmp rax, -4 ; (EINTR = -4)
+    je finish_retry
+
+    ; Any other return value means failure
+    jmp finish_return
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Clean up after thread's exit ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  finish_success:
+    ; The thread left its function return value on the stack, read it
+    mov rdx, [rdi - 8]
+
+    ; The munmap system call destroys mappings created by mmap.
+    ; In this case that means deallocating the stack buffer.
+    ; See: man 2 munmap
+
+    ; munmap: rdi = addr: lowest address of region to unmap
+    ; Our rdi is near the buffer's top, so we must subtract
+    sub rdi, (STACK_SIZE + GUARD_PAGE - 8)
+    ; munmap: rsi = length: size of region starting from rdi
+    mov esi, (STACK_SIZE + GUARD_PAGE)
+    ; munmap: rax = system call ID
+    mov eax, SYS_MUNMAP
+    ; munmap: rax = munmap(rdi, rsi)
+    syscall
+
+    ; Check result of munmap: nonzero means failure
+    test rax, rax
+    jnz finish_return
+
+    ; Check if caller gave a location (r8) to save the return value (rdx)
+    test r8, r8
+    jz finish_return ; caller doesn't care: gave NULL pointer
+    mov [r8], rdx
+    ; Note: if munmap failed, the buffer is still there, so we
+    ; can safely return an error without losing the return value.
+
+  finish_return:
+    ret
+
-- 
cgit v1.2.3