summaryrefslogtreecommitdiff
path: root/lib/thread_create.asm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/thread_create.asm')
-rw-r--r--lib/thread_create.asm284
1 files changed, 284 insertions, 0 deletions
diff --git a/lib/thread_create.asm b/lib/thread_create.asm
new file mode 100644
index 0000000..4163d29
--- /dev/null
+++ b/lib/thread_create.asm
@@ -0,0 +1,284 @@
+; MIT license:
+;
+; Copyright (c) 2023 Marcus R.A. Newman (prefetch@prefetch.eu)
+;
+; Permission is hereby granted, free of charge, to any person obtaining a copy
+; of this software and associated documentation files (the "Software"), to deal
+; in the Software without restriction, including without limitation the rights
+; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+; copies of the Software, and to permit persons to whom the Software is
+; furnished to do so, subject to the following conditions:
+;
+; The above copyright notice and this permission notice shall be included in
+; all copies or substantial portions of the Software.
+;
+; THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+; SOFTWARE.
+
+
+; Cheat sheet for Linux' x86_64 calling convention:
+;
+; - free to overwrite; the caller should save them:
+; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
+; - caller expects no change; callee should save them:
+; rbx, rbp, r12-r15
+;
+; - for passing parameters to functions:
+; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
+; - for getting return values from functions:
+; rax, rdx, xmm0
+;
+; - for passing parameters to syscalls:
+; rax, rdi, rsi, rdx, r10, r8, r9
+; - for getting return values from syscalls:
+; rax, rdx
+; - overwritten by syscalls (all others preserved):
+; rcx, r11
+
+
+section .text
+
+
+; ---------------------------------------------------------------------
+; Linux x86_64 system call numbers used in this file
+; ---------------------------------------------------------------------
+%define SYS_MMAP                9       ; map memory
+%define SYS_MPROTECT            10      ; change protection of a mapping
+%define SYS_CLONE               56      ; create a new task
+%define SYS_EXIT                60      ; terminate the calling task
+
+; ---------------------------------------------------------------------
+; mmap flag bits (see: man 2 mmap)
+; ---------------------------------------------------------------------
+%define MAP_SHARED              0x00001
+%define MAP_PRIVATE             0x00002
+%define MAP_ANONYMOUS           0x00020
+;%define MAP_GROWSDOWN          0x00100  ; Deliberately disabled: insecure, segfaults anyway
+%define MAP_LOCKED              0x02000
+%define MAP_POPULATE            0x08000
+%define MAP_STACK               0x20000
+
+; ---------------------------------------------------------------------
+; Page protection bits, shared by mmap and mprotect (see: man 2 mprotect)
+; ---------------------------------------------------------------------
+%define PROT_READ               0x1
+%define PROT_WRITE              0x2
+
+; ---------------------------------------------------------------------
+; clone flag bits (see: man 2 clone)
+; ---------------------------------------------------------------------
+%define CLONE_VM                0x00000100
+%define CLONE_FS                0x00000200
+%define CLONE_FILES             0x00000400
+%define CLONE_SIGHAND           0x00000800
+%define CLONE_PARENT            0x00008000
+%define CLONE_THREAD            0x00010000
+%define CLONE_SYSVSEM           0x00040000
+%define CLONE_SETTLS            0x00080000
+%define CLONE_PARENT_SETTID     0x00100000
+%define CLONE_CHILD_CLEARTID    0x00200000
+%define CLONE_CHILD_SETTID      0x01000000
+%define CLONE_IO                0x80000000
+
+
+; ---------------------------------------------------------------------
+; Geometry of the buffer allocated for each new thread
+; ---------------------------------------------------------------------
+%define STACK_SIZE              (2 * 1024 * 1024)       ; 2 MiB of usable stack
+%define GUARD_PAGE              4096                    ; 4 KiB inaccessible guard page
+
+
+; Create a new thread executing a given function.
+;
+; ABI: System V AMD64 (Linux), using raw system calls.
+;
+; Arguments:
+;   rdi: u32** = where to put the thread handle (must not be NULL)
+;   rsi: void* (*)(void*) = function to make the child run (must not be NULL)
+;   rdx: void* = single argument for function (may be NULL)
+; Returns (in rax):
+;   zero on success, or a negative errno value: -EINVAL for a NULL rdi/rsi,
+;   otherwise whatever mmap, mprotect or clone reported.
+; Clobbers:
+;   rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
+;
+; On any failure after the stack buffer was allocated, the buffer is
+; unmapped again before returning, so nothing is leaked.
+
+; munmap is only needed by this function's error paths
+%ifndef SYS_MUNMAP
+%define SYS_MUNMAP 11
+%endif
+
+global linen_thread_create
+linen_thread_create:
+    ; Callee-save registers
+    push rbx
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Check validity of arguments ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Return EINVAL if the handle pointer or the function is NULL.
+    ; A full 64-bit load is used so that rax is sign-extended, exactly
+    ; like the negative errno values returned by the system calls below.
+    mov rax, -22 ; (EINVAL = 22)
+    test rdi, rdi
+    jz .end ; Nowhere to store the thread handle
+    test rsi, rsi
+    jz .end ; No function for the thread to run
+
+    ; Note: we allow rdx to be NULL; in that case the worst that can happen
+    ; is a segmentation fault in the user's code (not really our problem).
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Allocate a stack and guard page ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Save these registers: we'll clobber them for the mmap call
+    mov rbx, rdi ; rbx = where to store the handle
+    push rdx ; argument
+    push rsi ; function
+
+    ; The mmap system call does many things, in this case allocate memory.
+    ; See: man 2 mmap
+
+    ; mmap: rdi = addr: address for mapping; 0 lets kernel choose
+    xor edi, edi
+    ; mmap: rsi = length: size of buffer to allocate
+    mov esi, (STACK_SIZE + GUARD_PAGE)
+    ; mmap: rdx = prot: mprotect-style access permissions
+    mov edx, (PROT_WRITE | PROT_READ)
+    ; mmap: r10 = flags: settings for mapping
+    ; - MAP_ANONYMOUS: there is no file backing this buffer
+    ; - MAP_PRIVATE: only this process can see thread's stack
+    ; - MAP_STACK: no-op; inform kernel that this is a stack
+    mov r10d, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
+    ; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1
+    mov r8, -1
+    ; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used
+    xor r9d, r9d
+    ; mmap: rax = system call ID
+    mov eax, SYS_MMAP
+    ; mmap: rax = mmap(rdi, rsi, rdx, r10, r8, r9)
+    syscall
+
+    ; Pop these now before we start branching. Those registers
+    ; won't be used by the next system calls, so they're safe.
+    pop r8 ; function
+    pop r9 ; argument
+
+    ; Check result of mmap: a value in [-4095, -1] is a negative errno,
+    ; anything else is the address of the new mapping.
+    cmp rax, -4095
+    jae .end ; Nothing was allocated, so nothing to clean up
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Revoke guard page's R/W permissions ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Keep in mind that stacks grow downward, so the guard page is at
+    ; the lowest address of the newly-allocated buffer, i.e. at [rax].
+
+    ; The mprotect system call changes the permissions of a memory region.
+    ; See: man 2 mprotect
+
+    ; mprotect: rdi = addr: lower address of region to control
+    mov rdi, rax
+    ; mprotect: rsi = len: size of region, one page in this case
+    mov esi, GUARD_PAGE
+    ; mprotect: rdx = prot: access permissions; zero for none
+    xor edx, edx
+    ; mprotect: rax = system call ID
+    mov eax, SYS_MPROTECT
+    ; mprotect: rax = mprotect(rdi, rsi, rdx)
+    syscall
+
+    ; Check result of mprotect: nonzero means failure.
+    ; rdi still holds the base of the mapping (syscalls preserve it),
+    ; which is what the cleanup path expects.
+    test rax, rax
+    jnz .fail_unmap
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Spawn a thread with the new stack ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; The clone system call spawns a new thread, cloned from a parent.
+    ; Both threads end up running the same code, i.e. it returns "twice":
+    ; once in the parent (the child's TID if success) and once in the
+    ; child (always 0).
+    ; See: man 2 clone
+
+    ; clone: rsi = stack
+    ; Currently rdi points to the lowest byte of the stack area.
+    ; Again, stacks grow downward, so we calculate the address of
+    ; the top qword to use as the child thread's starting point.
+    lea rsi, [rdi + (STACK_SIZE + GUARD_PAGE - 8)]
+
+    ; clone: rdi = flags: settings for cloned thread
+    ; These flags make the parent and child share resources:
+    ; - CLONE_VM: memory address space
+    ; - CLONE_FS: filesystem information, e.g. working directory
+    ; - CLONE_FILES: file descriptor table
+    ; - CLONE_IO: I/O scheduler context
+    ; - CLONE_SIGHAND: signal handlers
+    ; - CLONE_PARENT: parent process (implied by CLONE_THREAD?)
+    ; - CLONE_THREAD: shared PID, distinguish by TID instead (I think?)
+    ; These flags are relevant for a threading API:
+    ; - CLONE_CHILD_SETTID: store child's TID at supplied address (in r10)
+    ; - CLONE_CHILD_CLEARTID: set stored TID to zero when child finishes
+    ; (this will be used for joining threads)
+    mov edi, (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO \
+        | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD \
+        | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
+
+    ; clone: rdx = parent_tid: ignored unless CLONE_PARENT_SETTID is used
+
+    ; clone: r10 = child_tid: address to store new thread's TID
+    ; We use "bottom" of stack (rsi), i.e. where child will start.
+    mov r10, rsi
+
+    ; clone: r8 = tls: ignored unless CLONE_SETTLS is used
+
+    ; clone: rax = system call ID
+    mov eax, SYS_CLONE
+    ; clone: rax = clone(rdi, rsi, (rdx), r10, (r8));
+    syscall
+
+    ; Ideally, both parent and new-born child are executing this code now.
+
+    ; Check result of clone:
+    test rax, rax
+    js .fail_clone ; Negative means failure (no child was created)
+    jnz .success ; Positive means we're in the parent thread
+    ; Zero means we're in the child thread
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Initialization in child thread ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    ; Best practice is to clear the frame pointer
+    xor ebp, ebp
+
+    ; The child starts with rsp = (page-aligned top of buffer) - 8, so
+    ; rsp is 8 modulo 16. The ABI wants rsp to be a multiple of 16 at
+    ; the call instruction, hence one qword of padding.
+    sub rsp, 8
+
+    ; Move argument into place and call supplied function
+    mov rdi, r9
+    call r8
+
+    ; Drop the padding again, so that the push below lands in the qword
+    ; directly beneath the futex, i.e. at [handle - 8].
+    add rsp, 8
+
+    ; Once done, leave function's return value lying around
+    push rax
+
+    ; Exit the thread with return value 0
+    xor edi, edi
+    mov eax, SYS_EXIT
+    syscall ; (never returns)
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Failure in parent: free stack buffer ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    .fail_clone:
+    ; rdi was overwritten by the clone flags; recompute the base of
+    ; the mapping from the stack top, which is still in rsi.
+    lea rdi, [rsi - (STACK_SIZE + GUARD_PAGE - 8)]
+
+    .fail_unmap:
+    ; Expects: rax = negative errno to return, rdi = base of mapping.
+    ; The handle pointer in rbx is no longer needed, so rbx is reused to
+    ; keep the error code across the munmap call (rbx is restored below).
+    mov rbx, rax
+    ; munmap: rdi = addr: base of mapping (already in place)
+    ; munmap: rsi = length: size of the whole buffer
+    mov esi, (STACK_SIZE + GUARD_PAGE)
+    ; munmap: rax = system call ID
+    mov eax, SYS_MUNMAP
+    ; munmap: rax = munmap(rdi, rsi); result is ignored, since there is
+    ; nothing more we could do about it, and the first error matters more.
+    syscall
+    mov rax, rbx
+    jmp .end
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;;;; Clean up in parent thread ;;;;
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    .success:
+    ; We use the highest qword's low dword of the child's stack buffer as
+    ; a futex to detect when it has finished (see CLONE_CHILD_CLEARTID).
+    ; That dword's address also acts as a thread handle for our API,
+    ; so we store it at the address the caller supplied (now in rbx).
+    mov [rbx], rsi
+
+    ; We place a canary value in the unused dword at the top:
+    ; checking this value tells us if a thread handle is valid.
+    mov dword [rsi + 4], 0xDEADBEEF
+
+    ; "Sketch" of child's stack buffer's layout:
+    ;
+    ; (bottom of range allocated by mmap)
+    ; 4 KiB: guard page, unused
+    ; (bottom of usable buffer)
+    ; ...
+    ; ... Child is currently doing work here ...
+    ; ...
+    ; qword: return address of function called by child (from r8)
+    ; qword: alignment padding; later the function's return value
+    ;        (address: rsi - 8)
+    ; dword: futex to detect when child has returned (address: rsi)
+    ; dword: canary value to know if handle is valid (address: rsi + 4)
+    ; (top of range allocated by mmap = top of usable buffer)
+
+    ; Return 0 for success
+    xor eax, eax
+
+    .end:
+    ; Restore callee-save registers
+    pop rbx
+
+    ret
+