; MIT license: ; ; Copyright (c) 2023 Marcus R.A. Newman (prefetch@prefetch.eu) ; ; Permission is hereby granted, free of charge, to any person obtaining a copy ; of this software and associated documentation files (the "Software"), to deal ; in the Software without restriction, including without limitation the rights ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ; copies of the Software, and to permit persons to whom the Software is ; furnished to do so, subject to the following conditions: ; ; The above copyright notice and this permission notice shall be included in ; all copies or substantial portions of the Software. ; ; THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ; SOFTWARE. 
; Cheat sheet for Linux' x86_64 calling convention:
;
; - free to overwrite; the caller should save them:
;     rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
; - caller expects no change; callee should save them:
;     rbx, rbp, r12-r15
;
; - for passing parameters to functions:
;     rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
; - for getting return values from functions:
;     rax, rdx, xmm0
;
; - for passing parameters to syscalls:
;     rax (system call ID), rdi, rsi, rdx, r10, r8, r9
; - for getting return values from syscalls:
;     rax (a value in -4095..-1 is a negated errno)
; - overwritten by syscalls (all others preserved, including rdx):
;     rcx, r11

section .text

; Relevant system call IDs
%define SYS_MMAP      9
%define SYS_MPROTECT 10
%define SYS_CLONE    56
%define SYS_EXIT     60

; Relevant flags for mmap
%define MAP_SHARED    0x00001
%define MAP_PRIVATE   0x00002
%define MAP_ANONYMOUS 0x00020
;%define MAP_GROWSDOWN 0x00100 ; Insecure, segfaults anyway
%define MAP_LOCKED    0x02000
%define MAP_POPULATE  0x08000
%define MAP_STACK     0x20000

; Relevant flags for mprotect
%define PROT_READ  0x1
%define PROT_WRITE 0x2

; Relevant flags for clone
%define CLONE_VM             0x00000100
%define CLONE_FS             0x00000200
%define CLONE_FILES          0x00000400
%define CLONE_SIGHAND        0x00000800
%define CLONE_PARENT         0x00008000
%define CLONE_THREAD         0x00010000
%define CLONE_SYSVSEM        0x00040000
%define CLONE_SETTLS         0x00080000
%define CLONE_PARENT_SETTID  0x00100000
%define CLONE_CHILD_CLEARTID 0x00200000
%define CLONE_CHILD_SETTID   0x01000000
%define CLONE_IO             0x80000000

; Size of the memory mapped for each thread: the usable stack plus one
; inaccessible guard page placed below it (at the lowest address).
%define STACK_SIZE 2097152 ; 2 MiB stack
%define GUARD_PAGE 4096    ; 4 KiB guard page

; Create a new thread executing a given function. Arguments:
;   rdi: u32** = where to put the thread handle
;   rsi: void* (*)(void*) = function to make the child run
;   rdx: void* = single argument for function
; Returns zero on success, or a negative errno-style error code:
; -22 (EINVAL) if rdi or rsi is NULL, otherwise the value returned
; by the system call that failed (mmap, mprotect or clone).
global linen_thread_create
linen_thread_create:
    ; ABI: System V AMD64, raw Linux system calls (no libc).
    ; In:  rdi = u32** handle slot, rsi = function, rdx = argument
    ; Out: rax = 0 on success, negative errno-style code on failure
    ;
    ; Register roles once the arguments have been validated:
    ;   rbx = caller's handle slot (callee-saved, hence the push/pop)
    ;   r8  = function the child must run
    ;   r9  = argument for that function
    ;   r10 = address of the futex dword, which is also the thread handle
    ; None of these are touched by `syscall` (it only clobbers rcx, r11).

    ; munmap is needed to release the stack again on the failure paths.
    ; Defined here (guarded) so this routine does not depend on the
    ; constant list elsewhere in the file being extended.
%ifndef SYS_MUNMAP
%define SYS_MUNMAP 11
%endif

    ; Callee-save registers
    push    rbx

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Check validity of arguments ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ; Return EINVAL if any required argument is NULL.
    ; The full 64-bit register is written so that the result is negative
    ; whether the caller reads eax or rax, like the syscall error paths.
    mov     rax, -22                    ; (EINVAL = -22)
    test    rdi, rdi
    jz      create_end                  ; Nowhere to store the thread handle
    test    rsi, rsi
    jz      create_end                  ; No function for the thread to run
    ; Note: we allow rdx to be NULL; in that case the worst that can happen
    ; is a segmentation fault in the user's code (not really our problem).

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Allocate a stack and guard page ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ; Save these registers: we'll clobber them for the mmap call
    mov     rbx, rdi
    push    rdx                         ; argument
    push    rsi                         ; function

    ; The mmap system call does many things, in this case allocate memory.
    ; See: man 2 mmap

    ; mmap: rdi = addr: address for mapping; 0 lets kernel choose
    xor     edi, edi
    ; mmap: rsi = length: size of buffer to allocate
    mov     esi, (STACK_SIZE + GUARD_PAGE)
    ; mmap: rdx = prot: mprotect-style access permissions
    mov     edx, (PROT_WRITE | PROT_READ)
    ; mmap: r10 = flags: settings for mapping
    ; - MAP_ANONYMOUS: there is no file backing this buffer
    ; - MAP_PRIVATE:   only this process can see thread's stack
    ; - MAP_STACK:     no-op; inform kernel that this is a stack
    mov     r10d, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
    ; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1
    mov     r8, -1
    ; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used
    xor     r9d, r9d
    ; mmap: rax = system call ID
    mov     eax, SYS_MMAP
    ; mmap: rax = mmap(rdi, rsi, rdx, r10, r8, r9)
    syscall

    ; Pop these now before we start branching, so that every exit path
    ; sees a balanced stack. r8/r9 are not used as arguments by the next
    ; system calls, so the values survive until the child needs them.
    pop     r8                          ; function
    pop     r9                          ; argument

    ; Check result of mmap: -4095..-1 is a negated errno, anything else
    ; is the address of the new mapping. Nothing to clean up on failure.
    cmp     rax, -4095
    jae     create_end

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Revoke guard page's R/W permissions ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ; Keep in mind that stacks grow downward, so the guard page is at
    ; the lowest address of the newly-allocated buffer, i.e. at [rax].

    ; The mprotect system call changes the permissions of a memory region.
    ; See: man 2 mprotect

    ; mprotect: rdi = addr: lower address of region to control
    mov     rdi, rax
    ; mprotect: rsi = len: size of region, one page in this case
    mov     esi, GUARD_PAGE
    ; mprotect: rdx = prot: access permissions; zero for none
    xor     edx, edx
    ; mprotect: rax = system call ID
    mov     eax, SYS_MPROTECT
    ; mprotect: rax = mprotect(rdi, rsi, rdx)
    syscall

    ; Check result of mprotect: nonzero means failure. rdi still holds
    ; the base of the mapping, which is what create_unmap expects.
    test    rax, rax
    jnz     create_unmap

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Spawn a thread with the new stack ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ; The clone system call spawns a new thread, cloned from a parent.
    ; Both threads end up running the same code, i.e. it returns "twice":
    ; once in the parent (the child's TID) and once in the child (zero).
    ; See: man 2 clone

    ; clone: r10 = child_tid: address where the kernel keeps the TID.
    ; Currently rdi points to the lowest byte of the mapping. We use the
    ; top qword of the mapping: its low dword is the futex / TID, its
    ; high dword is the canary. This address is also the thread handle.
    lea     r10, [rdi + (STACK_SIZE + GUARD_PAGE - 8)]
    ; clone: rsi = stack: initial rsp of the child.
    ; One qword below the futex, i.e. 16 bytes below the (page-aligned)
    ; top of the mapping, so rsp is 16-byte aligned at the child's `call`
    ; as the System V ABI requires.
    lea     rsi, [r10 - 8]
    ; clone: rdx = parent_tid: same dword as child_tid, see below
    mov     rdx, r10
    ; clone: rdi = flags: settings for cloned thread
    ; These flags make the parent and child share resources:
    ; - CLONE_VM:      memory address space
    ; - CLONE_FS:      filesystem information, e.g. working directory
    ; - CLONE_FILES:   file descriptor table
    ; - CLONE_IO:      I/O scheduler context
    ; - CLONE_SIGHAND: signal handlers
    ; - CLONE_PARENT:  parent process (implied by CLONE_THREAD?)
    ; - CLONE_THREAD:  shared PID, distinguish by TID instead (I think?)
    ; These flags are relevant for a threading API:
    ; - CLONE_PARENT_SETTID:  store child's TID at rdx before clone
    ;                         returns in the parent
    ; - CLONE_CHILD_SETTID:   store child's TID at r10 in the child
    ; - CLONE_CHILD_CLEARTID: set stored TID to zero when child finishes
    ;                         (this will be used for joining threads)
    ; Per clone(2), with CLONE_VM the CLONE_CHILD_SETTID store may not
    ; have happened yet when clone returns in the parent; without
    ; CLONE_PARENT_SETTID the futex could still read 0 ("finished")
    ; for a thread that has not even started.
    mov     edi, (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO \
                | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD \
                | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID \
                | CLONE_CHILD_CLEARTID)
    ; clone: r8 = tls: ignored unless CLONE_SETTLS is used
    ; clone: rax = system call ID
    mov     eax, SYS_CLONE
    ; clone: rax = clone(rdi, rsi, rdx, r10, (r8));
    syscall

    ; Ideally, both parent and new-born child are executing this code now.
    ; Check result of clone:
    test    rax, rax
    js      create_clone_failed         ; Negative means failure
    jnz     create_success              ; Positive means we're in the parent
    ; Zero means we're in the child thread

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Initialization in child thread ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ; Best practice is to clear the frame pointer
    xor     ebp, ebp

    ; Move argument into place and call supplied function.
    ; rsp = r10 - 8 here, which is 16-byte aligned.
    mov     rdi, r9
    call    r8

    ; Once done, leave function's return value lying around in the qword
    ; just below the futex (handle - 8). rsp points at that slot again
    ; now that the call has returned.
    ; NOTE(review): presumably read by the joining code - confirm.
    mov     [rsp], rax

    ; Exit the thread with return value 0
    xor     edi, edi
    mov     eax, SYS_EXIT
    syscall
    ; (never returns)

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Failure after the stack was mapped ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

create_clone_failed:
    ; rdi was overwritten with the clone flags; recover the base of the
    ; mapping from the futex address, which syscall left intact in r10.
    lea     rdi, [r10 - (STACK_SIZE + GUARD_PAGE - 8)]

create_unmap:
    ; In: rdi = base of the mapping, rax = error code to report.
    ; No thread is using the stack, so give it back instead of leaking it.
    mov     r8, rax                     ; r8 (function) is no longer needed
    ; munmap: rsi = length: must cover the stack and the guard page
    mov     esi, (STACK_SIZE + GUARD_PAGE)
    ; munmap: rax = system call ID
    mov     eax, SYS_MUNMAP
    ; munmap: rax = munmap(rdi, rsi); result deliberately ignored, the
    ; caller gets the error of the call that failed in the first place
    syscall
    mov     rax, r8
    jmp     create_end

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; Clean up in parent thread ;;;;
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

create_success:
    ; We use the highest qword of the child's stack buffer: its low dword
    ; is a futex to detect when the child has finished (see
    ; CLONE_CHILD_CLEARTID above), its high dword is unused by the kernel.

    ; We place a canary value in that unused dword: checking this value
    ; tells us if a thread handle is valid. Written before the handle is
    ; published so nobody can observe a handle without its canary.
    mov     dword [r10 + 4], 0xDEADBEEF

    ; The futex's address also acts as a thread handle for our API,
    ; so we store it at the address the caller supplied (now in rbx).
    mov     [rbx], r10

    ; "Sketch" of child's stack buffer's layout:
    ;
    ;    (bottom of range allocated by mmap)
    ; 4 KiB: guard page, unused
    ;    (bottom of usable buffer)
    ; ...
    ; ... Child is currently doing work here ...
    ; ...
    ; qword: return address of function called by child  (address: r10 - 16)
    ; qword: child's initial rsp; gets the return value  (address: r10 - 8)
    ; dword: futex to detect when child has returned     (address: r10)
    ; dword: canary value to know if handle is valid     (address: r10 + 4)
    ;    (top of range allocated by mmap = top of usable buffer)

    ; Return 0 for success
    xor     eax, eax

create_end:
    ; Restore callee-save registers
    pop     rbx
    ret