; Under MIT license, see /LICENSE.txt


; Cheat sheet for Linux' x86_64 calling convention:
;
; - free to overwrite (caller should save them):
;       rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
; - caller expects these to be kept (callee should save them):
;       rbx, rbp, r12-r15
;
; - for passing parameters to functions:
;       rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
; - for getting return values from functions:
;       rax, rdx, xmm0
;
; - for passing parameters to syscalls:
;       rax, rdi, rsi, rdx, r10, r8, r9
; - for getting return values from syscalls:
;       rax, rdx
; - overwritten by syscalls (all others preserved):
;       rcx, r11


section .text


; Relevant system call IDs.
; These are the numbers loaded into rax before the `syscall` instruction.
%define SYS_MMAP      9
%define SYS_MPROTECT 10
%define SYS_CLONE    56
%define SYS_EXIT     60

; Relevant flags for mmap.
; These are bit flags, OR-ed together to form mmap's `flags` argument (r10).
; See: man 2 mmap
%define MAP_SHARED    0x00001
%define MAP_PRIVATE   0x00002
%define MAP_ANONYMOUS 0x00020
;%define MAP_GROWSDOWN 0x00100 ; Insecure, segfaults anyway
%define MAP_LOCKED    0x02000
%define MAP_POPULATE  0x08000
%define MAP_STACK     0x20000

; Relevant flags for mprotect.
; These are bit flags for the `prot` argument; a value of zero means
; "no access", which is how the guard page is set up.
; See: man 2 mprotect
%define PROT_READ  0x1
%define PROT_WRITE 0x2

; Relevant flags for clone.
; These are bit flags, OR-ed together to form clone's `flags` argument (rdi).
; See: man 2 clone
%define CLONE_VM             0x00000100
%define CLONE_FS             0x00000200
%define CLONE_FILES          0x00000400
%define CLONE_SIGHAND        0x00000800
%define CLONE_PARENT         0x00008000
%define CLONE_THREAD         0x00010000
%define CLONE_SYSVSEM        0x00040000
%define CLONE_SETTLS         0x00080000
%define CLONE_PARENT_SETTID  0x00100000
%define CLONE_CHILD_CLEARTID 0x00200000
%define CLONE_CHILD_SETTID   0x01000000
%define CLONE_IO             0x80000000


; Size of the buffer mapped for each new thread. linen_thread_create maps
; (STACK_SIZE + GUARD_PAGE) bytes and revokes all access to the lowest
; GUARD_PAGE bytes, leaving STACK_SIZE usable bytes above them.
%define STACK_SIZE 2097152 ; 2 MiB stack
%define GUARD_PAGE    4096 ; 4 KiB guard page


; Create a new thread executing a given function.
;
; ABI: System V AMD64 (Linux), using raw system calls (no libc).
; Arguments:
;	rdi: struct{u32,u32}** = where to put the thread handle
;	rsi: void* (*)(void*)  = function to make the child run
;	rdx: void*             = single argument for function
; Returns (in eax) zero on success, or a negated errno value on failure:
;	-22 (-EINVAL) if rdi or rsi is NULL, otherwise the error reported
;	by the failing mmap, mprotect or clone system call.
; On failure nothing is written to [rdi] and no memory stays allocated.
; Clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, flags (rbx is preserved).

; System call ID for releasing the stack buffer on the failure paths
%ifndef SYS_MUNMAP
%define SYS_MUNMAP 11
%endif

global linen_thread_create
linen_thread_create:
		; Callee-save registers
		push rbx

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Check validity of arguments ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; Return EINVAL (-22) if handle destination or function is NULL.
		; (mov leaves the flags alone, but we load eax first anyway.)
		mov eax, -22

		test rdi, rdi
		jz create_return ; Nowhere to store the thread handle
		test rsi, rsi
		jz create_return ; No function for the thread to run

		; Note: we allow rdx to be NULL; in that case the worst that can happen
		; is a segmentation fault in the user's code (not really our problem).

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Allocate a stack and guard page ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; Save these registers: we'll clobber them for the mmap call
		push rdx ; argument
		push rsi ; function
		push rdi ; thread handle destination

		; The mmap system call does many things, in this case allocate memory.
		; See: man 2 mmap

		; mmap: rdi = addr: address for mapping; 0 lets kernel choose
		xor edi, edi
		; mmap: rsi = length: size of buffer to allocate
		mov esi, (STACK_SIZE + GUARD_PAGE)
		; mmap: rdx = prot: mprotect-style access permissions
		mov edx, (PROT_READ | PROT_WRITE)
		; mmap: r10 = flags: configuration flags for mapping:
		; - MAP_ANONYMOUS: there is no file backing this buffer
		; - MAP_PRIVATE:   only this process can see thread's stack
		; - MAP_STACK:     no-op; inform kernel that this is a stack
		mov r10d, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
		; mmap: r8 = fd: ignored for MAP_ANONYMOUS, recommended -1
		mov r8, -1
		; mmap: r9 = offset: should be 0 when MAP_ANONYMOUS is used
		xor r9d, r9d
		; mmap: rax = system call ID
		mov eax, SYS_MMAP
		; mmap: rax = mmap(rdi, rsi, rdx, r10, (r8), (r9))
		syscall

		; Pop these now before we start branching. Those registers
		; won't be used by the next system calls, so they're safe.
		; From here on:
		;	rbx = thread handle destination
		;	r8  = function for the child to run
		;	r9  = argument for that function
		pop rbx ; thread handle destination
		pop r8  ; function
		pop r9  ; argument

		; Check result of mmap: values in [-4095, -1] are negated errno codes,
		; anything else is the address of the new mapping. Nothing has been
		; allocated on failure, so we can return the error directly.
		cmp rax, -4095
		jae create_return

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Revoke guard page's R/W permissions ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; Keep in mind that stacks grow downward, so the guard page is at
		; the lowest address of the newly-allocated buffer, i.e. at [rax].

		; The mprotect system call changes the permissions of a memory region.
		; See: man 2 mprotect

		; mprotect: rdi = addr: lower address of region to control
		mov rdi, rax
		; mprotect: rsi = len: size of region, one page in this case
		mov esi, GUARD_PAGE
		; mprotect: rdx = prot: access permissions; zero for none
		xor edx, edx
		; mprotect: rax = system call ID
		mov eax, SYS_MPROTECT
		; mprotect: rax = mprotect(rdi, rsi, rdx)
		syscall

		; Check result of mprotect: nonzero means failure. The buffer must
		; be released in that case; rdi still holds its base address because
		; system calls only overwrite rax, rcx and r11.
		test eax, eax
		jnz create_unmap_stack

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Spawn a thread with the new stack ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; The clone system call spawns a new thread, cloned from a parent.
		; Both threads end up running the same code, i.e. it returns "twice":
		; once in the parent (the child's TID if success) and once in the
		; child (always 0).
		; See: man 2 clone

		; clone: rsi = stack: pointer for child's initial rsp
		; Currently rdi points to the lowest byte of the stack area.
		; Again, stacks grow downward, so we calculate the address of
		; the top qword to use as the child thread's starting point.
		; That qword doubles as the thread handle (futex + canary).
		lea rsi, [rdi + (STACK_SIZE + GUARD_PAGE - 8)]

		; clone: rdi = flags: settings for cloned thread
		; These flags make the parent and child share resources:
		; - CLONE_VM:      memory address space
		; - CLONE_FS:      filesystem information, e.g. working directory
		; - CLONE_FILES:   file descriptor table
		; - CLONE_IO:      I/O scheduler context
		; - CLONE_SIGHAND: signal handlers
		; - CLONE_PARENT:  parent process (implied by CLONE_THREAD?)
		; - CLONE_THREAD:  shared PID, distinguish by TID instead (I think?)
		; These flags are relevant for a threading API:
		; - CLONE_PARENT_SETTID:  store child's TID at supplied address (in rdx)
		;                         before clone returns in the parent
		; - CLONE_CHILD_SETTID:   store child's TID at supplied address (in r10)
		; - CLONE_CHILD_CLEARTID: set stored TID to zero when child finishes
		;                         (this will be used for joining threads)
		mov edi, (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO \
		        | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD \
		        | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID \
		        | CLONE_CHILD_CLEARTID)

		; clone: rdx = parent_tid: address to store new thread's TID
		; CLONE_CHILD_SETTID alone only guarantees the store has happened by
		; the time the *child* runs; since the memory is shared (CLONE_VM),
		; the parent could still observe 0 ("thread finished") right after
		; clone returns. CLONE_PARENT_SETTID on the same address closes that
		; window: the TID is in place before the parent continues.
		mov rdx, rsi

		; clone: r10 = child_tid: address to store new thread's TID
		; We use "bottom" of stack (rsi), i.e. where child will start.
		mov r10, rsi

		; clone: r8 = tls: ignored unless CLONE_SETTLS is used

		; clone: rax = system call ID
		mov eax, SYS_CLONE
		; clone: rax = clone(rdi, rsi, rdx, r10, (r8));
		syscall

		; Ideally, both parent and new-born child are executing this code now.

		; Check result of clone:
		test eax, eax
		js create_clone_failed ; Negative means failure
		jnz create_success     ; Positive means we're in the parent thread
		; Zero means we're in the child thread

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Initialization in child thread ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; The child inherits the parent's registers (r8 = function,
		; r9 = argument), but rsp now points at the handle qword.

		; Best practice is to clear the frame pointer
		xor ebp, ebp

		; Reserve the qword just below the handle. This serves two purposes:
		; - rsp becomes 16-byte aligned, as the ABI demands at every call
		;   (the handle sits 8 bytes below a page boundary);
		; - it is the slot where the function's return value ends up.
		push rbp

		; Move argument into place and call supplied function
		mov rdi, r9
		call r8

		; Once done, leave function's return value lying around
		; in the reserved slot directly below the handle.
		mov [rsp], rax

		; Exit the thread with status 0 using the exit system call.
		; See: man 2 exit

		; exit: rdi = status to report
		xor edi, edi
		; exit: rax = system call ID
		mov eax, SYS_EXIT
		; exit: call never returns
		syscall

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Clean up in parent thread ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	create_success:
		; We use the highest qword of the child's stack buffer as a handle:
		; its low dword is a futex to detect when the child has finished
		; (see CLONE_CHILD_CLEARTID above). That dword's address acts as
		; a thread handle for our API, so we store it at the address the
		; caller supplied (now in rbx).
		mov [rbx], rsi

		; We place a canary value in the unused dword at the top:
		; checking this value tells us if a thread handle is valid.
		mov dword [rsi + 4], 0xDEADBEEF

		; "Sketch" of child's stack buffer's layout:
		;
		; (bottom of range allocated by mmap)
		; 4 KiB: guard page, unused
		; (bottom of usable buffer)
		; ...
		; ... Child is currently doing work here ...
		; ...
		; qword: return address of function called by child (from r8)
		; qword: function's return value, 0 until it returns (address: rsi - 8)
		; dword: futex to detect when child has returned (address: rsi)
		; dword: canary value to know if handle is valid (address: rsi + 4)
		; (top of range allocated by mmap = top of usable buffer)

		; Return 0 for success
		xor eax, eax

	create_return:
		; Restore callee-save registers
		pop rbx

		ret

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Failure paths: release stack buffer ;;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	create_clone_failed:
		; clone overwrote rdi with its flags, but rsi still holds the
		; handle address, from which we recover the buffer's base address.
		lea rdi, [rsi - (STACK_SIZE + GUARD_PAGE - 8)]

	create_unmap_stack:
		; Here: eax = negated errno to report, rdi = base of stack buffer.
		; No thread exists, so nobody else will ever free this mapping.

		; Keep the error code safe in rbx: the handle destination it held
		; is no longer needed, and the caller's rbx is restored below.
		mov ebx, eax

		; munmap: rdi = addr: base address of the mapping (already set)
		; munmap: rsi = length: size of the mapping
		mov esi, (STACK_SIZE + GUARD_PAGE)
		; munmap: rax = system call ID
		mov eax, SYS_MUNMAP
		; munmap: rax = munmap(rdi, rsi); result ignored (best effort)
		syscall

		; Report the original error, not munmap's result
		mov eax, ebx

		; Restore callee-save registers
		pop rbx

		ret