; Under MIT license, see /LICENSE.txt


; Cheat sheet for Linux' x86_64 calling convention:
;
; - free to overwrite (caller should save them):
;       rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
; - caller expects these to be kept (callee should save them):
;       rbx, rbp, r12-r15
;
; - for passing parameters to functions:
;       rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
; - for getting return values from functions:
;       rax, rdx, xmm0
;
; - for passing parameters to syscalls:
;       rax, rdi, rsi, rdx, r10, r8, r9
; - for getting return values from syscalls:
;       rax, rdx
; - overwritten by syscalls (all others preserved):
;       rcx, r11


section .text


; Linux x86_64 system call numbers used in this file
%define SYS_MUNMAP  11
%define SYS_FUTEX  202

; futex(2) operation codes and option flags
%define FUTEX_WAIT         0
%define FUTEX_PRIVATE_FLAG 0x80


; Layout of a thread's stack buffer (sizes in bytes)
%define STACK_SIZE 0x200000 ; 2 MiB stack
%define GUARD_PAGE   0x1000 ; 4 KiB guard page


; Wait for thread to exit, save its return value, and clean up. Arguments:
;	rdi: struct{u32,u32}* = handle of the thread to wait for
;	                        ([rdi] = TID word, [rdi + 4] = canary)
;	rsi: void**           = where to put void* returned by thread
;	                        (may be NULL if the caller doesn't care)
; Returns (in eax) zero on success, or a negative error code:
;	-22 (EINVAL) if rdi is NULL or the canary check fails;
;	otherwise whatever negative value futex or munmap returned.
;	If the futex wait fails (e.g. it is interrupted by a signal), nothing
;	has been modified yet, so the handle is still valid for another call.
; Clobbers: rax, rcx, rdx, rsi, rdi, r8, r10, r11, flags
global linen_thread_finish
linen_thread_finish:
		; It's handy to have a register that's 0 for a while
		xor ecx, ecx

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Check validity of arguments ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; Return EINVAL (-22) if rdi is NULL or otherwise invalid
		lea eax, [rcx - 22] ; mov eax, -22

		test rdi, rdi
		jz finish_return ; rdi is NULL

		; rdi is nonzero, so let's just assume it's a valid pointer;
		; if that assumption is wrong we'll get a segmentation fault.
		; But we don't yet trust that [rdi] is a valid thread handle!
		; To verify this we check the canary value stored at [rdi + 4].
		cmp dword [rdi + 4], 0xDEADBEEF ; Oh CISC...
		jnz finish_return

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Wait until thread is finished ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		; We'll clobber rsi if we need to set up a futex call,
		; so keep the caller's result pointer in r8 (futex ignores r8
		; for FUTEX_WAIT, and syscalls preserve it).
		mov r8, rsi

	finish_retry:
		; When spawning, we set CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID:
		; [rdi] contains the child thread's TID, and will get automatically
		; cleared (to 0) when the child exits; this is what we'll watch for.

		; Atomically check whether the target thread is still running.
		; if ([rdi] == 0) { goto finish_success; } else { eax = [rdi]; }
		; This is the ONLY way to reach finish_success, so the cleanup
		; code never runs unless we have observed [rdi] == 0 ourselves.
		xor eax, eax
		lock cmpxchg [rdi], eax
		jz finish_success

		; The thread is still busy, so block until it's done.
		; The futex system call waits until the dword at an
		; address (rdi) deviates from an expected value (eax).
		; See: man 2 futex

		; futex: rdi = uaddr: address of the dword to watch
		; futex: rsi = futex_op: which futex operation we want:
		; - FUTEX_WAIT = 0:     block until the value at [rdi] changes
		; - FUTEX_PRIVATE_FLAG: must NOT be set here. The wake-up the kernel
		;   performs for CLONE_CHILD_CLEARTID is a plain FUTEX_WAKE without
		;   the private flag (see: man 2 set_tid_address), and a private
		;   waiter is not matched by a non-private wake: it'd wait forever.
		xor esi, esi ; mov esi, FUTEX_WAIT
		; futex: rdx = val: the expected value at [rdi] before it changes
		mov edx, eax
		; futex: r10 = timeout: in case we had a deadline (we don't)
		xor r10, r10
		; futex: r8 = uaddr2: ignored when FUTEX_WAIT is used
		; futex: r9 = val3:   ignored when FUTEX_WAIT is used
		; futex: rax = system call ID
		xor eax, eax
		mov al, SYS_FUTEX
		; futex: rax = futex(rdi, rsi, rdx, r10, (r8), (r9))
		syscall

		; A return value of 0 means we were woken up, but man 2 futex warns
		; that this can be a spurious wake-up: the futex word itself decides.
		; So never trust it blindly; go back and re-check [rdi]. Freeing the
		; stack of a thread that is still running would be catastrophic.
		test eax, eax
		jz finish_retry

		; Sometimes the thread exits after the "lock cmpxchg" instruction
		; but before the futex call. In that case, futex returns EAGAIN.
		cmp eax, -11 ; (EAGAIN = -11)
		je finish_retry

		; Any other return value means failure: hand it to the caller.
		; Nothing has been modified yet, so the handle remains valid.
		ret

		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		;;;; Clean up after thread's exit ;;;;
		;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	finish_success:
		; Invariant here: [rdi] == 0 (thread has exited) and eax == 0.

		; The thread left its function return value on the stack, read it
		mov rdx, [rdi - 8]

		; Remove the canary value so the thread handle becomes invalid. We're about
		; to deallocate this memory anyway, so it's optional, but maybe the address
		; becomes valid again later in the program's life, who knows? Play it safe.
		mov [rdi + 4], eax ; eax = 0, see invariant above

		; The munmap system call destroys mappings created by mmap.
		; In this case that means deallocating the stack buffer.
		; See: man 2 munmap

		; munmap: rdi = addr: lowest address of region to unmap
		; Our rdi is near the buffer's top, so we must subtract
		sub rdi, (STACK_SIZE + GUARD_PAGE - 8)
		; munmap: rsi = length: size of region starting from rdi
		mov esi, (STACK_SIZE + GUARD_PAGE)
		; munmap: rax = system call ID (upper bits of rax are still 0)
		mov al, SYS_MUNMAP
		; munmap: rax = munmap(rdi, rsi)
		syscall

		; Check result of munmap: nonzero means failure
		test eax, eax
		jnz finish_return

		; Check if caller gave a location (r8) to save the return value (rdx)
		test r8, r8
		jz finish_return ; caller doesn't care: gave NULL pointer
		mov [r8], rdx
		; Note: if munmap failed, the buffer is still there, so the return
		; value is not lost (it still sits just below the handle). However,
		; the canary has already been cleared above, so calling this function
		; again on the same handle will be rejected with EINVAL.

	finish_return:
		ret