1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
|
; Under MIT license, see /LICENSE.txt
; Cheat sheet for Linux' x86_64 calling convention:
;
; - free to overwrite (caller should save them):
; rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15
; - caller expects these to be preserved (callee must save them):
; rbx, rbp, r12-r15
;
; - for passing parameters to functions:
; rdi, rsi, rdx, rcx, r8, r9, xmm0-xmm7
; - for getting return values from functions:
; rax, rdx, xmm0
;
; - for passing parameters to syscalls:
; rax, rdi, rsi, rdx, r10, r8, r9
; - for getting return values from syscalls:
; rax, rdx
; - overwritten by syscalls (all others preserved):
; rcx, r11
section .text

; ---------------------------------------------------------------------
; System call numbers (Linux x86_64)
; ---------------------------------------------------------------------
%define SYS_MMAP                9
%define SYS_MPROTECT            10
%define SYS_CLONE               56
%define SYS_EXIT                60

; ---------------------------------------------------------------------
; mmap flags (see: man 2 mmap)
; ---------------------------------------------------------------------
%define MAP_SHARED              0x00001
%define MAP_PRIVATE             0x00002
%define MAP_ANONYMOUS           0x00020
;%define MAP_GROWSDOWN          0x00100 ; Insecure, segfaults anyway
%define MAP_LOCKED              0x02000
%define MAP_POPULATE            0x08000
%define MAP_STACK               0x20000

; ---------------------------------------------------------------------
; mprotect access permissions (see: man 2 mprotect)
; ---------------------------------------------------------------------
%define PROT_READ               0x1
%define PROT_WRITE              0x2

; ---------------------------------------------------------------------
; clone flags (see: man 2 clone)
; ---------------------------------------------------------------------
%define CLONE_VM                0x00000100
%define CLONE_FS                0x00000200
%define CLONE_FILES             0x00000400
%define CLONE_SIGHAND           0x00000800
%define CLONE_PARENT            0x00008000
%define CLONE_THREAD            0x00010000
%define CLONE_SYSVSEM           0x00040000
%define CLONE_SETTLS            0x00080000
%define CLONE_PARENT_SETTID     0x00100000
%define CLONE_CHILD_CLEARTID    0x00200000
%define CLONE_CHILD_SETTID      0x01000000
%define CLONE_IO                0x80000000

; ---------------------------------------------------------------------
; Geometry of the buffer allocated for each thread
; ---------------------------------------------------------------------
%define STACK_SIZE              0x200000        ; 2 MiB of usable stack
%define GUARD_PAGE              0x1000          ; 4 KiB guard page
;-----------------------------------------------------------------------
; int linen_thread_create(struct{u32,u32} **handle,
;                         void *(*fn)(void *), void *arg)
;
; Create a new thread that runs fn(arg) on a freshly mapped stack.
;
; ABI:   System V AMD64, Linux raw system calls
; In:    rdi = where to store the thread handle   (must not be NULL)
;        rsi = function the new thread will run   (must not be NULL)
;        rdx = single argument handed to that function (may be NULL;
;              it is passed through untouched)
; Out:   eax = 0 on success, otherwise a negative errno value:
;              -EINVAL if rdi or rsi is NULL, else the error reported
;              by mmap, mprotect or clone.
;        [rdi] is written only on success.
; Clobb: rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
;
; Layout of the buffer mapped for the child (addresses grow upward):
;
;   base                        guard page, no access (GUARD_PAGE bytes)
;   base + GUARD_PAGE           lowest usable stack byte
;     ...                       child's stack, grows downward
;   handle - 8    qword         child's initial rsp; fn's return value
;                               is stored here once fn has returned
;   handle        dword         TID word: set to the child's TID by the
;                               kernel, cleared to 0 when the child
;                               exits (CLONE_CHILD_CLEARTID)
;   handle + 4    dword         canary marking a valid handle
;   base + STACK_SIZE + GUARD_PAGE   (top of mapping)
;
; The child's initial rsp (handle - 8) is 16-byte aligned because the
; mapping is page-aligned, so the ABI's alignment rule holds at the
; `call` into fn.
;-----------------------------------------------------------------------
%ifndef SYS_MUNMAP
%define SYS_MUNMAP 11
%endif
%ifndef EINVAL
%define EINVAL 22
%endif
%define THREAD_MAP_SIZE (STACK_SIZE + GUARD_PAGE)
%define THREAD_CANARY   0xDEADBEEF

global linen_thread_create
linen_thread_create:
        ; Callee-saved registers used to carry the arguments across
        ; the system calls (and into the child, which inherits them).
        push    rbx                     ; rbx = handle destination
        push    r12                     ; r12 = fn
        push    r13                     ; r13 = arg

        ;;;; Validate arguments ;;;;
        mov     eax, -EINVAL            ; preset the error return value
        test    rdi, rdi
        jz      .return                 ; nowhere to store the handle
        test    rsi, rsi
        jz      .return                 ; nothing for the thread to run
        ; arg (rdx) is allowed to be NULL; it is only passed through.

        mov     rbx, rdi
        mov     r12, rsi
        mov     r13, rdx

        ;;;; Allocate stack + guard page ;;;;
        ; rax = mmap(NULL, THREAD_MAP_SIZE, PROT_READ|PROT_WRITE,
        ;            MAP_ANONYMOUS|MAP_PRIVATE|MAP_STACK, -1, 0)
        xor     edi, edi                ; addr: let the kernel choose
        mov     esi, THREAD_MAP_SIZE    ; length
        mov     edx, (PROT_READ | PROT_WRITE)
        mov     r10d, (MAP_ANONYMOUS | MAP_PRIVATE | MAP_STACK)
        mov     r8, -1                  ; fd: unused with MAP_ANONYMOUS
        xor     r9d, r9d                ; offset: 0 with MAP_ANONYMOUS
        mov     eax, SYS_MMAP
        syscall
        cmp     rax, -4095              ; [-4095, -1] means -errno
        jae     .return                 ; nothing mapped, nothing to free

        ; r8 = base of the mapping. Syscalls only clobber rax, rcx and
        ; r11, so r8 stays valid for the cleanup path below.
        mov     r8, rax

        ;;;; Revoke all access to the guard page ;;;;
        ; Stacks grow downward, so the guard page is the lowest page.
        ; rax = mprotect(base, GUARD_PAGE, PROT_NONE)
        mov     rdi, rax
        mov     esi, GUARD_PAGE
        xor     edx, edx
        mov     eax, SYS_MPROTECT
        syscall
        test    rax, rax
        jnz     .release_stack          ; do not leak the mapping

        ;;;; Prepare the handle words before the child can run ;;;;
        ; r10 = handle = address of the TID word (top qword of mapping)
        lea     r10, [r8 + THREAD_MAP_SIZE - 8]
        ; Writing the canary now avoids racing with a running child.
        mov     dword [r10 + 4], THREAD_CANARY

        ;;;; Spawn the thread ;;;;
        ; clone returns twice: the child's TID in the parent and 0 in
        ; the child. Resource-sharing flags:
        ;  - CLONE_VM, CLONE_FS, CLONE_FILES, CLONE_IO, CLONE_SIGHAND,
        ;    CLONE_PARENT, CLONE_THREAD
        ; TID bookkeeping flags, all pointing at the TID word:
        ;  - CLONE_PARENT_SETTID:  TID is stored before clone returns
        ;    in the parent, so the word is never observed as 0 while
        ;    the thread has merely not started yet
        ;  - CLONE_CHILD_SETTID:   TID is stored on the child's side
        ;  - CLONE_CHILD_CLEARTID: word is zeroed when the child exits
        ; rax = clone(flags, stack, parent_tid, child_tid, (tls))
        mov     edi, (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO \
                    | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD \
                    | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID \
                    | CLONE_CHILD_CLEARTID)
        lea     rsi, [r10 - 8]          ; child rsp, 16-byte aligned
        mov     rdx, r10                ; parent_tid = TID word
                                        ; r10 = child_tid = TID word
                                        ; r8 (tls) ignored: no CLONE_SETTLS
        mov     eax, SYS_CLONE
        syscall
        test    rax, rax
        js      .release_stack          ; negative: clone failed
        jnz     .parent                 ; positive: parent, rax = TID

        ;;;; Child thread (rax == 0) ;;;;
        xor     ebp, ebp                ; terminate frame-pointer chain
        mov     rdi, r13                ; arg
        call    r12                     ; rax = fn(arg)
        ; rsp is back at handle - 8: leave the return value there.
        mov     [rsp], rax
        ; exit(0) ends only this thread; it never returns.
        xor     edi, edi
        mov     eax, SYS_EXIT
        syscall
        ud2                             ; unreachable; never fall through

        ;;;; Parent thread, success ;;;;
.parent:
        mov     [rbx], r10              ; publish handle to the caller
        xor     eax, eax                ; return 0

.return:
        pop     r13
        pop     r12
        pop     rbx
        ret

        ;;;; Failure after mmap: unmap, then report the error ;;;;
.release_stack:
        mov     r9, rax                 ; keep -errno across munmap
        mov     rdi, r8                 ; base of the mapping
        mov     esi, THREAD_MAP_SIZE
        mov     eax, SYS_MUNMAP
        syscall                         ; best effort; result ignored
        mov     rax, r9
        jmp     .return
|