@@ -8,8 +8,9 @@ L_dyn_call_begin:
88 # At this point , the following registers are bound :
99 #
1010 # rdi < - callee
11- # rsi < - argv
12- # rdx < - argc
11+ # rsi < - process
12+ # rdx < - argv
13+ # rcx < - argc
1314 #
1415 # Save the parent base pointer for when control returns to this call frame.
1516 # CFA directives will inform the unwinder to expect rbp at the bottom of the
@@ -20,14 +21,16 @@ L_dyn_call_begin:
2021 mov rbp , rsp
2122 .cfi_def_cfa_register rbp
2223
23- # Save our callee and argv pointers , and argc
24+ # Pin callee pointer to r10
2425 mov r10 , rdi
25- mov r11 , rsi
26- mov rax , rdx
26+ # Pin the argv pointer to r11
27+ mov r11 , rdx
28+ # The process pointer needs to be in rdi
29+ mov rdi , rsi
2730
28- # Determine if spills are needed
31+ # Determine if spills are needed : rdi carries the process , so only rsi , rdx , rcx , r8 and r9 remain for arguments and the jump table only has entries for argc 0 .. 5 ; any argc above 5 ( unsigned compare ) must take the spill path
2932 # In the common case in which they are not , we perform a tail call
30- cmp rdx , 7
33+ cmp rcx , 5
3134 ja L_dyn_call_spill
3235
3336L_dyn_call_no_spill:
@@ -38,71 +41,72 @@ L_dyn_call_no_spill:
3841
3942 # Calculate offset in jump table to block which handles the specific
4043 # number of registers we have arguments for , then jump to that block
44+ mov rax , rcx
4145 lea rcx , [ rip + L_dyn_call_jt ]
42- mov rax , [ rcx + rax * 4 ]
46+ movsxd rax , dword ptr [ rcx + 4 * rax ]
4347 add rax , rcx
44- jmp [ rax ]
48+ jmp rax
4549
4650 # All of these basic blocks perform a tail call : L_dyn_call_regsN loads argv [ 0 .. N - 1 ] from r11 into rsi , rdx , rcx , r8 , r9 ( in that order ; rdi was already given the process pointer ) ,
4751 # pops the saved rbp and jumps to the callee held in r10 . As such , the unwinder will skip over this frame should the callee
4852 # throw an exception
4953L_dyn_call_regs0:
5054 pop rbp
51- jmp [ r10 ]
55+ jmp r10
5256
5357L_dyn_call_regs1:
54- mov rdi , [ r11 ]
58+ mov rsi , [ r11 ]
5559 pop rbp
56- jmp [ r10 ]
60+ jmp r10
5761
5862L_dyn_call_regs2:
59- mov rdi , [ r11 ]
60- mov rsi , [ r11 + 8 ]
63+ mov rsi , [ r11 ]
64+ mov rdx , [ r11 + 8 ]
6165 pop rbp
62- jmp [ r10 ]
66+ jmp r10
6367
6468L_dyn_call_regs3:
65- mov rdi , [ r11 ]
66- mov rsi , [ r11 + 8 ]
67- mov rdx , [ r11 + 16 ]
69+ mov rsi , [ r11 ]
70+ mov rdx , [ r11 + 8 ]
71+ mov rcx , [ r11 + 16 ]
6872 pop rbp
69- jmp [ r10 ]
73+ jmp r10
7074
7175L_dyn_call_regs4:
72- mov rdi , [ r11 ]
73- mov rsi , [ r11 + 8 ]
74- mov rdx , [ r11 + 16 ]
75- mov rcx , [ r11 + 24 ]
76+ mov rsi , [ r11 ]
77+ mov rdx , [ r11 + 8 ]
78+ mov rcx , [ r11 + 16 ]
79+ mov r8 , [ r11 + 24 ]
7680 pop rbp
77- jmp [ r10 ]
81+ jmp r10
7882
7983L_dyn_call_regs5:
80- mov rdi , [ r11 ]
81- mov rsi , [ r11 + 8 ]
82- mov rdx , [ r11 + 16 ]
83- mov rcx , [ r11 + 24 ]
84- mov r8 , [ r11 + 32 ]
84+ mov rsi , [ r11 ]
85+ mov rdx , [ r11 + 8 ]
86+ mov rcx , [ r11 + 16 ]
87+ mov r8 , [ r11 + 24 ]
88+ mov r9 , [ r11 + 32 ]
8589 pop rbp
86- jmp [ r10 ]
87-
88- L_dyn_call_regs6:
89- mov rdi , [ r11 ]
90- mov rsi , [ r11 + 8 ]
91- mov rdx , [ r11 + 16 ]
92- mov rcx , [ r11 + 24 ]
93- mov r8 , [ r11 + 32 ]
94- mov r9 , [ r11 + 40 ]
95- pop rbp
96- jmp [ r10 ]
90+ jmp r10
9791
9892L_dyn_call_spill:
9993 # If we hit this block , we have identified that there are
10094 # arguments to spill. We perform some setup for the actual
10195 # spilling , which is a loop built on ` rep movsq `
96+ #
97+ # At this point , the following registers are occupied/hold these values:
98+ #
99+ # r10 < - callee
100+ # rdi < - process
101+ # r11 < - argv
102+ # rcx < - argc
103+
104+ # rcx and rdi are overwritten for ` rep movsq ` ( count and destination ) , so keep argc in r8 and the process pointer in r9 ; rsi ( the source ) is simply recomputed from r11
105+ mov r8 , rcx
106+ mov r9 , rdi
102107
103- # Calculate spill count for later ( rep uses rcx for the iteration count ,
108+ # Calculate spill count for later ( rep uses rcx for the iteration count `i` ,
104109 # which in this case is the number of quadwords to copy) . Only argv [ 0 .. 4 ] are passed in registers because rdi carries the process , so argv [ 5 .. argc - 1 ] , i.e. argc - 5 quadwords , must go on the stack
105- mov rcx , rdx
106110 sub rcx , 5
107111
108112 # Calculate spill space , and ensure it is rounded up to the nearest 16 bytes.
@@ -113,21 +117,22 @@ L_dyn_call_spill:
113117 sub rsp , rax
114118
115119 # load source pointer (last item of argv) : rsi = address of argv [ argc - 1 ] , with argv in r11 and the saved argc as the index
116- lea rsi , [ r11 + rdx * 8 - 8 ]
120+ lea rsi , [ r11 + r8 * 8 - 8 ]
117121 # load destination pointer (top of spill region) : rdi = rsp + ( rcx - 1 ) * 8 , the last of the rcx quadwords that start at rsp
118- lea rdi , [ rsp + rcx * 8 - 8 ]
119- # copy rcx quadwords from rsi to rdi , in reverse
122+ lea rdi , [ rsp + rcx * 8 - 8 ]
123+ # copy `i` quadwords from source to destination , in reverse : std sets the direction flag so rep movsq steps rsi and rdi downwards , and cld clears it again before the call
120124 std
121125 rep movsq
122126 cld
123127
124- # We've spilled arguments , so we have at least 6 args
125- mov rdi , [ r11 ]
126- mov rsi , [ r11 + 8 ]
127- mov rdx , [ r11 + 16 ]
128- mov rcx , [ r11 + 24 ]
129- mov r8 , [ r11 + 32 ]
130- mov r9 , [ r11 + 40 ]
128+ # We've spilled arguments , so we have at least 6 args , move them into their
129+ # final destination registers in preparation for the call : rdi gets the process pointer back from r9 first ( r9 is overwritten below ) , then argv [ 0 .. 4 ] are loaded into rsi , rdx , rcx , r8 , r9
130+ mov rdi , r9
131+ mov rsi , [ r11 ]
132+ mov rdx , [ r11 + 8 ]
133+ mov rcx , [ r11 + 16 ]
134+ mov r8 , [ r11 + 24 ]
135+ mov r9 , [ r11 + 32 ]
131136
132137L_dyn_call_exec:
133138 # If we spill arguments to the stack , we can't perform
@@ -141,7 +146,7 @@ L_dyn_call_exec:
141146 # This instruction will push the return address and jump ,
142147 # and we can expect rbp to be the same as we left it upon
143148 # return.
144- call [ r10 ]
149+ call r10
145150
146151L_dyn_call_ret:
147152 # Non - tail call completed successfully
@@ -156,21 +161,19 @@ L_dyn_call_end:
156161 # a variable number of register - based arguments
157162 .p2align 2
158163 .data_region jt32
159- .set L_dyn_call_jt_entry0 , L_dyn_call_exec - L_dyn_call_jt
164+ .set L_dyn_call_jt_entry0 , L_dyn_call_regs0 - L_dyn_call_jt
160165 .set L_dyn_call_jt_entry1 , L_dyn_call_regs1 - L_dyn_call_jt
161166 .set L_dyn_call_jt_entry2 , L_dyn_call_regs2 - L_dyn_call_jt
162167 .set L_dyn_call_jt_entry3 , L_dyn_call_regs3 - L_dyn_call_jt
163168 .set L_dyn_call_jt_entry4 , L_dyn_call_regs4 - L_dyn_call_jt
164169 .set L_dyn_call_jt_entry5 , L_dyn_call_regs5 - L_dyn_call_jt
165- .set L_dyn_call_jt_entry6 , L_dyn_call_regs6 - L_dyn_call_jt
166170L_dyn_call_jt:
167171 .long L_dyn_call_jt_entry0
168172 .long L_dyn_call_jt_entry1
169173 .long L_dyn_call_jt_entry2
170174 .long L_dyn_call_jt_entry3
171175 .long L_dyn_call_jt_entry4
172176 .long L_dyn_call_jt_entry5
173- .long L_dyn_call_jt_entry6
174177 .end_data_region
175178
176179 # The following is the LSDA metadata for exception handling
0 commit comments