Text file
src/math/big/arith_amd64.s
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build !math_big_pure_go
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW returns the full double-word product of x and y:
// z1 receives the high word and z0 the low word.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ y+8(FP), AX	// AX = y; the product is symmetric, so either factor may go in AX
	MULQ x+0(FP)		// DX:AX = AX * x (unsigned)
	MOVQ AX, z0+24(FP)	// z0 = low word
	MOVQ DX, z1+16(FP)	// z1 = high word
	RET
19
20
// func divWW(x1, x0, y Word) (q, r Word)
//
// divWW divides the double-word value x1:x0 by y and returns the
// quotient in q and the remainder in r.
// NOTE(review): the hardware divide faults when y is 0 or when the
// quotient does not fit in one word, so callers presumably guarantee
// x1 < y — confirm against the Go-side contract.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x0+8(FP), AX	// low word of the dividend
	MOVQ x1+0(FP), DX	// high word of the dividend
	MOVQ y+16(FP), CX	// divisor
	DIVQ CX			// AX = DX:AX / CX, DX = DX:AX % CX
	MOVQ DX, r+32(FP)	// r = remainder
	MOVQ AX, q+24(FP)	// q = quotient
	RET
29
30 // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
31 // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
32 // This is faster than using rotate instructions.
33
34 // func addVV(z, x, y []Word) (c Word): computes z[i] = x[i] + y[i] + carry for i = 0 .. len(z)-1 and returns the final carry in c (0 or 1). NOTE(review): only len(z) is read; x and y are presumably at least that long - confirm against callers.
35 TEXT ·addVV(SB),NOSPLIT,$0
36 MOVQ z_len+8(FP), DI // n = len(z)
37 MOVQ x+24(FP), R8 // R8 = base address of x
38 MOVQ y+48(FP), R9 // R9 = base address of y
39 MOVQ z+0(FP), R10 // R10 = base address of z
40
41 MOVQ $0, CX // c = 0 (between iterations CX holds the saved carry as 0 or -1)
42 MOVQ $0, SI // i = 0
43
44 // s/JL/JMP/ below to disable the unrolled loop
45 SUBQ $4, DI // n -= 4
46 JL V1 // if n < 0 (fewer than 4 words left) goto V1
47
48 U1: // n >= 0
49 // regular loop body unrolled 4x
50 ADDQ CX, CX // restore CF: -1 + -1 carries out, 0 + 0 does not
51 MOVQ 0(R8)(SI*8), R11 // load x[i] .. x[i+3]; MOVQ leaves CF untouched
52 MOVQ 8(R8)(SI*8), R12
53 MOVQ 16(R8)(SI*8), R13
54 MOVQ 24(R8)(SI*8), R14
55 ADCQ 0(R9)(SI*8), R11 // x[i] + y[i] + CF; the carry ripples through the next three ADCQs
56 ADCQ 8(R9)(SI*8), R12
57 ADCQ 16(R9)(SI*8), R13
58 ADCQ 24(R9)(SI*8), R14
59 MOVQ R11, 0(R10)(SI*8) // store z[i] .. z[i+3]
60 MOVQ R12, 8(R10)(SI*8)
61 MOVQ R13, 16(R10)(SI*8)
62 MOVQ R14, 24(R10)(SI*8)
63 SBBQ CX, CX // save CF: CX = -CF, because the ADDQ/SUBQ below overwrite the flags
64
65 ADDQ $4, SI // i += 4
66 SUBQ $4, DI // n -= 4
67 JGE U1 // if n >= 0 goto U1
68
69 V1: ADDQ $4, DI // n += 4 (undo the bias; n = words still to process)
70 JLE E1 // if n <= 0 goto E1
71
72 L1: // n > 0
73 ADDQ CX, CX // restore CF
74 MOVQ 0(R8)(SI*8), R11 // R11 = x[i]
75 ADCQ 0(R9)(SI*8), R11 // R11 += y[i] + CF
76 MOVQ R11, 0(R10)(SI*8) // z[i] = R11
77 SBBQ CX, CX // save CF
78
79 ADDQ $1, SI // i++
80 SUBQ $1, DI // n--
81 JG L1 // if n > 0 goto L1
82
83 E1: NEGQ CX // CX is 0 or -1 here; negate to get c = 0 or 1
84 MOVQ CX, c+72(FP) // return c
85 RET
86
87
88 // func subVV(z, x, y []Word) (c Word): computes z[i] = x[i] - y[i] - borrow for i = 0 .. len(z)-1 and returns the final borrow in c (0 or 1). NOTE(review): only len(z) is read; x and y are presumably at least that long - confirm against callers.
89 // (same as addVV except for SBBQ instead of ADCQ and label names)
90 TEXT ·subVV(SB),NOSPLIT,$0
91 MOVQ z_len+8(FP), DI // n = len(z)
92 MOVQ x+24(FP), R8 // R8 = base address of x
93 MOVQ y+48(FP), R9 // R9 = base address of y
94 MOVQ z+0(FP), R10 // R10 = base address of z
95
96 MOVQ $0, CX // c = 0 (between iterations CX holds the saved borrow as 0 or -1)
97 MOVQ $0, SI // i = 0
98
99 // s/JL/JMP/ below to disable the unrolled loop
100 SUBQ $4, DI // n -= 4
101 JL V2 // if n < 0 (fewer than 4 words left) goto V2
102
103 U2: // n >= 0
104 // regular loop body unrolled 4x
105 ADDQ CX, CX // restore CF: -1 + -1 carries out, 0 + 0 does not
106 MOVQ 0(R8)(SI*8), R11 // load x[i] .. x[i+3]; MOVQ leaves CF untouched
107 MOVQ 8(R8)(SI*8), R12
108 MOVQ 16(R8)(SI*8), R13
109 MOVQ 24(R8)(SI*8), R14
110 SBBQ 0(R9)(SI*8), R11 // x[i] - y[i] - CF; the borrow ripples through the next three SBBQs
111 SBBQ 8(R9)(SI*8), R12
112 SBBQ 16(R9)(SI*8), R13
113 SBBQ 24(R9)(SI*8), R14
114 MOVQ R11, 0(R10)(SI*8) // store z[i] .. z[i+3]
115 MOVQ R12, 8(R10)(SI*8)
116 MOVQ R13, 16(R10)(SI*8)
117 MOVQ R14, 24(R10)(SI*8)
118 SBBQ CX, CX // save CF: CX = -CF, because the ADDQ/SUBQ below overwrite the flags
119
120 ADDQ $4, SI // i += 4
121 SUBQ $4, DI // n -= 4
122 JGE U2 // if n >= 0 goto U2
123
124 V2: ADDQ $4, DI // n += 4 (undo the bias; n = words still to process)
125 JLE E2 // if n <= 0 goto E2
126
127 L2: // n > 0
128 ADDQ CX, CX // restore CF
129 MOVQ 0(R8)(SI*8), R11 // R11 = x[i]
130 SBBQ 0(R9)(SI*8), R11 // R11 -= y[i] + CF
131 MOVQ R11, 0(R10)(SI*8) // z[i] = R11
132 SBBQ CX, CX // save CF
133
134 ADDQ $1, SI // i++
135 SUBQ $1, DI // n--
136 JG L2 // if n > 0 goto L2
137
138 E2: NEGQ CX // CX is 0 or -1 here; negate to get c = 0 or 1
139 MOVQ CX, c+72(FP) // return c
140 RET
141
142
143 // func addVW(z, x []Word, y Word) (c Word): adds the single word y to the vector x, writing len(z) result words to z and returning the final carry in c. When len(z) <= 0 nothing is written and c = y.
144 TEXT ·addVW(SB),NOSPLIT,$0
145 MOVQ z_len+8(FP), DI // n = len(z)
146 MOVQ x+24(FP), R8 // R8 = base address of x
147 MOVQ y+48(FP), CX // c = y
148 MOVQ z+0(FP), R10 // R10 = base address of z
149
150 MOVQ $0, SI // i = 0
151
152 // s/JL/JMP/ below to disable the unrolled loop
153 SUBQ $4, DI // n -= 4
154 JL V3 // if n < 0 (i.e. fewer than 4 words left) goto V3
155
156 U3: // n >= 0
157 // regular loop body unrolled 4x
158 MOVQ 0(R8)(SI*8), R11 // load x[i] .. x[i+3]
159 MOVQ 8(R8)(SI*8), R12
160 MOVQ 16(R8)(SI*8), R13
161 MOVQ 24(R8)(SI*8), R14
162 ADDQ CX, R11 // x[i] + c; sets CF
163 ADCQ $0, R12 // propagate the carry through x[i+1] .. x[i+3]
164 ADCQ $0, R13
165 ADCQ $0, R14
166 SBBQ CX, CX // save CF: CX = -CF
167 NEGQ CX // c = carry out of the group, 0 or 1
168 MOVQ R11, 0(R10)(SI*8) // store z[i] .. z[i+3]
169 MOVQ R12, 8(R10)(SI*8)
170 MOVQ R13, 16(R10)(SI*8)
171 MOVQ R14, 24(R10)(SI*8)
172
173 ADDQ $4, SI // i += 4
174 SUBQ $4, DI // n -= 4
175 JGE U3 // if n >= 0 goto U3
176
177 V3: ADDQ $4, DI // n += 4 (undo the bias; n = words still to process)
178 JLE E3 // if n <= 0 goto E3
179
180 L3: // n > 0
181 ADDQ 0(R8)(SI*8), CX // CX = x[i] + c; sets CF
182 MOVQ CX, 0(R10)(SI*8) // z[i] = x[i] + c (MOVQ leaves CF untouched)
183 SBBQ CX, CX // save CF
184 NEGQ CX // c = carry out, 0 or 1
185
186 ADDQ $1, SI // i++
187 SUBQ $1, DI // n--
188 JG L3 // if n > 0 goto L3
189
190 E3: MOVQ CX, c+56(FP) // return c
191 RET
192
193
194 // func subVW(z, x []Word, y Word) (c Word): subtracts the single word y from the vector x, writing len(z) result words to z and returning the final borrow in c. When len(z) <= 0 nothing is written and c = y.
195 // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names; the single-word loop also loads x[i] into R11 first because the subtraction is x[i] - c)
196 TEXT ·subVW(SB),NOSPLIT,$0
197 MOVQ z_len+8(FP), DI // n = len(z)
198 MOVQ x+24(FP), R8 // R8 = base address of x
199 MOVQ y+48(FP), CX // c = y
200 MOVQ z+0(FP), R10 // R10 = base address of z
201
202 MOVQ $0, SI // i = 0
203
204 // s/JL/JMP/ below to disable the unrolled loop
205 SUBQ $4, DI // n -= 4
206 JL V4 // if n < 0 (i.e. fewer than 4 words left) goto V4
207
208 U4: // n >= 0
209 // regular loop body unrolled 4x
210 MOVQ 0(R8)(SI*8), R11 // load x[i] .. x[i+3]
211 MOVQ 8(R8)(SI*8), R12
212 MOVQ 16(R8)(SI*8), R13
213 MOVQ 24(R8)(SI*8), R14
214 SUBQ CX, R11 // x[i] - c; sets CF on borrow
215 SBBQ $0, R12 // propagate the borrow through x[i+1] .. x[i+3]
216 SBBQ $0, R13
217 SBBQ $0, R14
218 SBBQ CX, CX // save CF: CX = -CF
219 NEGQ CX // c = borrow out of the group, 0 or 1
220 MOVQ R11, 0(R10)(SI*8) // store z[i] .. z[i+3]
221 MOVQ R12, 8(R10)(SI*8)
222 MOVQ R13, 16(R10)(SI*8)
223 MOVQ R14, 24(R10)(SI*8)
224
225 ADDQ $4, SI // i += 4
226 SUBQ $4, DI // n -= 4
227 JGE U4 // if n >= 0 goto U4
228
229 V4: ADDQ $4, DI // n += 4 (undo the bias; n = words still to process)
230 JLE E4 // if n <= 0 goto E4
231
232 L4: // n > 0
233 MOVQ 0(R8)(SI*8), R11 // R11 = x[i]
234 SUBQ CX, R11 // R11 = x[i] - c; sets CF on borrow
235 MOVQ R11, 0(R10)(SI*8) // z[i] = R11 (MOVQ leaves CF untouched)
236 SBBQ CX, CX // save CF
237 NEGQ CX // c = borrow out, 0 or 1
238
239 ADDQ $1, SI // i++
240 SUBQ $1, DI // n--
241 JG L4 // if n > 0 goto L4
242
243 E4: MOVQ CX, c+56(FP) // return c
244 RET
245
246
247 // func shlVU(z, x []Word, s uint) (c Word): z = x << s over len(z) words, processed from the most significant word down; c receives the bits shifted out of x[len(z)-1]. Returns c = 0 without touching z when len(z) <= 0. In the comments below ŝ stands for 64-s. NOTE(review): the hardware uses only the low 6 bits of the count, so callers presumably pass s < 64 - confirm.
248 TEXT ·shlVU(SB),NOSPLIT,$0
249 MOVQ z_len+8(FP), BX // i = len(z)
250 SUBQ $1, BX // i--
251 JL X8b // i < 0 (n <= 0)
252
253 // n > 0
254 MOVQ z+0(FP), R10 // R10 = base address of z
255 MOVQ x+24(FP), R8 // R8 = base address of x
256 MOVQ s+48(FP), CX // shift count; the variable shifts below take it from CX
257 MOVQ (R8)(BX*8), AX // w1 = x[n-1]
258 MOVQ $0, DX // DX = 0 so the double shift yields only the bits leaving w1
259 SHLQ CX, DX:AX // w1>>ŝ (double-word shift: DX = DX<<s | AX>>ŝ; AX is left unchanged)
260 MOVQ DX, c+56(FP) // c = w1>>ŝ
261
262 CMPQ BX, $0
263 JLE X8a // i <= 0
264
265 // i > 0
266 L8: MOVQ AX, DX // w = w1
267 MOVQ -8(R8)(BX*8), AX // w1 = x[i-1]
268 SHLQ CX, DX:AX // w<<s | w1>>ŝ
269 MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ
270 SUBQ $1, BX // i--
271 JG L8 // i > 0
272
273 // i <= 0
274 X8a: SHLQ CX, AX // w1<<s
275 MOVQ AX, (R10) // z[0] = w1<<s
276 RET
277
278 X8b: MOVQ $0, c+56(FP) // empty vector: c = 0
279 RET
280
281
282 // func shrVU(z, x []Word, s uint) (c Word): z = x >> s over len(z) words, processed from the least significant word up; c receives the bits shifted out of x[0], left-aligned (x[0]<<ŝ). Returns c = 0 without touching z when len(z) <= 0. In the comments below ŝ stands for 64-s. NOTE(review): the hardware uses only the low 6 bits of the count, so callers presumably pass s < 64 - confirm.
283 TEXT ·shrVU(SB),NOSPLIT,$0
284 MOVQ z_len+8(FP), R11 // n = len(z)
285 SUBQ $1, R11 // n-- (R11 holds len(z)-1 from here on)
286 JL X9b // n < 0 (n <= 0)
287
288 // n > 0
289 MOVQ z+0(FP), R10 // R10 = base address of z
290 MOVQ x+24(FP), R8 // R8 = base address of x
291 MOVQ s+48(FP), CX // shift count; the variable shifts below take it from CX
292 MOVQ (R8), AX // w1 = x[0]
293 MOVQ $0, DX // DX = 0 so the double shift yields only the bits leaving w1
294 SHRQ CX, DX:AX // w1<<ŝ (double-word shift: DX = DX>>s | AX<<ŝ; AX is left unchanged)
295 MOVQ DX, c+56(FP) // c = w1<<ŝ
296
297 MOVQ $0, BX // i = 0
298 JMP E9 // enter the loop at its test
299
300 // i < n-1
301 L9: MOVQ AX, DX // w = w1
302 MOVQ 8(R8)(BX*8), AX // w1 = x[i+1]
303 SHRQ CX, DX:AX // w>>s | w1<<ŝ
304 MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
305 ADDQ $1, BX // i++
306
307 E9: CMPQ BX, R11 // compare i with len(z)-1
308 JL L9 // i < n-1
309
310 // i >= n-1
311 X9a: SHRQ CX, AX // w1>>s
312 MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
313 RET
314
315 X9b: MOVQ $0, c+56(FP) // empty vector: c = 0
316 RET
317
318
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r over len(z) words: for each word it
// forms the double-word product x[i]*y, adds the running carry (which
// starts as r), stores the low word in z[i] and keeps the high word as
// the next carry. The last carry is returned in c.
//
// Register use: SI/DI walk x and z, R8 = y, R9 = carry,
// CX = words remaining (biased by -4 while in the unrolled loop).
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), CX	// remaining = len(z)
	MOVQ z+0(FP), DI	// zp = base of z
	MOVQ x+24(FP), SI	// xp = base of x
	MOVQ y+48(FP), R8	// multiplier
	MOVQ r+56(FP), R9	// carry = r

	SUBQ $4, CX		// bias the count so its sign tells whether 4 words remain
	JL   tail5		// fewer than 4 words: skip the unrolled loop

quad5:
	// four words per iteration
	MOVQ 0(SI), AX
	MULQ R8			// DX:AX = x[i] * y
	ADDQ R9, AX		// low word += carry
	ADCQ $0, DX		// high word absorbs the overflow
	MOVQ AX, 0(DI)		// z[i] = low word
	MOVQ DX, R9		// carry = high word

	MOVQ 8(SI), AX
	MULQ R8
	ADDQ R9, AX
	ADCQ $0, DX
	MOVQ AX, 8(DI)
	MOVQ DX, R9

	MOVQ 16(SI), AX
	MULQ R8
	ADDQ R9, AX
	ADCQ $0, DX
	MOVQ AX, 16(DI)
	MOVQ DX, R9

	MOVQ 24(SI), AX
	MULQ R8
	ADDQ R9, AX
	ADCQ $0, DX
	MOVQ AX, 24(DI)
	MOVQ DX, R9

	ADDQ $32, SI		// xp += 4 words
	ADDQ $32, DI		// zp += 4 words
	SUBQ $4, CX		// remaining -= 4
	JGE  quad5		// another full group of 4 is available

tail5:
	ADDQ $4, CX		// undo the bias: remaining is now 0..3 (or len(z) if < 4)
	JLE  done5

word5:
	// one word per iteration
	MOVQ (SI), AX
	MULQ R8			// DX:AX = x[i] * y
	ADDQ R9, AX
	ADCQ $0, DX
	MOVQ AX, (DI)		// z[i] = low word
	MOVQ DX, R9		// carry = high word
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $1, CX
	JG   word5

done5:
	MOVQ R9, c+64(FP)	// return the final carry
	RET
377
378
379 // func addMulVVW(z, x []Word, y Word) (c Word): computes z[i] += x[i]*y with carry propagation for i = 0 .. len(z)-1 and returns the final carry word in c. Takes the MULX/ADCX/ADOX path at label adx when the byte ·support_adx is 1, otherwise the MULQ path.
380 TEXT ·addMulVVW(SB),NOSPLIT,$0
381 CMPB ·support_adx(SB), $1 // feature flag defined outside this function
382 JEQ adx // flag is 1: use the MULX/ADCX/ADOX implementation
383 MOVQ z+0(FP), R10 // R10 = base address of z
384 MOVQ x+24(FP), R8 // R8 = base address of x
385 MOVQ y+48(FP), R9 // R9 = multiplier y
386 MOVQ z_len+8(FP), R11 // n = len(z)
387 MOVQ $0, BX // i = 0
388 MOVQ $0, CX // c = 0
389 MOVQ R11, R12
390 ANDQ $-2, R12 // R12 = n rounded down to an even count (limit of the 2x loop)
391 CMPQ R11, $2
392 JAE A6 // n >= 2 (unsigned compare): run the 2x unrolled loop
393 JMP E6
394
395 A6: // two words per iteration
396 MOVQ (R8)(BX*8), AX
397 MULQ R9 // DX:AX = x[i] * y
398 ADDQ (R10)(BX*8), AX // low word += z[i]
399 ADCQ $0, DX
400 ADDQ CX, AX // low word += c
401 ADCQ $0, DX
402 MOVQ DX, CX // c = high word
403 MOVQ AX, (R10)(BX*8) // z[i] = low word
404
405 MOVQ (8)(R8)(BX*8), AX
406 MULQ R9 // DX:AX = x[i+1] * y
407 ADDQ (8)(R10)(BX*8), AX // low word += z[i+1]
408 ADCQ $0, DX
409 ADDQ CX, AX // low word += c
410 ADCQ $0, DX
411 MOVQ DX, CX // c = high word
412 MOVQ AX, (8)(R10)(BX*8) // z[i+1] = low word
413
414 ADDQ $2, BX // i += 2
415 CMPQ BX, R12
416 JL A6 // i < even limit
417 JMP E6
418
419 L6: MOVQ (R8)(BX*8), AX
420 MULQ R9 // DX:AX = x[i] * y
421 ADDQ CX, AX // low word += c
422 ADCQ $0, DX
423 ADDQ AX, (R10)(BX*8) // z[i] += low word, added directly in memory
424 ADCQ $0, DX
425 MOVQ DX, CX // c = high word
426 ADDQ $1, BX // i++
427
428 E6: CMPQ BX, R11 // i < n
429 JL L6
430
431 MOVQ CX, c+56(FP) // return c
432 RET
433
434 adx:
435 MOVQ z_len+8(FP), R11 // n = len(z)
436 MOVQ z+0(FP), R10 // R10 = base address of z
437 MOVQ x+24(FP), R8 // R8 = base address of x
438 MOVQ y+48(FP), DX // y; MULXQ takes its implicit multiplicand from DX
439 MOVQ $0, BX // i = 0
440 MOVQ $0, CX // carry
441 CMPQ R11, $8
442 JAE adx_loop_header // n >= 8 (unsigned compare): run the 8x unrolled loop
443 CMPQ BX, R11
444 JL adx_short // 0 < n < 8: one word at a time
445 MOVQ CX, c+56(FP) // n <= 0: return c = 0
446 RET
447
448 adx_loop_header:
449 MOVQ R11, R13
450 ANDQ $-8, R13 // R13 = n rounded down to a multiple of 8 (limit of the 8x loop)
451 adx_loop:
452 XORQ R9, R9 // unset flags (clears both CF and OF) and leaves R9 = 0
453 MULXQ (R8), SI, DI // DI:SI = x[i] * y; MULX does not modify the flags
454 ADCXQ CX,SI // low word += carry word, chained through CF
455 ADOXQ (R10), SI // low word += z[i], chained through OF
456 MOVQ SI,(R10) // z[i] = low word
457
458 MULXQ 8(R8), AX, CX // CX:AX = x[i+1] * y
459 ADCXQ DI, AX // low word += previous high word + CF
460 ADOXQ 8(R10), AX // low word += z[i+1] + OF
461 MOVQ AX, 8(R10)
462
463 MULXQ 16(R8), SI, DI // high/low registers alternate between DI:SI and CX:AX
464 ADCXQ CX, SI
465 ADOXQ 16(R10), SI
466 MOVQ SI, 16(R10)
467
468 MULXQ 24(R8), AX, CX
469 ADCXQ DI, AX
470 ADOXQ 24(R10), AX
471 MOVQ AX, 24(R10)
472
473 MULXQ 32(R8), SI, DI
474 ADCXQ CX, SI
475 ADOXQ 32(R10), SI
476 MOVQ SI, 32(R10)
477
478 MULXQ 40(R8), AX, CX
479 ADCXQ DI, AX
480 ADOXQ 40(R10), AX
481 MOVQ AX, 40(R10)
482
483 MULXQ 48(R8), SI, DI
484 ADCXQ CX, SI
485 ADOXQ 48(R10), SI
486 MOVQ SI, 48(R10)
487
488 MULXQ 56(R8), AX, CX // last product of the group; its high word ends up in CX
489 ADCXQ DI, AX
490 ADOXQ 56(R10), AX
491 MOVQ AX, 56(R10)
492
493 ADCXQ R9, CX // fold the pending CF into the carry word (R9 is 0)
494 ADOXQ R9, CX // fold the pending OF into the carry word
495
496 ADDQ $64, R8 // advance the x pointer by 8 words
497 ADDQ $64, R10 // advance the z pointer by 8 words
498 ADDQ $8, BX // i += 8
499
500 CMPQ BX, R13
501 JL adx_loop // i < multiple-of-8 limit
502 MOVQ z+0(FP), R10 // reload the slice bases: adx_short indexes from the start with BX
503 MOVQ x+24(FP), R8
504 CMPQ BX, R11
505 JL adx_short // leftover words (n not a multiple of 8)
506 MOVQ CX, c+56(FP) // return c
507 RET
508
509 adx_short:
510 MULXQ (R8)(BX*8), SI, DI // DI:SI = x[i] * y
511 ADDQ CX, SI // low word += c
512 ADCQ $0, DI
513 ADDQ SI, (R10)(BX*8) // z[i] += low word, added directly in memory
514 ADCQ $0, DI
515 MOVQ DI, CX // c = high word
516 ADDQ $1, BX // i++
517
518 CMPQ BX, R11
519 JL adx_short // i < n
520
521 MOVQ CX, c+56(FP) // return c
522 RET
523
524
525
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// divWVW divides the multi-word value x, extended on top by the word
// xn, by the single word y. Words are visited from index len(z)-1 down
// to 0: each step divides r:x[i] by y, stores the quotient in z[i] and
// keeps the remainder in r for the next step. r starts as xn and its
// final value is returned.
// NOTE(review): the hardware divide faults unless r < y at every step,
// so callers presumably guarantee xn < y and y != 0 — confirm.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), CX	// i = len(z)
	MOVQ z+0(FP), DI	// DI = base of z
	MOVQ x+32(FP), SI	// SI = base of x
	MOVQ y+56(FP), R8	// divisor
	MOVQ xn+24(FP), DX	// r = xn
	SUBQ $1, CX		// i--
	JL   divdone		// empty z: nothing to divide

divloop:
	MOVQ (SI)(CX*8), AX	// dividend DX:AX = r:x[i]
	DIVQ R8			// AX = quotient, DX = remainder
	MOVQ AX, (DI)(CX*8)	// z[i] = quotient
	SUBQ $1, CX		// i--
	JGE  divloop		// continue while i >= 0

divdone:
	MOVQ DX, r+64(FP)	// return r
	RET
View as plain text