...

# Text file src/math/big/arith_ppc64x.s

## Documentation: math/big

```     1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
12// func mulWW(x, y Word) (z1, z0 Word)
13TEXT ·mulWW(SB), NOSPLIT, \$0
14	MOVD   y+8(FP), R9       // R9 = y
15	MOVD   x+0(FP), R8       // R8 = x
16	MULLD  R8, R9, R11       // R11 = low-order doubleword of x*y
17	MULHDU R8, R9, R10       // R10 = high-order doubleword of x*y (unsigned)
18	MOVD   R11, z0+24(FP)    // z0 = low half
19	MOVD   R10, z1+16(FP)    // z1 = high half
20	RET
21
22// func addVV(z, x, y []Word) (c Word)
23// z[i] = x[i] + y[i] for all i, carrying; c receives the final carry
24TEXT ·addVV(SB), NOSPLIT, \$0
25	MOVD  z_len+8(FP), R7   // R7 = z_len
26	MOVD  x+24(FP), R8      // R8 = x[]
27	MOVD  y+48(FP), R9      // R9 = y[]
28	MOVD  z+0(FP), R10      // R10 = z[]
29
30	// If z_len = 0, we are done
31	CMP   R0, R7
32	MOVD  R0, R4            // R4 = c = 0
33	BEQ   done
34
35	// Process the first iteration out of the loop so we can
36	// use MOVDU and avoid 3 index registers updates.
37	MOVD  0(R8), R11      // R11 = x[i]
38	MOVD  0(R9), R12      // R12 = y[i]
39	ADD   \$-1, R7         // R7 = z_len - 1
40	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
41	CMP   R0, R7
42	MOVD  R15, 0(R10)     // z[i]
43	BEQ   final           // If z_len was 1, we are done
44
45	SRD   \$2, R7, R5      // R5 = (z_len-1)/4 = number of unrolled iterations
46	CMP   R0, R5
47	MOVD  R5, CTR         // Set up loop counter
48	BEQ   tail            // If R5 = 0, we can't use the loop
49
50	// Process 4 elements per iteration. Unrolling this loop
51	// means a performance trade-off: we will lose performance
52	// for small values of z_len (0.90x in the worst case), but
53	// gain significant performance as z_len increases (up to
54	// 1.45x).
55loop:
56	MOVD  8(R8), R11      // R11 = x[i]
57	MOVD  16(R8), R12     // R12 = x[i+1]
58	MOVD  24(R8), R14     // R14 = x[i+2]
59	MOVDU 32(R8), R15     // R15 = x[i+3]
60	MOVD  8(R9), R16      // R16 = y[i]
61	MOVD  16(R9), R17     // R17 = y[i+1]
62	MOVD  24(R9), R18     // R18 = y[i+2]
63	MOVDU 32(R9), R19     // R19 = y[i+3]
64	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
65	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
66	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
67	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
68	MOVD  R20, 8(R10)     // z[i]
69	MOVD  R21, 16(R10)    // z[i+1]
70	MOVD  R22, 24(R10)    // z[i+2]
71	MOVDU R23, 32(R10)    // z[i+3]
72	ADD   \$-4, R7         // R7 = z_len - 4
73	BC  16, 0, loop       // bdnz
74
75	// We may have more elements to read
76	CMP   R0, R7
77	BEQ   final
78
79	// Process the remaining elements, one at a time
80tail:
81	MOVDU 8(R8), R11      // R11 = x[i]
82	MOVDU 8(R9), R16      // R16 = y[i]
83	ADD   \$-1, R7         // R7 = z_len - 1
84	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
85	CMP   R0, R7
86	MOVDU R20, 8(R10)     // z[i]
87	BEQ   final           // If R7 = 0, we are done
88
89	MOVDU 8(R8), R11      // second leftover element
90	MOVDU 8(R9), R16
91	ADD   \$-1, R7         // one fewer element remaining
92	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
93	CMP   R0, R7
94	MOVDU R20, 8(R10)
95	BEQ   final
96
97	MOVD  8(R8), R11      // third (last possible) leftover element
98	MOVD  8(R9), R16
99	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
100	MOVD  R20, 8(R10)
101
102final:
103	ADDZE R4              // Capture CA
104
105done:
106	MOVD  R4, c+72(FP)
107	RET
108
109// func subVV(z, x, y []Word) (c Word)
110// z[i] = x[i] - y[i] for all i, carrying; c receives the final borrow
111TEXT ·subVV(SB), NOSPLIT, \$0
112	MOVD  z_len+8(FP), R7 // R7 = z_len
113	MOVD  x+24(FP), R8    // R8 = x[]
114	MOVD  y+48(FP), R9    // R9 = y[]
115	MOVD  z+0(FP), R10    // R10 = z[]
116
117	// If z_len = 0, we are done
118	CMP   R0, R7
119	MOVD  R0, R4          // R4 = c = 0
120	BEQ   done
121
122	// Process the first iteration out of the loop so we can
123	// use MOVDU and avoid 3 index registers updates.
124	MOVD  0(R8), R11      // R11 = x[i]
125	MOVD  0(R9), R12      // R12 = y[i]
126	ADD   \$-1, R7         // R7 = z_len - 1
127	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA
128	CMP   R0, R7
129	MOVD  R15, 0(R10)     // z[i]
130	BEQ   final           // If z_len was 1, we are done
131
132	SRD   \$2, R7, R5      // R5 = (z_len-1)/4 = number of unrolled iterations
133	CMP   R0, R5
134	MOVD  R5, CTR         // Set up loop counter
135	BEQ   tail            // If R5 = 0, we can't use the loop
136
137	// Process 4 elements per iteration. Unrolling this loop
138	// means a performance trade-off: we will lose performance
139	// for small values of z_len (0.92x in the worst case), but
140	// gain significant performance as z_len increases (up to
141	// 1.45x).
142loop:
143	MOVD  8(R8), R11      // R11 = x[i]
144	MOVD  16(R8), R12     // R12 = x[i+1]
145	MOVD  24(R8), R14     // R14 = x[i+2]
146	MOVDU 32(R8), R15     // R15 = x[i+3]
147	MOVD  8(R9), R16      // R16 = y[i]
148	MOVD  16(R9), R17     // R17 = y[i+1]
149	MOVD  24(R9), R18     // R18 = y[i+2]
150	MOVDU 32(R9), R19     // R19 = y[i+3]
151	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
152	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] + CA
153	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] + CA
154	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] + CA
155	MOVD  R20, 8(R10)     // z[i]
156	MOVD  R21, 16(R10)    // z[i+1]
157	MOVD  R22, 24(R10)    // z[i+2]
158	MOVDU R23, 32(R10)    // z[i+3]
159	ADD   \$-4, R7         // R7 = z_len - 4
160	BC  16, 0, loop       // bdnz
161
162	// We may have more elements to read
163	CMP   R0, R7
164	BEQ   final
165
166	// Process the remaining elements, one at a time
167tail:
168	MOVDU 8(R8), R11      // R11 = x[i]
169	MOVDU 8(R9), R16      // R16 = y[i]
170	ADD   \$-1, R7         // R7 = z_len - 1
171	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
172	CMP   R0, R7
173	MOVDU R20, 8(R10)     // z[i]
174	BEQ   final           // If R7 = 0, we are done
175
176	MOVDU 8(R8), R11      // second leftover element
177	MOVDU 8(R9), R16
178	ADD   \$-1, R7         // one fewer element remaining
179	SUBE  R16, R11, R20
180	CMP   R0, R7
181	MOVDU R20, 8(R10)
182	BEQ   final
183
184	MOVD  8(R8), R11      // third (last possible) leftover element
185	MOVD  8(R9), R16
186	SUBE  R16, R11, R20
187	MOVD  R20, 8(R10)
188
189final:
190	ADDZE R4              // R4 = CA (R4 was zeroed at entry)
191	XOR   \$1, R4          // borrow is the inverse of CA after a subtract
192
193done:
194	MOVD  R4, c+72(FP)
195	RET
196
197// func addVW(z, x []Word, y Word) (c Word)
198TEXT ·addVW(SB), NOSPLIT, \$0
199	MOVD z+0(FP), R10	// R10 = z[]
200	MOVD x+24(FP), R8	// R8 = x[]
201	MOVD y+48(FP), R4	// R4 = y = c
202	MOVD z_len+8(FP), R11	// R11 = z_len
203
204	CMP   R0, R11		// If z_len is zero, return
205	BEQ   done
206
207	// We will process the first iteration out of the loop so we capture
208	// the value of c. In the subsequent iterations, we will rely on the
209	// value of CA set here.
210	MOVD  0(R8), R20	// R20 = x[i]
211	ADD   \$-1, R11		// R11 = z_len - 1
212	ADDC  R20, R4, R6	// R6 = x[i] + c
213	CMP   R0, R11		// If z_len was 1, we are done
214	MOVD  R6, 0(R10)	// z[i]
215	BEQ   final
216
217	// We will read 4 elements per iteration
218	SRD   \$2, R11, R9	// R9 = (z_len-1)/4
219	DCBT  (R8)
220	CMP   R0, R9
221	MOVD  R9, CTR		// Set up the loop counter
222	BEQ   tail		// If R9 = 0, we can't use the loop
223
224loop:
225	MOVD  8(R8), R20	// R20 = x[i]
226	MOVD  16(R8), R21	// R21 = x[i+1]
227	MOVD  24(R8), R22	// R22 = x[i+2]
228	MOVDU 32(R8), R23	// R23 = x[i+3]
229	ADDZE R20, R24		// R24 = x[i] + CA
230	ADDZE R21, R25		// R25 = x[i+1] + CA
231	ADDZE R22, R26		// R26 = x[i+2] + CA
232	ADDZE R23, R27		// R27 = x[i+3] + CA
233	MOVD  R24, 8(R10)	// z[i]
234	MOVD  R25, 16(R10)	// z[i+1]
235	MOVD  R26, 24(R10)	// z[i+2]
236	MOVDU R27, 32(R10)	// z[i+3]
237	ADD   \$-4, R11		// R11 = z_len - 4
238	BC    16, 0, loop	// bdnz
239
240	// We may have some elements to read
241	CMP R0, R11
242	BEQ final
243
244tail:
245	MOVDU 8(R8), R20	// R20 = x[i]
246	ADDZE R20, R24		// R24 = x[i] + CA
247	ADD \$-1, R11		// one fewer element remaining
248	MOVDU R24, 8(R10)	// z[i]
249	CMP R0, R11
250	BEQ final
251
252	MOVDU 8(R8), R20
253	ADDZE R20, R24
254	ADD \$-1, R11
255	MOVDU R24, 8(R10)
256	CMP R0, R11
257	BEQ final
258
259	MOVD 8(R8), R20		// last possible leftover element
260	ADDZE R20, R24
261	MOVD R24, 8(R10)
262
263final:
264	ADDZE R0, R4		// c = CA
265done:
266	MOVD  R4, c+56(FP)
267	RET
268
269// func subVW(z, x []Word, y Word) (c Word)
270TEXT ·subVW(SB), NOSPLIT, \$0
271	MOVD  z+0(FP), R10	// R10 = z[]
272	MOVD  x+24(FP), R8	// R8 = x[]
273	MOVD  y+48(FP), R4	// R4 = y = c
274	MOVD  z_len+8(FP), R11	// R11 = z_len
275
276	CMP   R0, R11		// If z_len is zero, return
277	BEQ   done
278
279	// We will process the first iteration out of the loop so we capture
280	// the value of c. In the subsequent iterations, we will rely on the
281	// value of CA set here.
282	MOVD  0(R8), R20	// R20 = x[i]
283	ADD   \$-1, R11		// R11 = z_len - 1
284	SUBC  R4, R20, R6	// R6 = x[i] - c
285	CMP   R0, R11		// If z_len was 1, we are done
286	MOVD  R6, 0(R10)	// z[i]
287	BEQ   final
288
289	// We will read 4 elements per iteration
290	SRD   \$2, R11, R9	// R9 = (z_len-1)/4
291	DCBT  (R8)
292	CMP   R0, R9
293	MOVD  R9, CTR		// Set up the loop counter
294	BEQ   tail		// If R9 = 0, we can't use the loop
295
296	// The loop here is almost the same as the one used in s390x, but
297	// we don't need to capture CA every iteration because we've already
298	// done that above.
299loop:
300	MOVD  8(R8), R20	// R20 = x[i]
301	MOVD  16(R8), R21	// R21 = x[i+1]
302	MOVD  24(R8), R22	// R22 = x[i+2]
303	MOVDU 32(R8), R23	// R23 = x[i+3]
304	SUBE  R0, R20		// R20 = x[i] - 0 - borrow
305	SUBE  R0, R21
306	SUBE  R0, R22
307	SUBE  R0, R23
308	MOVD  R20, 8(R10)	// z[i]
309	MOVD  R21, 16(R10)	// z[i+1]
310	MOVD  R22, 24(R10)	// z[i+2]
311	MOVDU R23, 32(R10)	// z[i+3]
312	ADD   \$-4, R11		// R11 = z_len - 4
313	BC    16, 0, loop	// bdnz
314
315	// We may have some elements to read
316	CMP   R0, R11
317	BEQ   final
318
319tail:
320	MOVDU 8(R8), R20
321	SUBE  R0, R20
322	ADD   \$-1, R11		// one fewer element remaining
323	MOVDU R20, 8(R10)
324	CMP   R0, R11
325	BEQ   final
326
327	MOVDU 8(R8), R20
328	SUBE  R0, R20
329	ADD   \$-1, R11
330	MOVDU R20, 8(R10)
331	CMP   R0, R11
332	BEQ   final
333
334	MOVD  8(R8), R20	// last possible leftover element
335	SUBE  R0, R20
336	MOVD  R20, 8(R10)
337
338final:
339	// Capture CA
340	SUBE  R4, R4		// R4 = CA - 1 (0 if no borrow, -1 if borrow)
341	NEG   R4, R4		// c = 1 - CA
342
343done:
344	MOVD  R4, c+56(FP)
345	RET
346
347TEXT ·shlVU(SB), NOSPLIT, \$0
348	BR ·shlVU_g(SB)	// no assembly body here: branch straight to shlVU_g (not defined in the visible source)
349
350TEXT ·shrVU(SB), NOSPLIT, \$0
351	BR ·shrVU_g(SB)	// no assembly body here: branch straight to shrVU_g (not defined in the visible source)
352
353// func mulAddVWW(z, x []Word, y, r Word) (c Word)
354TEXT ·mulAddVWW(SB), NOSPLIT, \$0
355	MOVD    z+0(FP), R10      // R10 = z[]
356	MOVD    x+24(FP), R8      // R8 = x[]
357	MOVD    y+48(FP), R9      // R9 = y
358	MOVD    r+56(FP), R4      // R4 = r = c
359	MOVD    z_len+8(FP), R11  // R11 = z_len
360
361	CMP     R0, R11
362	BEQ     done
363
364	MOVD    0(R8), R20
365	ADD     \$-1, R11          // R11 = z_len - 1
366	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
367	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
368	ADDC    R4, R6            // R6 = z0 + r
369	ADDZE   R7                // R7 = z1 + CA
370	CMP     R0, R11
371	MOVD    R7, R4            // R4 = c
372	MOVD    R6, 0(R10)        // z[i]
373	BEQ     done
374
375	// We will read 4 elements per iteration
376	SRD     \$2, R11, R14      // R14 = (z_len-1)/4
377	DCBT    (R8)
378	CMP     R0, R14
379	MOVD    R14, CTR          // Set up the loop counter
380	BEQ     tail              // If R14 = 0, we can't use the loop
381
382loop:
383	MOVD    8(R8), R20        // R20 = x[i]
384	MOVD    16(R8), R21       // R21 = x[i+1]
385	MOVD    24(R8), R22       // R22 = x[i+2]
386	MOVDU   32(R8), R23       // R23 = x[i+3]
387	MULLD   R9, R20, R24      // R24 = z0[i]
388	MULHDU  R9, R20, R20      // R20 = z1[i]
389	ADDC    R4, R24           // R24 = z0[i] + c
390	ADDZE   R20               // R20 = z1[i] + CA
391	MULLD   R9, R21, R25      // R25 = z0[i+1]
392	MULHDU  R9, R21, R21      // R21 = z1[i+1]
393	ADDC    R20, R25          // R25 = z0[i+1] + carry word from element i
394	ADDZE   R21               // R21 = z1[i+1] + CA
395	MULLD   R9, R22, R26      // R26 = z0[i+2]
396	MULHDU  R9, R22, R22      // R22 = z1[i+2]
397	ADDC    R21, R26          // R26 = z0[i+2] + carry word from element i+1
398	ADDZE   R22               // R22 = z1[i+2] + CA
399	MULLD   R9, R23, R27      // R27 = z0[i+3]
400	MULHDU  R9, R23, R23      // R23 = z1[i+3]
401	ADDC    R22, R27          // R27 = z0[i+3] + carry word from element i+2
402	ADDZE   R23               // R23 = z1[i+3] + CA
403	MOVD    R24, 8(R10)       // z[i]
404	MOVD    R25, 16(R10)      // z[i+1]
405	MOVD    R26, 24(R10)      // z[i+2]
406	MOVDU   R27, 32(R10)      // z[i+3]
407	MOVD    R23, R4           // R4 = c
408	ADD     \$-4, R11          // R11 = z_len - 4
409	BC      16, 0, loop       // bdnz
410
411	// We may have some elements to read
412	CMP   R0, R11
413	BEQ   done
414
415	// Process the remaining elements, one at a time
416tail:
417	MOVDU   8(R8), R20        // R20 = x[i]
418	MULLD   R9, R20, R24      // R24 = z0[i]
419	MULHDU  R9, R20, R25      // R25 = z1[i]
420	ADD     \$-1, R11          // R11 = z_len - 1
421	ADDC    R4, R24           // R24 = z0[i] + c
422	ADDZE   R25               // R25 = z1[i] + CA
423	MOVDU   R24, 8(R10)       // z[i]
424	CMP     R0, R11
425	MOVD    R25, R4           // R4 = c
426	BEQ     done              // If R11 = 0, we are done
427
428	MOVDU   8(R8), R20
429	MULLD   R9, R20, R24
430	MULHDU  R9, R20, R25
431	ADD     \$-1, R11
432	ADDC    R4, R24
433	ADDZE   R25
434	MOVDU   R24, 8(R10)
435	CMP     R0, R11
436	MOVD    R25, R4
437	BEQ     done
438
439	MOVD    8(R8), R20
440	MULLD   R9, R20, R24
441	MULHDU  R9, R20, R25
442	ADD     \$-1, R11
443	ADDC    R4, R24
444	ADDZE   R25
445	MOVD    R24, 8(R10)
446	MOVD    R25, R4
447
448done:
449	MOVD    R4, c+64(FP)
450	RET
451
452// func addMulVVW(z, x []Word, y Word) (c Word)
453TEXT ·addMulVVW(SB), NOSPLIT, \$0
454	MOVD z+0(FP), R10	// R10 = z[]
455	MOVD x+24(FP), R8	// R8 = x[]
456	MOVD y+48(FP), R9	// R9 = y
457	MOVD z_len+8(FP), R22	// R22 = z_len
458
459	MOVD R0, R3		// R3 will be the index register
460	CMP  R0, R22
461	MOVD R0, R4		// R4 = c = 0
462	MOVD R22, CTR		// Initialize loop counter
463	BEQ  done
464
465loop:
466	MOVD  (R8)(R3), R20	// Load x[i]
467	MOVD  (R10)(R3), R21	// Load z[i]
468	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
469	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
470	ADDC   R21, R6		// R6 = z0
471	ADDZE  R7		// R7 = z1
472	ADDC   R4, R6		// R6 = z0 + c + 0
473	ADDZE  R7, R4           // c = z1 + CA
474	MOVD   R6, (R10)(R3)	// Store z[i]
475	ADD    \$8, R3		// Advance the byte index to the next word
476	BC  16, 0, loop		// bdnz
477
478done:
479	MOVD R4, c+56(FP)
480	RET
481
482// func divWW(x1, x0, y Word) (q, r Word)
483TEXT ·divWW(SB), NOSPLIT, \$0
484	MOVD x1+0(FP), R4	// R4 = x1 (high word of the dividend)
485	MOVD x0+8(FP), R5	// R5 = x0 (low word of the dividend)
486	MOVD y+16(FP), R6	// R6 = y (divisor)
487
488	CMPU R4, R6		// x1 >= y: take the divbigger path (q = r = all ones)
489	BGE  divbigger
490
491	// from the programmer's note in ch. 3 of the ISA manual, p.74
492	DIVDEU R6, R4, R3	// R3 = q1 = (x1 << 64) / y
493	DIVDU  R6, R5, R7	// R7 = q2 = x0 / y
494	MULLD  R6, R3, R8	// R8 = q1 * y (low 64 bits)
495	MULLD  R6, R7, R20	// R20 = q2 * y
496	SUB    R20, R5, R10	// R10 = r2 = x0 - q2*y
497	ADD    R7, R3, R3	// R3 = q = q1 + q2
498	SUB    R8, R10, R4	// R4 = r = r2 - q1*y, i.e. r1 + r2 mod 2^64
499	CMPU   R4, R10		// r < r2 means the remainder sum wrapped
500	BLT    adjust
501	CMPU   R4, R6		// r < y: quotient and remainder are final
502	BLT    end
503
504adjust:
505	MOVD \$1, R21
506	ADD  R21, R3, R3	// q++
507	SUB  R6, R4, R4		// r -= y
508
509end:
510	MOVD R3, q+24(FP)
511	MOVD R4, r+32(FP)
512
513	RET
514
515divbigger:
516	MOVD \$-1, R7
517	MOVD R7, q+24(FP)
518	MOVD R7, r+32(FP)
519	RET
520
521TEXT ·divWVW(SB), NOSPLIT, \$0
522	BR ·divWVW_g(SB)	// no assembly body here: branch straight to divWVW_g (not defined in the visible source)
```

View as plain text