...
Run Format

Text file src/math/big/arith_ppc64x.s

Documentation: math/big

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// func mulWW(x, y Word) (z1, z0 Word)
    13TEXT ·mulWW(SB), NOSPLIT, $0
    14	MOVD   x+0(FP), R4
    15	MOVD   y+8(FP), R5
    16	MULHDU R4, R5, R6
    17	MULLD  R4, R5, R7
    18	MOVD   R6, z1+16(FP)
    19	MOVD   R7, z0+24(FP)
    20	RET
    21
    22// func addVV(z, y, y []Word) (c Word)
    23// z[i] = x[i] + y[i] for all i, carrying
    24TEXT ·addVV(SB), NOSPLIT, $0
    25	MOVD  z_len+8(FP), R7   // R7 = z_len
    26	MOVD  x+24(FP), R8      // R8 = x[]
    27	MOVD  y+48(FP), R9      // R9 = y[]
    28	MOVD  z+0(FP), R10      // R10 = z[]
    29
    30	// If z_len = 0, we are done
    31	CMP   R0, R7
    32	MOVD  R0, R4
    33	BEQ   done
    34
    35	// Process the first iteration out of the loop so we can
    36	// use MOVDU and avoid 3 index registers updates.
    37	MOVD  0(R8), R11      // R11 = x[i]
    38	MOVD  0(R9), R12      // R12 = y[i]
    39	ADD   $-1, R7         // R7 = z_len - 1
    40	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
    41	CMP   R0, R7
    42	MOVD  R15, 0(R10)     // z[i]
    43	BEQ   final          // If z_len was 1, we are done
    44
    45	SRD   $2, R7, R5      // R5 = z_len/4
    46	CMP   R0, R5
    47	MOVD  R5, CTR         // Set up loop counter
    48	BEQ   tail            // If R5 = 0, we can't use the loop
    49
    50	// Process 4 elements per iteration. Unrolling this loop
    51	// means a performance trade-off: we will lose performance
    52	// for small values of z_len (0.90x in the worst case), but
    53	// gain significant performance as z_len increases (up to
    54	// 1.45x).
    55loop:
    56	MOVD  8(R8), R11      // R11 = x[i]
    57	MOVD  16(R8), R12     // R12 = x[i+1]
    58	MOVD  24(R8), R14     // R14 = x[i+2]
    59	MOVDU 32(R8), R15     // R15 = x[i+3]
    60	MOVD  8(R9), R16      // R16 = y[i]
    61	MOVD  16(R9), R17     // R17 = y[i+1]
    62	MOVD  24(R9), R18     // R18 = y[i+2]
    63	MOVDU 32(R9), R19     // R19 = y[i+3]
    64	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    65	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    66	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    67	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    68	MOVD  R20, 8(R10)     // z[i]
    69	MOVD  R21, 16(R10)    // z[i+1]
    70	MOVD  R22, 24(R10)    // z[i+2]
    71	MOVDU R23, 32(R10)    // z[i+3]
    72	ADD   $-4, R7         // R7 = z_len - 4
    73	BC  16, 0, loop       // bdnz
    74
    75	// We may have more elements to read
    76	CMP   R0, R7
    77	BEQ   final
    78
    79	// Process the remaining elements, one at a time
    80tail:
    81	MOVDU 8(R8), R11      // R11 = x[i]
    82	MOVDU 8(R9), R16      // R16 = y[i]
    83	ADD   $-1, R7         // R7 = z_len - 1
    84	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    85	CMP   R0, R7
    86	MOVDU R20, 8(R10)     // z[i]
    87	BEQ   final           // If R7 = 0, we are done
    88
    89	MOVDU 8(R8), R11
    90	MOVDU 8(R9), R16
    91	ADD   $-1, R7
    92	ADDE  R11, R16, R20
    93	CMP   R0, R7
    94	MOVDU R20, 8(R10)
    95	BEQ   final
    96
    97	MOVD  8(R8), R11
    98	MOVD  8(R9), R16
    99	ADDE  R11, R16, R20
   100	MOVD  R20, 8(R10)
   101
   102final:
   103	ADDZE R4              // Capture CA
   104
   105done:
   106	MOVD  R4, c+72(FP)
   107	RET
   108
   109// func subVV(z, x, y []Word) (c Word)
   110// z[i] = x[i] - y[i] for all i, carrying
   111TEXT ·subVV(SB), NOSPLIT, $0
   112	MOVD  z_len+8(FP), R7 // R7 = z_len
   113	MOVD  x+24(FP), R8    // R8 = x[]
   114	MOVD  y+48(FP), R9    // R9 = y[]
   115	MOVD  z+0(FP), R10    // R10 = z[]
   116
   117	// If z_len = 0, we are done
   118	CMP   R0, R7
   119	MOVD  R0, R4
   120	BEQ   done
   121
   122	// Process the first iteration out of the loop so we can
   123	// use MOVDU and avoid 3 index registers updates.
   124	MOVD  0(R8), R11      // R11 = x[i]
   125	MOVD  0(R9), R12      // R12 = y[i]
   126	ADD   $-1, R7         // R7 = z_len - 1
   127	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA
   128	CMP   R0, R7
   129	MOVD  R15, 0(R10)     // z[i]
   130	BEQ   final           // If z_len was 1, we are done
   131
   132	SRD   $2, R7, R5      // R5 = z_len/4
   133	CMP   R0, R5
   134	MOVD  R5, CTR         // Set up loop counter
   135	BEQ   tail            // If R5 = 0, we can't use the loop
   136
   137	// Process 4 elements per iteration. Unrolling this loop
   138	// means a performance trade-off: we will lose performance
   139	// for small values of z_len (0.92x in the worst case), but
   140	// gain significant performance as z_len increases (up to
   141	// 1.45x).
   142loop:
   143	MOVD  8(R8), R11      // R11 = x[i]
   144	MOVD  16(R8), R12     // R12 = x[i+1]
   145	MOVD  24(R8), R14     // R14 = x[i+2]
   146	MOVDU 32(R8), R15     // R15 = x[i+3]
   147	MOVD  8(R9), R16      // R16 = y[i]
   148	MOVD  16(R9), R17     // R17 = y[i+1]
   149	MOVD  24(R9), R18     // R18 = y[i+2]
   150	MOVDU 32(R9), R19     // R19 = y[i+3]
   151	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   152	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] + CA
   153	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] + CA
   154	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] + CA
   155	MOVD  R20, 8(R10)     // z[i]
   156	MOVD  R21, 16(R10)    // z[i+1]
   157	MOVD  R22, 24(R10)    // z[i+2]
   158	MOVDU R23, 32(R10)    // z[i+3]
   159	ADD   $-4, R7         // R7 = z_len - 4
   160	BC  16, 0, loop       // bdnz
   161
   162	// We may have more elements to read
   163	CMP   R0, R7
   164	BEQ   final
   165
   166	// Process the remaining elements, one at a time
   167tail:
   168	MOVDU 8(R8), R11      // R11 = x[i]
   169	MOVDU 8(R9), R16      // R16 = y[i]
   170	ADD   $-1, R7         // R7 = z_len - 1
   171	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   172	CMP   R0, R7
   173	MOVDU R20, 8(R10)     // z[i]
   174	BEQ   final           // If R7 = 0, we are done
   175
   176	MOVDU 8(R8), R11
   177	MOVDU 8(R9), R16
   178	ADD   $-1, R7
   179	SUBE  R16, R11, R20
   180	CMP   R0, R7
   181	MOVDU R20, 8(R10)
   182	BEQ   final
   183
   184	MOVD  8(R8), R11
   185	MOVD  8(R9), R16
   186	SUBE  R16, R11, R20
   187	MOVD  R20, 8(R10)
   188
   189final:
   190	ADDZE R4
   191	XOR   $1, R4
   192
   193done:
   194	MOVD  R4, c+72(FP)
   195	RET
   196
   197// func addVW(z, x []Word, y Word) (c Word)
   198TEXT ·addVW(SB), NOSPLIT, $0
   199	MOVD z+0(FP), R10	// R10 = z[]
   200	MOVD x+24(FP), R8	// R8 = x[]
   201	MOVD y+48(FP), R4	// R4 = y = c
   202	MOVD z_len+8(FP), R11	// R11 = z_len
   203
   204	CMP   R0, R11		// If z_len is zero, return
   205	BEQ   done
   206
   207	// We will process the first iteration out of the loop so we capture
   208	// the value of c. In the subsequent iterations, we will rely on the
   209	// value of CA set here.
   210	MOVD  0(R8), R20	// R20 = x[i]
   211	ADD   $-1, R11		// R11 = z_len - 1
   212	ADDC  R20, R4, R6	// R6 = x[i] + c
   213	CMP   R0, R11		// If z_len was 1, we are done
   214	MOVD  R6, 0(R10)	// z[i]
   215	BEQ   final
   216
   217	// We will read 4 elements per iteration
   218	SRD   $2, R11, R9	// R9 = z_len/4
   219	DCBT  (R8)
   220	CMP   R0, R9
   221	MOVD  R9, CTR		// Set up the loop counter
   222	BEQ   tail		// If R9 = 0, we can't use the loop
   223
   224loop:
   225	MOVD  8(R8), R20	// R20 = x[i]
   226	MOVD  16(R8), R21	// R21 = x[i+1]
   227	MOVD  24(R8), R22	// R22 = x[i+2]
   228	MOVDU 32(R8), R23	// R23 = x[i+3]
   229	ADDZE R20, R24		// R24 = x[i] + CA
   230	ADDZE R21, R25		// R25 = x[i+1] + CA
   231	ADDZE R22, R26		// R26 = x[i+2] + CA
   232	ADDZE R23, R27		// R27 = x[i+3] + CA
   233	MOVD  R24, 8(R10)	// z[i]
   234	MOVD  R25, 16(R10)	// z[i+1]
   235	MOVD  R26, 24(R10)	// z[i+2]
   236	MOVDU R27, 32(R10)	// z[i+3]
   237	ADD   $-4, R11		// R11 = z_len - 4
   238	BC    16, 0, loop	// bdnz
   239
   240	// We may have some elements to read
   241	CMP R0, R11
   242	BEQ final
   243
   244tail:
   245	MOVDU 8(R8), R20
   246	ADDZE R20, R24
   247	ADD $-1, R11
   248	MOVDU R24, 8(R10)
   249	CMP R0, R11
   250	BEQ final
   251
   252	MOVDU 8(R8), R20
   253	ADDZE R20, R24
   254	ADD $-1, R11
   255	MOVDU R24, 8(R10)
   256	CMP R0, R11
   257	BEQ final
   258
   259	MOVD 8(R8), R20
   260	ADDZE R20, R24
   261	MOVD R24, 8(R10)
   262
   263final:
   264	ADDZE R0, R4		// c = CA
   265done:
   266	MOVD  R4, c+56(FP)
   267	RET
   268
   269// func subVW(z, x []Word, y Word) (c Word)
   270TEXT ·subVW(SB), NOSPLIT, $0
   271	MOVD  z+0(FP), R10	// R10 = z[]
   272	MOVD  x+24(FP), R8	// R8 = x[]
   273	MOVD  y+48(FP), R4	// R4 = y = c
   274	MOVD  z_len+8(FP), R11	// R11 = z_len
   275
   276	CMP   R0, R11		// If z_len is zero, return
   277	BEQ   done
   278
   279	// We will process the first iteration out of the loop so we capture
   280	// the value of c. In the subsequent iterations, we will rely on the
   281	// value of CA set here.
   282	MOVD  0(R8), R20	// R20 = x[i]
   283	ADD   $-1, R11		// R11 = z_len - 1
   284	SUBC  R4, R20, R6	// R6 = x[i] - c
   285	CMP   R0, R11		// If z_len was 1, we are done
   286	MOVD  R6, 0(R10)	// z[i]
   287	BEQ   final
   288
   289	// We will read 4 elements per iteration
   290	SRD   $2, R11, R9	// R9 = z_len/4
   291	DCBT  (R8)
   292	CMP   R0, R9
   293	MOVD  R9, CTR		// Set up the loop counter
   294	BEQ   tail		// If R9 = 0, we can't use the loop
   295
   296	// The loop here is almost the same as the one used in s390x, but
   297	// we don't need to capture CA every iteration because we've already
   298	// done that above.
   299loop:
   300	MOVD  8(R8), R20
   301	MOVD  16(R8), R21
   302	MOVD  24(R8), R22
   303	MOVDU 32(R8), R23
   304	SUBE  R0, R20
   305	SUBE  R0, R21
   306	SUBE  R0, R22
   307	SUBE  R0, R23
   308	MOVD  R20, 8(R10)
   309	MOVD  R21, 16(R10)
   310	MOVD  R22, 24(R10)
   311	MOVDU R23, 32(R10)
   312	ADD   $-4, R11
   313	BC    16, 0, loop	// bdnz
   314
   315	// We may have some elements to read
   316	CMP   R0, R11
   317	BEQ   final
   318
   319tail:
   320	MOVDU 8(R8), R20
   321	SUBE  R0, R20
   322	ADD   $-1, R11
   323	MOVDU R20, 8(R10)
   324	CMP   R0, R11
   325	BEQ   final
   326
   327	MOVDU 8(R8), R20
   328	SUBE  R0, R20
   329	ADD   $-1, R11
   330	MOVDU R20, 8(R10)
   331	CMP   R0, R11
   332	BEQ   final
   333
   334	MOVD  8(R8), R20
   335	SUBE  R0, R20
   336	MOVD  R20, 8(R10)
   337
   338final:
   339	// Capture CA
   340	SUBE  R4, R4
   341	NEG   R4, R4
   342
   343done:
   344	MOVD  R4, c+56(FP)
   345	RET
   346
   347TEXT ·shlVU(SB), NOSPLIT, $0
   348	BR ·shlVU_g(SB)
   349
   350TEXT ·shrVU(SB), NOSPLIT, $0
   351	BR ·shrVU_g(SB)
   352
   353// func mulAddVWW(z, x []Word, y, r Word) (c Word)
   354TEXT ·mulAddVWW(SB), NOSPLIT, $0
   355	MOVD    z+0(FP), R10      // R10 = z[]
   356	MOVD    x+24(FP), R8      // R8 = x[]
   357	MOVD    y+48(FP), R9      // R9 = y
   358	MOVD    r+56(FP), R4      // R4 = r = c
   359	MOVD    z_len+8(FP), R11  // R11 = z_len
   360
   361	CMP     R0, R11
   362	BEQ     done
   363
   364	MOVD    0(R8), R20
   365	ADD     $-1, R11
   366	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   367	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   368	ADDC    R4, R6            // R6 = z0 + r
   369	ADDZE   R7                // R7 = z1 + CA
   370	CMP     R0, R11
   371	MOVD    R7, R4            // R4 = c
   372	MOVD    R6, 0(R10)        // z[i]
   373	BEQ     done
   374
   375	// We will read 4 elements per iteration
   376	SRD     $2, R11, R14      // R14 = z_len/4
   377	DCBT    (R8)
   378	CMP     R0, R14
   379	MOVD    R14, CTR          // Set up the loop counter
   380	BEQ     tail              // If R9 = 0, we can't use the loop
   381
   382loop:
   383	MOVD    8(R8), R20        // R20 = x[i]
   384	MOVD    16(R8), R21       // R21 = x[i+1]
   385	MOVD    24(R8), R22       // R22 = x[i+2]
   386	MOVDU   32(R8), R23       // R23 = x[i+3]
   387	MULLD   R9, R20, R24      // R24 = z0[i]
   388	MULHDU  R9, R20, R20      // R20 = z1[i]
   389	ADDC    R4, R24           // R24 = z0[i] + c
   390	ADDZE   R20               // R7 = z1[i] + CA
   391	MULLD   R9, R21, R25
   392	MULHDU  R9, R21, R21
   393	ADDC    R20, R25
   394	ADDZE   R21
   395	MULLD   R9, R22, R26
   396	MULHDU  R9, R22, R22
   397	ADDC    R21, R26
   398	ADDZE   R22
   399	MULLD   R9, R23, R27
   400	MULHDU  R9, R23, R23
   401	ADDC    R22, R27
   402	ADDZE   R23
   403	MOVD    R24, 8(R10)       // z[i]
   404	MOVD    R25, 16(R10)      // z[i+1]
   405	MOVD    R26, 24(R10)      // z[i+2]
   406	MOVDU   R27, 32(R10)      // z[i+3]
   407	MOVD    R23, R4           // R4 = c
   408	ADD     $-4, R11          // R11 = z_len - 4
   409	BC      16, 0, loop       // bdnz
   410
   411	// We may have some elements to read
   412	CMP   R0, R11
   413	BEQ   done
   414
   415	// Process the remaining elements, one at a time
   416tail:
   417	MOVDU   8(R8), R20        // R20 = x[i]
   418	MULLD   R9, R20, R24      // R24 = z0[i]
   419	MULHDU  R9, R20, R25      // R25 = z1[i]
   420	ADD     $-1, R11          // R11 = z_len - 1
   421	ADDC    R4, R24
   422	ADDZE   R25
   423	MOVDU   R24, 8(R10)       // z[i]
   424	CMP     R0, R11
   425	MOVD    R25, R4           // R4 = c
   426	BEQ     done              // If R11 = 0, we are done
   427
   428	MOVDU   8(R8), R20
   429	MULLD   R9, R20, R24
   430	MULHDU  R9, R20, R25
   431	ADD     $-1, R11
   432	ADDC    R4, R24
   433	ADDZE   R25
   434	MOVDU   R24, 8(R10)
   435	CMP     R0, R11
   436	MOVD    R25, R4
   437	BEQ     done
   438
   439	MOVD    8(R8), R20
   440	MULLD   R9, R20, R24
   441	MULHDU  R9, R20, R25
   442	ADD     $-1, R11
   443	ADDC    R4, R24
   444	ADDZE   R25
   445	MOVD    R24, 8(R10)
   446	MOVD    R25, R4
   447
   448done:
   449	MOVD    R4, c+64(FP)
   450	RET
   451
   452// func addMulVVW(z, x []Word, y Word) (c Word)
   453TEXT ·addMulVVW(SB), NOSPLIT, $0
   454	MOVD z+0(FP), R10	// R10 = z[]
   455	MOVD x+24(FP), R8	// R8 = x[]
   456	MOVD y+48(FP), R9	// R9 = y
   457	MOVD z_len+8(FP), R22	// R22 = z_len
   458
   459	MOVD R0, R3		// R3 will be the index register
   460	CMP  R0, R22
   461	MOVD R0, R4		// R4 = c = 0
   462	MOVD R22, CTR		// Initialize loop counter
   463	BEQ  done
   464
   465loop:
   466	MOVD  (R8)(R3), R20	// Load x[i]
   467	MOVD  (R10)(R3), R21	// Load z[i]
   468	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
   469	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
   470	ADDC   R21, R6		// R6 = z0
   471	ADDZE  R7		// R7 = z1
   472	ADDC   R4, R6		// R6 = z0 + c + 0
   473	ADDZE  R7, R4           // c += z1
   474	MOVD   R6, (R10)(R3)	// Store z[i]
   475	ADD    $8, R3
   476	BC  16, 0, loop		// bdnz
   477
   478done:
   479	MOVD R4, c+56(FP)
   480	RET
   481
   482// func divWW(x1, x0, y Word) (q, r Word)
   483TEXT ·divWW(SB), NOSPLIT, $0
   484	MOVD x1+0(FP), R4
   485	MOVD x0+8(FP), R5
   486	MOVD y+16(FP), R6
   487
   488	CMPU R4, R6
   489	BGE  divbigger
   490
   491	// from the programmer's note in ch. 3 of the ISA manual, p.74
   492	DIVDEU R6, R4, R3
   493	DIVDU  R6, R5, R7
   494	MULLD  R6, R3, R8
   495	MULLD  R6, R7, R20
   496	SUB    R20, R5, R10
   497	ADD    R7, R3, R3
   498	SUB    R8, R10, R4
   499	CMPU   R4, R10
   500	BLT    adjust
   501	CMPU   R4, R6
   502	BLT    end
   503
   504adjust:
   505	MOVD $1, R21
   506	ADD  R21, R3, R3
   507	SUB  R6, R4, R4
   508
   509end:
   510	MOVD R3, q+24(FP)
   511	MOVD R4, r+32(FP)
   512
   513	RET
   514
   515divbigger:
   516	MOVD $-1, R7
   517	MOVD R7, q+24(FP)
   518	MOVD R7, r+32(FP)
   519	RET
   520
   521TEXT ·divWVW(SB), NOSPLIT, $0
   522	BR ·divWVW_g(SB)

View as plain text