...
Run Format

Text file src/math/big/arith_ppc64x.s

Documentation: math/big

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// func mulWW(x, y Word) (z1, z0 Word)
    13	TEXT ·mulWW(SB), NOSPLIT, $0 // z1:z0 = x * y as a double-word product
    14		MOVD   y+8(FP), R9       // second factor
    15		MOVD   x+0(FP), R8       // first factor
    16		MULLD  R8, R9, R10       // low word of the product
    17		MULHDU R8, R9, R11       // high word of the unsigned product
    18		MOVD   R10, z0+24(FP)
    19		MOVD   R11, z1+16(FP)
    20		RET
    21	
    22	// func addVV(z, x, y []Word) (c Word)
    23	// z[i] = x[i] + y[i] for all i, carrying; c receives the carry out of the last word
    24	TEXT ·addVV(SB), NOSPLIT, $0
    25		MOVD  z_len+8(FP), R7  // R7 = len(z), the number of words to add
    26		MOVD  x+24(FP), R8     // R8 = base of x
    27		MOVD  y+48(FP), R9     // R9 = base of y
    28		MOVD  z+0(FP), R10     // R10 = base of z
    29	
    30		MOVD  R0, R4  // c = 0; R0 is relied on to read as zero
    31		MOVD  R0, R6  // R6 will be the address index
    32		ADDC R4, R4   // clear CA (0 + 0 produces no carry)
    33		MOVD  R7, CTR // CTR holds the iteration count for the bdnz below
    34	
    35		CMP   R0, R7
    36		BEQ   done    // len(z) == 0: skip the loop, c stays 0
    37	
    38	loop:
    39		MOVD  (R8)(R6), R11   // x[i]
    40		MOVD  (R9)(R6), R12   // y[i]
    41		ADDE  R12, R11, R15   // x[i] + y[i] + CA
    42		MOVD  R15, (R10)(R6)  // z[i]
    43	
    44		ADD $8, R6            // advance to the next word; plain ADD does not touch CA
    45		BC  16, 0, loop	// bdnz: decrement CTR and loop while it is non-zero
    46	
    47	done:
    48		ADDZE R4              // R4 = 0 + CA, the final carry
    49		MOVD  R4, c+72(FP)
    50		RET
    51	
    52	// func subVV(z, x, y []Word) (c Word)
    53	// z[i] = x[i] - y[i] for all i, borrowing; c receives the borrow out of the last word
    54	TEXT ·subVV(SB), NOSPLIT, $0
    55		MOVD z_len+8(FP), R7  // R7 = len(z), the number of words to subtract
    56		MOVD x+24(FP), R8     // R8 = base of x
    57		MOVD y+48(FP), R9     // R9 = base of y
    58		MOVD z+0(FP), R10     // R10 = base of z
    59	
    60		MOVD  R0, R4  // c = 0
    61		MOVD  R0, R6  // R6 = byte offset of the current word
    62		SUBC R0, R0  // 0 - 0 leaves CA = 1, the "no borrow" state SUBE expects (CA is not cleared)
    63		MOVD  R7, CTR // CTR holds the iteration count for the bdnz below
    64	
    65		CMP R0, R7
    66		BEQ  sublend  // len(z) == 0: skip the loop
    67	
    68	// The amd64 version saves and restores CF, apparently only because its other arithmetic
    69	// instructions clobber it; nothing in this loop disturbs CA, so it is read back once at the end.
    70	subloop:
    71		MOVD  (R8)(R6), R11 // x[i]
    72		MOVD  (R9)(R6), R12 // y[i]
    73	
    74		SUBE R12, R11, R15  // x[i] - y[i] - borrow, where borrow is the inverse of CA
    75		MOVD R15, (R10)(R6) // z[i]
    76	
    77		ADD $8, R6          // advance to the next word; plain ADD does not touch CA
    78		BC  16, 0, subloop  // bdnz: decrement CTR and loop while it is non-zero
    79	
    80	sublend:
    81	
    82		ADDZE R4            // R4 = 0 + CA
    83		XOR   $1, R4        // invert: CA = 1 means no borrow, so c = 1 - CA
    84		MOVD  R4, c+72(FP)
    85		RET
    86	
    87	TEXT ·addVW(SB), NOSPLIT, $0 // no body of its own: control passes straight to addVW_g
    88		JMP ·addVW_g(SB)
    89	
    90	TEXT ·subVW(SB), NOSPLIT, $0 // no body of its own: control passes straight to subVW_g
    91		JMP ·subVW_g(SB)
    92	
    93	TEXT ·shlVU(SB), NOSPLIT, $0 // no body of its own: control passes straight to shlVU_g
    94		JMP ·shlVU_g(SB)
    95	
    96	TEXT ·shrVU(SB), NOSPLIT, $0 // no body of its own: control passes straight to shrVU_g
    97		JMP ·shrVU_g(SB)
    98	
    99	// func mulAddVWW(z, x []Word, y, r Word) (c Word)
   100	TEXT ·mulAddVWW(SB), NOSPLIT, $0 // z[i] = low(x[i]*y + c), c = high part; c starts as r
   101		MOVD z_len+8(FP), R10  // n = len(z)
   102		MOVD z+0(FP), R6       // base of z
   103		MOVD x+24(FP), R7      // base of x
   104		MOVD y+48(FP), R8      // multiplier
   105		MOVD r+56(FP), R9      // c = r
   106		MOVD $1, R16           // index step
   107		MOVD $8, R15           // bytes per word
   108		MOVD $0, R14           // i = 0
   109	
   110		JMP check
   111	
   112	step:
   113		MULLD  R15, R14, R17   // byte offset = i * 8
   114		MOVD   (R7)(R17), R18  // x[i]
   115		MULHDU R8, R18, R20    // high word of x[i] * y
   116		MULLD  R8, R18, R19    // low word of x[i] * y
   117		ADDC   R9, R19         // low += c, carry out lands in CA
   118		ADDZE  R20             // high += CA
   119		MOVD   R19, (R6)(R17)  // z[i] = low
   120		MOVD   R20, R9         // c = high
   121		ADD    R16, R14        // i++
   122	
   123	check:
   124		CMP R14, R10
   125		BLT step               // keep going while i < n
   126	
   127		MOVD R9, c+64(FP)
   128		RET
   129	
   130	// func addMulVVW(z, x []Word, y Word) (c Word)
   131	TEXT ·addMulVVW(SB), NOSPLIT, $0 // z[i] += x[i]*y for all i, carrying between words; c = final carry
   132		MOVD z+0(FP), R10     // R10 = base of z
   133		MOVD x+24(FP), R8     // R8 = base of x
   134		MOVD y+48(FP), R9     // R9 = y
   135		MOVD z_len+8(FP), R22 // R22 = len(z)
   136	
   137		MOVD $0, R5   // i = 0
   138		MOVD $0, R4   // c = 0
   139		MOVD $8, R28  // byte distance from one word to the next
   140		MOVD $-2, R23
   141		AND  R22, R23 // mask the last bit of z.len: R23 = number of words handled in pairs
   142		MOVD $2, R24  // pair size, also the index step of the unrolled loop
   143		CMP  R23, R24
   144		BGE  unrolled // at least one full pair of words
   145		JMP  end      // fewer than two words: only the single-word loop runs
   146	
   147	unrolled:
   148		MOVD  $8, R19         // no (RA)(RB*8) on power
   149		MULLD R5, R19         // R19 = byte offset of word i
   150		MOVD  (R10)(R19), R11 // R11 = z[i]
   151		MOVD  (R8)(R19), R16  // R16 = x[i]
   152		ADD   R28, R19, R25   // R25 = byte offset of word i+1
   153		MOVD  (R10)(R25), R17 // R17 = z[i+1]
   154		MOVD  (R8)(R25), R18  // R18 = x[i+1]
   155	
   156		MULLD  R9, R16, R12    // R12 = low word of x[i]*y
   157		MULHDU R9, R16, R14    // R14 = high word of x[i]*y
   158		MULLD  R9, R18, R6     // R6 = low word of x[i+1]*y
   159		MULHDU R9, R18, R7     // R7 = high word of x[i+1]*y
   160		ADDC   R4, R12         // low += incoming carry word
   161		ADDZE  R14             // high += CA
   162		ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
   163		ADDZE  R14             // carry = high order bits + add carry
   164		MOVD   R12, (R10)(R19) // store z[i]
   165		ADDC   R14, R6         // second word: low += carry word from the first
   166		ADDZE  R7              // high += CA
   167		ADDC   R17, R6         // low += z[i+1]
   168		ADDZE  R7              // high += CA
   169		MOVD   R6, (R10)(R25)  // store z[i+1]
   170		MOVD   R7, R4          // c = high word of the second product
   171	
   172		ADD R24, R5            // i += 2
   173		CMP R5, R23
   174		BLT unrolled           // more pairs remain
   175		JMP end                // pairs done: handle a possible odd trailing word
   176	
   177	loop:
   178		MOVD   $8, R19
   179		MULLD  R5, R19         // R19 = byte offset of word i
   180		MOVD   (R10)(R19), R11 // R11 = z[i]
   181		MOVD   (R8)(R19), R16  // R16 = x[i]
   182		MULLD  R9, R16, R12    // R12 = low word of x[i]*y
   183		MULHDU R9, R16, R14    // R14 = high word of x[i]*y
   184		ADDC   R4, R12         // low += incoming carry word
   185		ADDZE  R14             // high += CA
   186		ADDC   R11, R12        // low += z[i]
   187		ADDZE  R14             // high += CA
   188		MOVD   R12, (R10)(R19) // store z[i]
   189		MOVD   R14, R4         // c = high word
   190	
   191		MOVD $1, R15
   192		ADD  R15, R5           // i++
   193	
   194	end:
   195		CMP R5, R22
   196		BLT loop               // words left over after the unrolled part
   197	
   198		MOVD R4, c+56(FP)
   199		RET
   200	
   201	// func divWW(x1, x0, y Word) (q, r Word)
   202	TEXT ·divWW(SB), NOSPLIT, $0 // divides the double word x1:x0 by y; q = quotient, r = remainder
   203		MOVD x1+0(FP), R4  // high word of the dividend
   204		MOVD x0+8(FP), R5  // low word of the dividend
   205		MOVD y+16(FP), R6  // divisor
   206	
   207		CMPU R4, R6
   208		BGE  divbigger     // x1 >= y (unsigned): no division is attempted, see divbigger
   209	
   210		// from the programmer's note in ch. 3 of the ISA manual, p.74
   211		DIVDEU R6, R4, R3    // q1 = (x1 << 64) / y
   212		DIVDU  R6, R5, R7    // q2 = x0 / y
   213		MULLD  R6, R3, R8    // low word of q1 * y
   214		MULLD  R6, R7, R20   // q2 * y
   215		SUB    R20, R5, R10  // r2 = x0 - q2*y, the remainder of the low division
   216		ADD    R7, R3, R3    // q = q1 + q2
   217		SUB    R8, R10, R4   // r = r2 - low(q1*y), computed modulo 2^64
   218		CMPU   R4, R10
   219		BLT    adjust        // r < r2: one more y must come out of the remainder
   220		CMPU   R4, R6
   221		BLT    end           // r < y: quotient and remainder are final
   222	
   223	adjust:
   224		MOVD $1, R21
   225		ADD  R21, R3, R3     // q++
   226		SUB  R6, R4, R4      // r -= y
   227	
   228	end:
   229		MOVD R3, q+24(FP)
   230		MOVD R4, r+32(FP)
   231	
   232		RET
   233	
   234	divbigger:
   235		MOVD $-1, R7         // all bits set
   236		MOVD R7, q+24(FP)    // both results are all ones when x1 >= y
   237		MOVD R7, r+32(FP)
   238		RET
   239	
   240	TEXT ·divWVW(SB), NOSPLIT, $0 // no body of its own: control passes straight to divWVW_g
   241		JMP ·divWVW_g(SB)

View as plain text