arith_arm64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // TODO: Consider re-implementing using Advanced SIMD
    13  // once the assembler supports those instructions.
    14  
    15  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV computes z[i] = x[i] + y[i] + carry for every i in [0, len(z))
        // and returns the final carry (0 or 1) in c. Only len(z) is loaded; the
        // lengths of x and y are not examined here.
        //
        // Register roles: R0 = words still to process, R8 = x cursor,
        // R9 = y cursor, R10 = z cursor. The running carry is kept in the C flag
        // from one ADCS to the next, so everything in between (SUB, TBZ, CBZ,
        // loads and stores) must leave the flags alone.
        //
        // The count is peeled as 1 word (bit 0), then 2 words (bit 1); what remains
        // is a multiple of 4 and is handled four words per loop iteration.
    16  TEXT ·addVV(SB),NOSPLIT,$0
    17  	MOVD	z_len+8(FP), R0	// R0 = len(z), the number of words to process
    18  	MOVD	x+24(FP), R8	// R8 = x cursor
    19  	MOVD	y+48(FP), R9	// R9 = y cursor
    20  	MOVD	z+0(FP), R10	// R10 = z cursor
    21  	ADDS	$0, R0		// clear carry flag (adding 0 cannot carry out)
    22  	TBZ	$0, R0, two	// bit 0 of the count clear: no single word to peel
    23  	MOVD.P	8(R8), R11	// R11 = *R8; R8 += 8
    24  	MOVD.P	8(R9), R15	// R15 = *R9; R9 += 8
    25  	ADCS	R15, R11	// R11 += R15 + C, updating C
    26  	MOVD.P	R11, 8(R10)	// *R10 = R11; R10 += 8
    27  	SUB	$1, R0		// SUB, unlike SUBS, does not write the flags
    28  two:
    29  	TBZ	$1, R0, loop	// bit 1 of the count clear: no pair to peel
    30  	LDP.P	16(R8), (R11, R12)	// two words of x; R8 += 16
    31  	LDP.P	16(R9), (R15, R16)	// two words of y; R9 += 16
    32  	ADCS	R15, R11
    33  	ADCS	R16, R12
    34  	STP.P	(R11, R12), 16(R10)	// two words of z; R10 += 16
    35  	SUB	$2, R0
    36  loop:
    37  	CBZ	R0, done	// careful not to touch the carry flag
    38  	LDP.P	32(R8), (R11, R12)	// words 0 and 1 of the group; R8 += 32
    39  	LDP	-16(R8), (R13, R14)	// words 2 and 3 (R8 already points past the group)
    40  	LDP.P	32(R9), (R15, R16)
    41  	LDP	-16(R9), (R17, R19)
    42  	ADCS	R15, R11	// the carry ripples through all four additions
    43  	ADCS	R16, R12
    44  	ADCS	R17, R13
    45  	ADCS	R19, R14
    46  	STP.P	(R11, R12), 32(R10)
    47  	STP	(R13, R14), -16(R10)
    48  	SUB	$4, R0
    49  	B	loop
    50  done:
    51  	CSET	HS, R0		// extract carry flag: R0 = 1 if C is set, else 0
    52  	MOVD	R0, c+72(FP)
    53  	RET
    54  
    55  
    56  // func subVV(z, x, y []Word) (c Word)
        //
        // subVV computes z[i] = x[i] - y[i] - borrow for every i in [0, len(z))
        // and returns the final borrow (0 or 1) in c. Only len(z) is loaded; the
        // lengths of x and y are not examined here.
        //
        // Register roles: R0 = words still to process, R8 = x cursor,
        // R9 = y cursor, R10 = z cursor. On arm64 the C flag is the inverse of
        // a borrow: C set means "no borrow". The flag is carried from one SBCS
        // to the next, so nothing in between may write the flags.
    57  TEXT ·subVV(SB),NOSPLIT,$0
    58  	MOVD	z_len+8(FP), R0	// R0 = len(z), the number of words to process
    59  	MOVD	x+24(FP), R8	// R8 = x cursor
    60  	MOVD	y+48(FP), R9	// R9 = y cursor
    61  	MOVD	z+0(FP), R10	// R10 = z cursor
    62  	CMP	R0, R0		// set carry flag (R0 - R0 never borrows)
    63  	TBZ	$0, R0, two	// bit 0 of the count clear: no single word to peel
    64  	MOVD.P	8(R8), R11	// R11 = *R8; R8 += 8
    65  	MOVD.P	8(R9), R15	// R15 = *R9; R9 += 8
    66  	SBCS	R15, R11	// R11 = R11 - R15 - borrow, updating C
    67  	MOVD.P	R11, 8(R10)	// *R10 = R11; R10 += 8
    68  	SUB	$1, R0		// SUB, unlike SUBS, does not write the flags
    69  two:
    70  	TBZ	$1, R0, loop	// bit 1 of the count clear: no pair to peel
    71  	LDP.P	16(R8), (R11, R12)	// two words of x; R8 += 16
    72  	LDP.P	16(R9), (R15, R16)	// two words of y; R9 += 16
    73  	SBCS	R15, R11
    74  	SBCS	R16, R12
    75  	STP.P	(R11, R12), 16(R10)	// two words of z; R10 += 16
    76  	SUB	$2, R0
    77  loop:
    78  	CBZ	R0, done	// careful not to touch the carry flag
    79  	LDP.P	32(R8), (R11, R12)	// words 0 and 1 of the group; R8 += 32
    80  	LDP	-16(R8), (R13, R14)	// words 2 and 3 (R8 already points past the group)
    81  	LDP.P	32(R9), (R15, R16)
    82  	LDP	-16(R9), (R17, R19)
    83  	SBCS	R15, R11	// the borrow ripples through all four subtractions
    84  	SBCS	R16, R12
    85  	SBCS	R17, R13
    86  	SBCS	R19, R14
    87  	STP.P	(R11, R12), 32(R10)
    88  	STP	(R13, R14), -16(R10)
    89  	SUB	$4, R0
    90  	B	loop
    91  done:
    92  	CSET	LO, R0		// extract carry flag: R0 = 1 if C is clear (a borrow is pending)
    93  	MOVD	R0, c+72(FP)
    94  	RET
    95  
    96  #define vwOneOp(instr, op1)				\
    97  	MOVD.P	8(R1), R4;				\
    98  	instr	op1, R4;				\
    99  	MOVD.P	R4, 8(R3);
   100  
        // vwOneOp (defined just above) processes a single word: it loads one word
        // through R1 into R4 (R1 += 8), applies "instr op1, R4", and stores R4
        // through R3 (R3 += 8). It overwrites R4.
        //
   101  // handle the first 1~4 elements before starting iteration in addVW/subVW
        //
        // vwPreIter applies instr1 with operand R2 to the first word and instr2
        // with operand $0 to each following word. After each of the first three
        // words it decrements counter and jumps to target if counter has reached
        // zero; after the fourth word it simply falls through. If counter is 0 on
        // entry, the decrements wrap around and never hit zero, so all four words
        // are processed. SUB and CBZ do not write the flags, so the carry produced
        // by one word is still intact for the next one and at target.
   102  #define vwPreIter(instr1, instr2, counter, target)	\
   103  	vwOneOp(instr1, R2);				\
   104  	SUB	$1, counter;				\
   105  	CBZ	counter, target;			\
   106  	vwOneOp(instr2, $0);				\
   107  	SUB	$1, counter;				\
   108  	CBZ	counter, target;			\
   109  	vwOneOp(instr2, $0);				\
   110  	SUB	$1, counter;				\
   111  	CBZ	counter, target;			\
   112  	vwOneOp(instr2, $0);
   113  
   114  // do one iteration of add or sub in addVW/subVW
        //
        // vwOneIter branches to exit when counter is zero. Otherwise it loads four
        // words through R1 (advancing R1 by 32), applies "instr $0, src, dst" to
        // each of them in order so that the carry flag ripples through all four,
        // stores the four results through R3 (advancing R3 by 32) and subtracts 4
        // from counter. It overwrites R4-R11.
   115  #define vwOneIter(instr, counter, exit)	\
   116  	CBZ	counter, exit;		\	// careful not to touch the carry flag
   117  	LDP.P	32(R1), (R4, R5);	\
   118  	LDP	-16(R1), (R6, R7);	\
   119  	instr	$0, R4, R8;		\
   120  	instr	$0, R5, R9;		\
   121  	instr	$0, R6, R10;		\
   122  	instr	$0, R7, R11;		\
   123  	STP.P	(R8, R9), 32(R3);	\
   124  	STP	(R10, R11), -16(R3);	\
   125  	SUB	$4, counter;
   126  
   127  // do one iteration of copy in addVW/subVW
        //
        // vwOneIterCopy branches to exit when counter is zero. Otherwise it copies
        // four words from the R1 cursor to the R3 cursor unchanged (advancing both
        // by 32) and subtracts 4 from counter. It overwrites R4-R7.
   128  #define vwOneIterCopy(counter, exit)			\
   129  	CBZ	counter, exit;				\
   130  	LDP.P	32(R1), (R4, R5);			\
   131  	LDP	-16(R1), (R6, R7);			\
   132  	STP.P	(R4, R5), 32(R3);			\
   133  	STP	(R6, R7), -16(R3);			\
   134  	SUB	$4, counter;
   135  
   136  // func addVW(z, x []Word, y Word) (c Word)
        //
        // addVW adds the single word y to the len(z)-word vector x, starting at
        // the first word, stores the result in z and returns the carry out in c.
        // When len(z) is 0, y itself is returned as c.
        //
        // Register roles: R0 = words still to process, R1 = x cursor,
        // R2 = y (and later the value returned as c), R3 = z cursor.
        //
   137  // The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   138  // and switches to copy if we are done with carries. The copying is skipped as well
   139  // if 'x' and 'z' start at the same address.
   140  // The overhead of the checking and branching is visible when 'z' is small (~5%),
   141  // so the 'large' branch is only taken when len(z) >= 32, leaving the small-size
        // path entirely untouched.
   142  TEXT ·addVW(SB),NOSPLIT,$0
   143  	MOVD	z+0(FP), R3
   144  	MOVD	z_len+8(FP), R0
   145  	MOVD	x+24(FP), R1
   146  	MOVD	y+48(FP), R2
   147  	CMP	$32, R0
   148  	BGE	large		// large-sized 'z' and 'x'
   149  	CBZ	R0, len0	// the length of z is 0: return y unchanged
   150  	MOVD.P	8(R1), R4
   151  	ADDS	R2, R4		// z[0] = x[0] + y, set carry
   152  	MOVD.P	R4, 8(R3)
   153  	SUB	$1, R0
   154  	CBZ	R0, len1	// the length of z is 1
   155  	TBZ	$0, R0, two	// bit 0 of the remaining count clear: nothing to peel
   156  	MOVD.P	8(R1), R4	// do it once
   157  	ADCS	$0, R4
   158  	MOVD.P	R4, 8(R3)
   159  	SUB	$1, R0
   160  two:				// do it twice
   161  	TBZ	$1, R0, loop
   162  	LDP.P	16(R1), (R4, R5)
   163  	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
   164  	ADCS	$0, R5, R9
   165  	STP.P	(R8, R9), 16(R3)
   166  	SUB	$2, R0
   167  loop:				// do four times per round
   168  	vwOneIter(ADCS, R0, len1)
   169  	B	loop
   170  len1:
   171  	CSET	HS, R2		// extract carry flag
   172  len0:
   173  	MOVD	R2, c+56(FP)	// c = carry out, or y itself when len(z) == 0
   174  done:
   175  	RET
   176  large:
   177  	AND	$0x3, R0, R10	// R10 = len(z) % 4
   178  	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
   179  	// unrolling for the first 1~4 elements to avoid saving the carry
   180  	// flag in each step, adjust R0 if we unrolled 4 elements
        	// (vwPreIter jumps to add4 after R10 words when R10 is 1~3; when R10
        	// is 0 it processes 4 words and falls through to the SUB below)
   181  	vwPreIter(ADDS, ADCS, R10, add4)
   182  	SUB	$4, R0
   183  add4:
   184  	BCC	copy		// C clear: no carry left to propagate
   185  	vwOneIter(ADCS, R0, len1)
   186  	B	add4
   187  copy:
   188  	MOVD	ZR, c+56(FP)	// the carry died out, so c = 0
   189  	CMP	R1, R3		// cursors advance in lockstep: equal iff x and z start at the same address
   190  	BEQ	done		// in-place operation: the remaining words are already in z
   191  copy_4:				// no carry flag, copy the rest
   192  	vwOneIterCopy(R0, done)
   193  	B	copy_4
   194  
   195  // func subVW(z, x []Word, y Word) (c Word)
        //
        // subVW subtracts the single word y from the len(z)-word vector x,
        // starting at the first word, stores the result in z and returns the
        // borrow out in c. When len(z) is 0, y itself is returned as c.
        //
        // Register roles: R0 = words still to process, R1 = x cursor,
        // R2 = y (and later the value returned as c), R3 = z cursor.
        // On arm64 the C flag is the inverse of a borrow: C set means "no borrow".
        //
   196  // The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   197  // and switches to copy if we are done with carries. The copying is skipped as well
   198  // if 'x' and 'z' start at the same address.
   199  // The overhead of the checking and branching is visible when 'z' is small (~5%),
   200  // so the 'large' branch is only taken when len(z) >= 32, leaving the small-size
        // path entirely untouched.
   201  TEXT ·subVW(SB),NOSPLIT,$0
   202  	MOVD	z+0(FP), R3
   203  	MOVD	z_len+8(FP), R0
   204  	MOVD	x+24(FP), R1
   205  	MOVD	y+48(FP), R2
   206  	CMP	$32, R0
   207  	BGE	large		// large-sized 'z' and 'x'
   208  	CBZ	R0, len0	// the length of z is 0: return y unchanged
   209  	MOVD.P	8(R1), R4
   210  	SUBS	R2, R4		// z[0] = x[0] - y, set carry
   211  	MOVD.P	R4, 8(R3)
   212  	SUB	$1, R0
   213  	CBZ	R0, len1	// the length of z is 1
   214  	TBZ	$0, R0, two	// do it once
   215  	MOVD.P	8(R1), R4
   216  	SBCS	$0, R4
   217  	MOVD.P	R4, 8(R3)
   218  	SUB	$1, R0
   219  two:				// do it twice
   220  	TBZ	$1, R0, loop
   221  	LDP.P	16(R1), (R4, R5)
   222  	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
   223  	SBCS	$0, R5, R9
   224  	STP.P	(R8, R9), 16(R3)
   225  	SUB	$2, R0
   226  loop:				// do four times per round
   227  	vwOneIter(SBCS, R0, len1)
   228  	B	loop
   229  len1:
   230  	CSET	LO, R2		// extract carry flag: 1 if C is clear (a borrow is pending)
   231  len0:
   232  	MOVD	R2, c+56(FP)	// c = borrow out, or y itself when len(z) == 0
   233  done:
   234  	RET
   235  large:
   236  	AND	$0x3, R0, R10	// R10 = len(z) % 4
   237  	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
   238  	// unrolling for the first 1~4 elements to avoid saving the carry
   239  	// flag in each step, adjust R0 if we unrolled 4 elements
        	// (vwPreIter jumps to sub4 after R10 words when R10 is 1~3; when R10
        	// is 0 it processes 4 words and falls through to the SUB below)
   240  	vwPreIter(SUBS, SBCS, R10, sub4)
   241  	SUB	$4, R0
   242  sub4:
   243  	BCS	copy		// C set: no borrow left to propagate
   244  	vwOneIter(SBCS, R0, len1)
   245  	B	sub4
   246  copy:
   247  	MOVD	ZR, c+56(FP)	// the borrow died out, so c = 0
   248  	CMP	R1, R3		// cursors advance in lockstep: equal iff x and z start at the same address
   249  	BEQ	done		// in-place operation: the remaining words are already in z
   250  copy_4:				// no carry flag, copy the rest
   251  	vwOneIterCopy(R0, done)
   252  	B	copy_4
   253  
   254  // func shlVU(z, x []Word, s uint) (c Word)
        //
        // shlVU stores x shifted left by s bits into the len(z) words of z and
        // returns in c the bits shifted out of the top word, x[n-1] >> (64 - s).
        // If len(z) is 0 or s is 0, c is 0; for s == 0 the words of x are simply
        // copied to z (or left alone when z and x are the same memory).
        // NOTE(review): the shift path presumably relies on s < 64 - confirm with callers.
        //
   255  // This implementation performs the shift from the high word down to the low word,
   256  // which may give wrong results when the low word of x overlaps with the high
   257  // word of z. When calling this function directly, you need to pay attention to this
   258  // situation.
        //
        // Register roles: R0 = z cursor and R2 = x cursor, both starting one past
        // the last word and moving down; R1 = words still to load; R3 = s;
        // R4 = 64 - s; R5 = return value; R8 = (x[i] << s), the upper part of the
        // next word to store, still waiting for the bits coming from x[i-1].
   259  TEXT ·shlVU(SB),NOSPLIT,$0
   260  	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
   261  	MOVD	x+24(FP), R2
   262  	MOVD	s+48(FP), R3
   263  	ADD	R1<<3, R0	// R0 = &z[n]
   264  	ADD	R1<<3, R2	// R2 = &x[n]
   265  	CBZ	R1, len0
   266  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   267  	MOVD	$64, R4
   268  	SUB	R3, R4		// R4 = 64 - s
   269  	// handling the most significant element x[n-1]
   270  	MOVD.W	-8(R2), R6	// R2 -= 8; R6 = x[n-1]
   271  	LSR	R4, R6, R5	// return value
   272  	LSL	R3, R6, R8	// x[i] << s
   273  	SUB	$1, R1
   274  one:	TBZ	$0, R1, two	// bit 0 of the remaining count clear: no single word to peel
   275  	MOVD.W	-8(R2), R6
   276  	LSR	R4, R6, R7
   277  	ORR	R8, R7		// pending upper part | bits shifted up from the word below
   278  	LSL	R3, R6, R8	// new pending upper part
   279  	SUB	$1, R1
   280  	MOVD.W	R7, -8(R0)	// R0 -= 8; store the completed word
   281  two:
   282  	TBZ	$1, R1, loop	// bit 1 of the remaining count clear: no pair to peel
   283  	LDP.W	-16(R2), (R6, R7)	// R2 -= 16; R7 is the more significant word
   284  	LSR	R4, R7, R10
   285  	ORR	R8, R10
   286  	LSL	R3, R7
   287  	LSR	R4, R6, R9
   288  	ORR	R7, R9
   289  	LSL	R3, R6, R8
   290  	SUB	$2, R1
   291  	STP.W	(R9, R10), -16(R0)	// R0 -= 16; store two completed words
   292  loop:
   293  	CBZ	R1, done
   294  	LDP.W	-32(R2), (R10, R11)	// R2 -= 32; the two lower words of the next four
   295  	LDP	16(R2), (R12, R13)	// the two upper words; R13 is the most significant
   296  	LSR	R4, R13, R23
   297  	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
   298  	LSL	R3, R13
   299  	LSR	R4, R12, R22
   300  	ORR	R13, R22
   301  	LSL	R3, R12
   302  	LSR	R4, R11, R21
   303  	ORR	R12, R21
   304  	LSL	R3, R11
   305  	LSR	R4, R10, R20
   306  	ORR	R11, R20
   307  	LSL	R3, R10, R8	// pending upper part for the next iteration
   308  	STP.W	(R20, R21), -32(R0)	// R0 -= 32; store the two lower results
   309  	STP	(R22, R23), 16(R0)	// store the two upper results
   310  	SUB	$4, R1
   311  	B	loop
   312  done:
   313  	MOVD.W	R8, -8(R0)	// the first element x[0]
   314  	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
   315  	RET
   316  copy:
   317  	CMP	R0, R2		// end pointers are equal iff z and x start at the same address
   318  	BEQ	len0		// nothing to copy in that case
   319  	TBZ	$0, R1, ctwo	// peel one word if the count is odd
   320  	MOVD.W	-8(R2), R4
   321  	MOVD.W	R4, -8(R0)
   322  	SUB	$1, R1
   323  ctwo:
   324  	TBZ	$1, R1, cloop	// peel two words if bit 1 of the count is set
   325  	LDP.W	-16(R2), (R4, R5)
   326  	STP.W	(R4, R5), -16(R0)
   327  	SUB	$2, R1
   328  cloop:
   329  	CBZ	R1, len0
   330  	LDP.W	-32(R2), (R4, R5)	// copy four words per iteration, high to low
   331  	LDP	16(R2), (R6, R7)
   332  	STP.W	(R4, R5), -32(R0)
   333  	STP	(R6, R7), 16(R0)
   334  	SUB	$4, R1
   335  	B	cloop
   336  len0:
   337  	MOVD	$0, c+56(FP)	// nothing was shifted out
   338  	RET
   339  
   340  // func shrVU(z, x []Word, s uint) (c Word)
        //
        // shrVU stores x shifted right by s bits into the len(z) words of z and
        // returns in c the bits shifted out of the bottom word, x[0] << (64 - s).
        // If len(z) is 0 or s is 0, c is 0; for s == 0 the words of x are simply
        // copied to z (or left alone when z and x are the same memory).
        // NOTE(review): the shift path presumably relies on s < 64 - confirm with callers.
        //
   341  // This implementation performs the shift from the low word up to the high word,
   342  // which may give wrong results when the high word of x overlaps with the low
   343  // word of z. When calling this function directly, you need to pay attention to this
   344  // situation.
        //
        // Register roles: R0 = z cursor and R2 = x cursor, both moving up;
        // R1 = words still to load; R3 = s; R4 = 64 - s; R8 = (x[i] >> s), the
        // lower part of the next word to store, still waiting for the bits coming
        // from x[i+1].
   345  TEXT ·shrVU(SB),NOSPLIT,$0
   346  	MOVD	z+0(FP), R0
   347  	MOVD	z_len+8(FP), R1
   348  	MOVD	x+24(FP), R2
   349  	MOVD	s+48(FP), R3
   350  	MOVD	$0, R8
   351  	MOVD	$64, R4
   352  	SUB	R3, R4		// R4 = 64 - s
   353  	CBZ	R1, len0
   354  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   355  
   356  	MOVD.P	8(R2), R20	// R20 = x[0]; R2 += 8
   357  	LSR	R3, R20, R8	// pending lower part: x[0] >> s
   358  	LSL	R4, R20		// bits shifted out of x[0]
   359  	MOVD	R20, c+56(FP)	// deal with the first element
   360  	SUB	$1, R1
   361  
   362  	TBZ	$0, R1, two	// bit 0 of the remaining count clear: no single word to peel
   363  	MOVD.P	8(R2), R6
   364  	LSL	R4, R6, R20
   365  	ORR	R8, R20		// pending lower part | bits shifted down from the word above
   366  	LSR	R3, R6, R8	// new pending lower part
   367  	MOVD.P	R20, 8(R0)	// store the completed word; R0 += 8
   368  	SUB	$1, R1
   369  two:
   370  	TBZ	$1, R1, loop	// bit 1 of the remaining count clear: no pair to peel
   371  	LDP.P	16(R2), (R6, R7)	// R6 is the less significant word; R2 += 16
   372  	LSL	R4, R6, R20
   373  	LSR	R3, R6
   374  	ORR	R8, R20
   375  	LSL	R4, R7, R21
   376  	LSR	R3, R7, R8
   377  	ORR	R6, R21
   378  	STP.P	(R20, R21), 16(R0)	// store two completed words; R0 += 16
   379  	SUB	$2, R1
   380  loop:
   381  	CBZ	R1, done
   382  	LDP.P	32(R2), (R10, R11)	// the two lower words of the next four; R2 += 32
   383  	LDP	-16(R2), (R12, R13)	// the two upper words; R13 is the most significant
   384  	LSL	R4, R10, R20
   385  	LSR	R3, R10
   386  	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
   387  	LSL	R4, R11, R21
   388  	LSR	R3, R11
   389  	ORR	R10, R21
   390  	LSL	R4, R12, R22
   391  	LSR	R3, R12
   392  	ORR	R11, R22
   393  	LSL	R4, R13, R23
   394  	LSR	R3, R13, R8	// pending lower part for the next iteration
   395  	ORR	R12, R23
   396  	STP.P	(R20, R21), 32(R0)	// store the two lower results; R0 += 32
   397  	STP	(R22, R23), -16(R0)	// store the two upper results
   398  	SUB	$4, R1
   399  	B	loop
   400  done:
   401  	MOVD	R8, (R0)	// deal with the last element
   402  	RET
   403  copy:
   404  	CMP	R0, R2		// z and x start at the same address: nothing to copy
   405  	BEQ	len0
   406  	TBZ	$0, R1, ctwo	// peel one word if the count is odd
   407  	MOVD.P	8(R2), R3	// R3 is free as a temporary here because s is 0
   408  	MOVD.P	R3, 8(R0)
   409  	SUB	$1, R1
   410  ctwo:
   411  	TBZ	$1, R1, cloop	// peel two words if bit 1 of the count is set
   412  	LDP.P	16(R2), (R4, R5)
   413  	STP.P	(R4, R5), 16(R0)
   414  	SUB	$2, R1
   415  cloop:
   416  	CBZ	R1, len0
   417  	LDP.P	32(R2), (R4, R5)	// copy four words per iteration, low to high
   418  	LDP	-16(R2), (R6, R7)
   419  	STP.P	(R4, R5), 32(R0)
   420  	STP	(R6, R7), -16(R0)
   421  	SUB	$4, R1
   422  	B	cloop
   423  len0:
   424  	MOVD	$0, c+56(FP)	// nothing was shifted out
   425  	RET
   426  
   427  
   428  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // mulAddVWW multiplies the len(z)-word vector x by the single word y, adds
        // the single word r at the first word, stores the low len(z) words of the
        // result in z and returns the remaining top word in c. When len(z) is 0,
        // r itself is returned as c.
        //
        // Register roles: R0 = words still to process, R1 = z cursor,
        // R2 = x cursor, R3 = y, R4 = carry word (starts as r). The carry between
        // groups travels in R4, not in the C flag, so each group starts a fresh
        // flag chain with ADDS. MUL yields the low 64 bits of a product and UMULH
        // the high 64 bits.
   429  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   430  	MOVD	z+0(FP), R1
   431  	MOVD	z_len+8(FP), R0
   432  	MOVD	x+24(FP), R2
   433  	MOVD	y+48(FP), R3
   434  	MOVD	r+56(FP), R4
   435  	// c, z = x * y + r
   436  	TBZ	$0, R0, two	// bit 0 of the count clear: no single word to peel
   437  	MOVD.P	8(R2), R5	// R5 = x[i]; R2 += 8
   438  	MUL	R3, R5, R7	// R7 = low half of x[i] * y
   439  	UMULH	R3, R5, R8	// R8 = high half of x[i] * y
   440  	ADDS	R4, R7		// add the incoming carry word, setting C
   441  	ADC	$0, R8, R4	// c, z[i] = x[i] * y + r
   442  	MOVD.P	R7, 8(R1)
   443  	SUB	$1, R0
   444  two:
   445  	TBZ	$1, R0, loop	// bit 1 of the count clear: no pair to peel
   446  	LDP.P	16(R2), (R5, R6)
   447  	MUL	R3, R5, R10
   448  	UMULH	R3, R5, R11
   449  	ADDS	R4, R10		// first result word = low(x0*y) + carry word
   450  	MUL	R3, R6, R12
   451  	UMULH	R3, R6, R13
   452  	ADCS	R12, R11	// second result word = high(x0*y) + low(x1*y) + C
   453  	ADC	$0, R13, R4	// outgoing carry word = high(x1*y) + C
   454  
   455  	STP.P	(R10, R11), 16(R1)
   456  	SUB	$2, R0
   457  loop:
   458  	CBZ	R0, done
   459  	LDP.P	32(R2), (R5, R6)	// x words 0 and 1 of the group; R2 += 32
   460  	LDP	-16(R2), (R7, R8)	// x words 2 and 3
   461  
   462  	MUL	R3, R5, R10
   463  	UMULH	R3, R5, R11
   464  	ADDS	R4, R10		// result word 0 = low(x0*y) + carry word
   465  	MUL	R3, R6, R12
   466  	UMULH	R3, R6, R13
   467  	ADCS	R11, R12	// result word 1 = low(x1*y) + high(x0*y) + C
   468  
   469  	MUL	R3, R7, R14
   470  	UMULH	R3, R7, R15
   471  	ADCS	R13, R14	// result word 2 = low(x2*y) + high(x1*y) + C
   472  	MUL	R3, R8, R16
   473  	UMULH	R3, R8, R17
   474  	ADCS	R15, R16	// result word 3 = low(x3*y) + high(x2*y) + C
   475  	ADC	$0, R17, R4	// outgoing carry word = high(x3*y) + C
   476  
   477  	STP.P	(R10, R12), 32(R1)
   478  	STP	(R14, R16), -16(R1)
   479  	SUB	$4, R0
   480  	B	loop
   481  done:
   482  	MOVD	R4, c+64(FP)	// c = final carry word (r itself when len(z) == 0)
   483  	RET
   484  
   485  
   486  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // addMulVVW adds x*y to z in place over len(z) words, where y is a single
        // word, and returns the word carried out of the top in c. When len(z) is
        // 0, c is 0.
        //
        // Register roles: R0 = words still to process, R1 = z cursor,
        // R2 = x cursor, R3 = y, R4 = carry word between groups (starts at 0).
        // MUL yields the low 64 bits of a product and UMULH the high 64 bits.
   487  TEXT ·addMulVVW(SB),NOSPLIT,$0
   488  	MOVD	z+0(FP), R1
   489  	MOVD	z_len+8(FP), R0
   490  	MOVD	x+24(FP), R2
   491  	MOVD	y+48(FP), R3
   492  	MOVD	$0, R4
   493  
   494  	TBZ	$0, R0, two	// bit 0 of the count clear: no single word to peel
   495  
   496  	MOVD.P	8(R2), R5	// R5 = x[0]; R2 += 8
   497  	MOVD	(R1), R6	// R6 = z[0]
   498  
   499  	MUL	R5, R3, R7
   500  	UMULH	R5, R3, R8
   501  
   502  	ADDS	R7, R6		// z[0] += low(x[0]*y), setting C
   503  	ADC	$0, R8, R4	// carry word = high(x[0]*y) + C
   504  
   505  	MOVD.P	R6, 8(R1)
   506  	SUB	$1, R0
   507  
   508  two:
   509  	TBZ	$1, R0, loop	// bit 1 of the count clear: no pair to peel
   510  
   511  	LDP.P	16(R2), (R5, R10)	// two words of x; R2 += 16
   512  	LDP	(R1), (R6, R11)		// the matching two words of z
   513  
   514  	MUL	R10, R3, R13
   515  	UMULH	R10, R3, R12
   516  
   517  	MUL	R5, R3, R7
   518  	UMULH	R5, R3, R8
   519  
        	// first chain: add the incoming carry word and the low half of the
        	// second product
   520  	ADDS	R4, R6
   521  	ADCS	R13, R11
   522  	ADC	$0, R12
   523  
        	// second chain: add both halves of the first product
   524  	ADDS	R7, R6
   525  	ADCS	R8, R11
   526  	ADC	$0, R12, R4
   527  
   528  	STP.P	(R6, R11), 16(R1)
   529  	SUB	$2, R0
   530  
   531  // The main loop of this code operates on a block of 4 words every iteration
   532  // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
   533  // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
   534  // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
        // Two carry chains are interleaved with the multiplies: the first adds R4
        // and the low halves of products 1-3 (its carry out lands in R20 together
        // with the high half of product 3); the second adds the low half of
        // product 0 and the high halves of products 0-2 (its carry out is folded
        // into R20 to form the next R4). The loads, stores and multiplies placed
        // between the ADCS instructions do not write the flags.
   535  loop:
   536  	CBZ	R0, done
   537  
   538  	LDP.P	16(R2), (R5, R6)
   539  	LDP.P	16(R2), (R7, R8)
   540  
   541  	LDP	(R1), (R9, R10)
   542  	ADDS	R4, R9
   543  	MUL	R6, R3, R14
   544  	ADCS	R14, R10
   545  	MUL	R7, R3, R15
   546  	LDP	16(R1), (R11, R12)
   547  	ADCS	R15, R11
   548  	MUL	R8, R3, R16
   549  	ADCS	R16, R12
   550  	UMULH	R8, R3, R20
   551  	ADC	$0, R20
   552  
   553  	MUL	R5, R3, R13
   554  	ADDS	R13, R9
   555  	UMULH	R5, R3, R17
   556  	ADCS	R17, R10
   557  	UMULH	R6, R3, R21
   558  	STP.P	(R9, R10), 16(R1)
   559  	ADCS	R21, R11
   560  	UMULH	R7, R3, R19
   561  	ADCS	R19, R12
   562  	STP.P	(R11, R12), 16(R1)
   563  	ADC	$0, R20, R4
   564  
   565  	SUB	$4, R0
   566  	B	loop
   567  
   568  done:
   569  	MOVD	R4, c+56(FP)	// c = final carry word (0 when len(z) == 0)
   570  	RET
   571  
   572  
   573
View as plain text