...
Run Format

Text file src/math/big/arith_arm64.s

Documentation: math/big

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// TODO: Consider re-implementing using Advanced SIMD
    13	// once the assembler supports those instructions.
    14	
    15	// func mulWW(x, y Word) (z1, z0 Word)
    16	TEXT ·mulWW(SB),NOSPLIT,$0
    17		MOVD	x+0(FP), R0
    18		MOVD	y+8(FP), R1
    19		MUL	R0, R1, R2
    20		UMULH	R0, R1, R3
    21		MOVD	R3, z1+16(FP)
    22		MOVD	R2, z0+24(FP)
    23		RET
    24	
    25	
    26	// func divWW(x1, x0, y Word) (q, r Word)
    27	TEXT ·divWW(SB),NOSPLIT,$0
    28		B	·divWW_g(SB) // ARM64 has no multiword division
    29	
    30	
    31	// func addVV(z, x, y []Word) (c Word)
    32	TEXT ·addVV(SB),NOSPLIT,$0
    33		MOVD	z_len+8(FP), R0
    34		MOVD	x+24(FP), R8
    35		MOVD	y+48(FP), R9
    36		MOVD	z+0(FP), R10
    37		ADDS	$0, R0		// clear carry flag
    38		TBZ	$0, R0, two
    39		MOVD.P	8(R8), R11
    40		MOVD.P	8(R9), R15
    41		ADCS	R15, R11
    42		MOVD.P	R11, 8(R10)
    43		SUB	$1, R0
    44	two:
    45		TBZ	$1, R0, loop
    46		LDP.P	16(R8), (R11, R12)
    47		LDP.P	16(R9), (R15, R16)
    48		ADCS	R15, R11
    49		ADCS	R16, R12
    50		STP.P	(R11, R12), 16(R10)
    51		SUB	$2, R0
    52	loop:
    53		CBZ	R0, done	// careful not to touch the carry flag
    54		LDP.P	32(R8), (R11, R12)
    55		LDP	-16(R8), (R13, R14)
    56		LDP.P	32(R9), (R15, R16)
    57		LDP	-16(R9), (R17, R19)
    58		ADCS	R15, R11
    59		ADCS	R16, R12
    60		ADCS	R17, R13
    61		ADCS	R19, R14
    62		STP.P	(R11, R12), 32(R10)
    63		STP	(R13, R14), -16(R10)
    64		SUB	$4, R0
    65		B	loop
    66	done:
    67		CSET	HS, R0		// extract carry flag
    68		MOVD	R0, c+72(FP)
    69		RET
    70	
    71	
    72	// func subVV(z, x, y []Word) (c Word)
    73	TEXT ·subVV(SB),NOSPLIT,$0
    74		MOVD	z_len+8(FP), R0
    75		MOVD	x+24(FP), R8
    76		MOVD	y+48(FP), R9
    77		MOVD	z+0(FP), R10
    78		CMP	R0, R0		// set carry flag
    79		TBZ	$0, R0, two
    80		MOVD.P	8(R8), R11
    81		MOVD.P	8(R9), R15
    82		SBCS	R15, R11
    83		MOVD.P	R11, 8(R10)
    84		SUB	$1, R0
    85	two:
    86		TBZ	$1, R0, loop
    87		LDP.P	16(R8), (R11, R12)
    88		LDP.P	16(R9), (R15, R16)
    89		SBCS	R15, R11
    90		SBCS	R16, R12
    91		STP.P	(R11, R12), 16(R10)
    92		SUB	$2, R0
    93	loop:
    94		CBZ	R0, done	// careful not to touch the carry flag
    95		LDP.P	32(R8), (R11, R12)
    96		LDP	-16(R8), (R13, R14)
    97		LDP.P	32(R9), (R15, R16)
    98		LDP	-16(R9), (R17, R19)
    99		SBCS	R15, R11
   100		SBCS	R16, R12
   101		SBCS	R17, R13
   102		SBCS	R19, R14
   103		STP.P	(R11, R12), 32(R10)
   104		STP	(R13, R14), -16(R10)
   105		SUB	$4, R0
   106		B	loop
   107	done:
   108		CSET	LO, R0		// extract carry flag
   109		MOVD	R0, c+72(FP)
   110		RET
   111	
   112	
   113	// func addVW(z, x []Word, y Word) (c Word)
   114	TEXT ·addVW(SB),NOSPLIT,$0
   115		MOVD	z+0(FP), R3
   116		MOVD	z_len+8(FP), R0
   117		MOVD	x+24(FP), R1
   118		MOVD	y+48(FP), R2
   119		CBZ	R0, len0	// the length of z is 0
   120		MOVD.P	8(R1), R4
   121		ADDS	R2, R4		// z[0] = x[0] + y, set carry
   122		MOVD.P	R4, 8(R3)
   123		SUB	$1, R0
   124		CBZ	R0, len1	// the length of z is 1
   125		TBZ	$0, R0, two
   126		MOVD.P	8(R1), R4	// do it once
   127		ADCS	$0, R4
   128		MOVD.P	R4, 8(R3)
   129		SUB	$1, R0
   130	two:				// do it twice
   131		TBZ	$1, R0, loop
   132		LDP.P	16(R1), (R4, R5)
   133		ADCS	$0, R4, R8	// c, z[i] = x[i] + c
   134		ADCS	$0, R5, R9
   135		STP.P	(R8, R9), 16(R3)
   136		SUB	$2, R0
   137	loop:				// do four times per round
   138		CBZ	R0, len1	// careful not to touch the carry flag
   139		LDP.P	32(R1), (R4, R5)
   140		LDP	-16(R1), (R6, R7)
   141		ADCS	$0, R4, R8
   142		ADCS	$0, R5, R9
   143		ADCS	$0, R6, R10
   144		ADCS	$0, R7, R11
   145		STP.P	(R8, R9), 32(R3)
   146		STP	(R10, R11), -16(R3)
   147		SUB	$4, R0
   148		B	loop
   149	len1:
   150		CSET	HS, R2		// extract carry flag
   151	len0:
   152		MOVD	R2, c+56(FP)
   153		RET
   154	
   155	// func subVW(z, x []Word, y Word) (c Word)
   156	TEXT ·subVW(SB),NOSPLIT,$0
   157		MOVD	z+0(FP), R3
   158		MOVD	z_len+8(FP), R0
   159		MOVD	x+24(FP), R1
   160		MOVD	y+48(FP), R2
   161		CBZ	R0, len0	// the length of z is 0
   162		MOVD.P	8(R1), R4
   163		SUBS	R2, R4		// z[0] = x[0] - y, set carry
   164		MOVD.P	R4, 8(R3)
   165		SUB	$1, R0
   166		CBZ	R0, len1	// the length of z is 1
   167		TBZ	$0, R0, two	// do it once
   168		MOVD.P	8(R1), R4
   169		SBCS	$0, R4
   170		MOVD.P	R4, 8(R3)
   171		SUB	$1, R0
   172	two:				// do it twice
   173		TBZ	$1, R0, loop
   174		LDP.P	16(R1), (R4, R5)
   175		SBCS	$0, R4, R8	// c, z[i] = x[i] + c
   176		SBCS	$0, R5, R9
   177		STP.P	(R8, R9), 16(R3)
   178		SUB	$2, R0
   179	loop:				// do four times per round
   180		CBZ	R0, len1	// careful not to touch the carry flag
   181		LDP.P	32(R1), (R4, R5)
   182		LDP	-16(R1), (R6, R7)
   183		SBCS	$0, R4, R8
   184		SBCS	$0, R5, R9
   185		SBCS	$0, R6, R10
   186		SBCS	$0, R7, R11
   187		STP.P	(R8, R9), 32(R3)
   188		STP	(R10, R11), -16(R3)
   189		SUB	$4, R0
   190		B	loop
   191	len1:
   192		CSET	LO, R2		// extract carry flag
   193	len0:
   194		MOVD	R2, c+56(FP)
   195		RET
   196	
   197	
   198	// func shlVU(z, x []Word, s uint) (c Word)
   199	TEXT ·shlVU(SB),NOSPLIT,$0
   200		MOVD	z+0(FP), R0
   201		MOVD	z_len+8(FP), R1
   202		MOVD	x+24(FP), R2
   203		MOVD	s+48(FP), R3
   204		MOVD	$0, R8		// in order not to affect the first element, R8 is initialized to zero
   205		MOVD	$64, R4
   206		SUB	R3, R4
   207		CBZ	R1, len0
   208		CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   209	
   210		TBZ	$0, R1, two
   211		MOVD.P	8(R2), R6
   212		LSR	R4, R6, R8
   213		LSL	R3, R6
   214		MOVD.P	R6, 8(R0)
   215		SUB	$1, R1
   216	two:
   217		TBZ	$1, R1, loop
   218		LDP.P	16(R2), (R6, R7)
   219		LSR	R4, R6, R9
   220		LSL	R3, R6
   221		ORR	R8, R6
   222		LSR	R4, R7, R8
   223		LSL	R3, R7
   224		ORR	R9, R7
   225		STP.P	(R6, R7), 16(R0)
   226		SUB	$2, R1
   227	loop:
   228		CBZ	R1, done
   229		LDP.P	32(R2), (R10, R11)
   230		LDP	-16(R2), (R12, R13)
   231		LSR	R4, R10, R20
   232		LSL	R3, R10
   233		ORR	R8, R10		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
   234		LSR	R4, R11, R21
   235		LSL	R3, R11
   236		ORR	R20, R11
   237		LSR	R4, R12, R22
   238		LSL	R3, R12
   239		ORR	R21, R12
   240		LSR	R4, R13, R8
   241		LSL	R3, R13
   242		ORR	R22, R13
   243		STP.P	(R10, R11), 32(R0)
   244		STP	(R12, R13), -16(R0)
   245		SUB	$4, R1
   246		B	loop
   247	done:
   248		MOVD	R8, c+56(FP)	// the part moved out from the last element
   249		RET
   250	copy:
   251		TBZ	$0, R1, ctwo
   252		MOVD.P	8(R2), R3
   253		MOVD.P	R3, 8(R0)
   254		SUB	$1, R1
   255	ctwo:
   256		TBZ	$1, R1, cloop
   257		LDP.P	16(R2), (R4, R5)
   258		STP.P	(R4, R5), 16(R0)
   259		SUB	$2, R1
   260	cloop:
   261		CBZ	R1, len0
   262		LDP.P	32(R2), (R4, R5)
   263		LDP	-16(R2), (R6, R7)
   264		STP.P	(R4, R5), 32(R0)
   265		STP	(R6, R7), -16(R0)
   266		SUB	$4, R1
   267		B	cloop
   268	len0:
   269		MOVD	$0, c+56(FP)
   270		RET
   271	
   272	
   273	// func shrVU(z, x []Word, s uint) (c Word)
   274	TEXT ·shrVU(SB),NOSPLIT,$0
   275		MOVD	z+0(FP), R0
   276		MOVD	z_len+8(FP), R1
   277		MOVD	x+24(FP), R2
   278		MOVD	s+48(FP), R3
   279		MOVD	$0, R8
   280		MOVD	$64, R4
   281		SUB	R3, R4
   282		CBZ	R1, len0
   283		CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   284	
   285		MOVD.P	8(R2), R20
   286		LSR	R3, R20, R8
   287		LSL	R4, R20
   288		MOVD	R20, c+56(FP)	// deal with the first element
   289		SUB	$1, R1
   290	
   291		TBZ	$0, R1, two
   292		MOVD.P	8(R2), R6
   293		LSL	R4, R6, R20
   294		ORR	R8, R20
   295		LSR	R3, R6, R8
   296		MOVD.P	R20, 8(R0)
   297		SUB	$1, R1
   298	two:
   299		TBZ	$1, R1, loop
   300		LDP.P	16(R2), (R6, R7)
   301		LSL	R4, R6, R20
   302		LSR	R3, R6
   303		ORR	R8, R20
   304		LSL	R4, R7, R21
   305		LSR	R3, R7, R8
   306		ORR	R6, R21
   307		STP.P	(R20, R21), 16(R0)
   308		SUB	$2, R1
   309	loop:
   310		CBZ	R1, done
   311		LDP.P	32(R2), (R10, R11)
   312		LDP	-16(R2), (R12, R13)
   313		LSL	R4, R10, R20
   314		LSR	R3, R10
   315		ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
   316		LSL	R4, R11, R21
   317		LSR	R3, R11
   318		ORR	R10, R21
   319		LSL	R4, R12, R22
   320		LSR	R3, R12
   321		ORR	R11, R22
   322		LSL	R4, R13, R23
   323		LSR	R3, R13, R8
   324		ORR	R12, R23
   325		STP.P	(R20, R21), 32(R0)
   326		STP	(R22, R23), -16(R0)
   327		SUB	$4, R1
   328		B	loop
   329	done:
   330		MOVD	R8, (R0)	// deal with the last element
   331		RET
   332	copy:
   333		TBZ	$0, R1, ctwo
   334		MOVD.P	8(R2), R3
   335		MOVD.P	R3, 8(R0)
   336		SUB	$1, R1
   337	ctwo:
   338		TBZ	$1, R1, cloop
   339		LDP.P	16(R2), (R4, R5)
   340		STP.P	(R4, R5), 16(R0)
   341		SUB	$2, R1
   342	cloop:
   343		CBZ	R1, len0
   344		LDP.P	32(R2), (R4, R5)
   345		LDP	-16(R2), (R6, R7)
   346		STP.P	(R4, R5), 32(R0)
   347		STP	(R6, R7), -16(R0)
   348		SUB	$4, R1
   349		B	cloop
   350	len0:
   351		MOVD	$0, c+56(FP)
   352		RET
   353	
   354	
   355	// func mulAddVWW(z, x []Word, y, r Word) (c Word)
   356	TEXT ·mulAddVWW(SB),NOSPLIT,$0
   357		MOVD	z+0(FP), R1
   358		MOVD	z_len+8(FP), R0
   359		MOVD	x+24(FP), R2
   360		MOVD	y+48(FP), R3
   361		MOVD	r+56(FP), R4
   362	loop:
   363		CBZ	R0, done
   364		MOVD.P	8(R2), R5
   365		UMULH	R5, R3, R7
   366		MUL	R5, R3, R6
   367		ADDS	R4, R6
   368		ADC	$0, R7
   369		MOVD.P	R6, 8(R1)
   370		MOVD	R7, R4
   371		SUB	$1, R0
   372		B	loop
   373	done:
   374		MOVD	R4, c+64(FP)
   375		RET
   376	
   377	
   378	// func addMulVVW(z, x []Word, y Word) (c Word)
   379	TEXT ·addMulVVW(SB),NOSPLIT,$0
   380		MOVD	z+0(FP), R1
   381		MOVD	z_len+8(FP), R0
   382		MOVD	x+24(FP), R2
   383		MOVD	y+48(FP), R3
   384		MOVD	$0, R4
   385	
   386		TBZ	$0, R0, two
   387	
   388		MOVD.P	8(R2), R5
   389		MOVD	(R1), R6
   390	
   391		MUL	R5, R3, R7
   392		UMULH	R5, R3, R8
   393	
   394		ADDS	R7, R6
   395		ADC	$0, R8, R4
   396	
   397		MOVD.P	R6, 8(R1)
   398		SUB	$1, R0
   399	
   400	two:
   401		TBZ	$1, R0, loop
   402	
   403		LDP.P	16(R2), (R5, R10)
   404		LDP	(R1), (R6, R11)
   405	
   406		MUL	R10, R3, R13
   407		UMULH	R10, R3, R12
   408	
   409		MUL	R5, R3, R7
   410		UMULH	R5, R3, R8
   411	
   412		ADDS	R4, R6
   413		ADCS	R13, R11
   414		ADC	$0, R12
   415	
   416		ADDS	R7, R6
   417		ADCS	R8, R11
   418		ADC	$0, R12, R4
   419	
   420		STP.P	(R6, R11), 16(R1)
   421		SUB	$2, R0
   422	
   423	// The main loop of this code operates on a block of 4 words every iteration
   424	// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
   425	// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
   426	// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
   427	loop:
   428		CBZ	R0, done
   429	
   430		LDP.P	16(R2), (R5, R6)
   431		LDP.P	16(R2), (R7, R8)
   432	
   433		LDP	(R1), (R9, R10)
   434		ADDS	R4, R9
   435		MUL	R6, R3, R14
   436		ADCS	R14, R10
   437		MUL	R7, R3, R15
   438		LDP	16(R1), (R11, R12)
   439		ADCS	R15, R11
   440		MUL	R8, R3, R16
   441		ADCS	R16, R12
   442		UMULH	R8, R3, R20
   443		ADC	$0, R20
   444	
   445		MUL	R5, R3, R13
   446		ADDS	R13, R9
   447		UMULH	R5, R3, R17
   448		ADCS	R17, R10
   449		UMULH	R6, R3, R21
   450		STP.P	(R9, R10), 16(R1)
   451		ADCS	R21, R11
   452		UMULH	R7, R3, R19
   453		ADCS	R19, R12
   454		STP.P	(R11, R12), 16(R1)
   455		ADC	$0, R20, R4
   456	
   457		SUB	$4, R0
   458		B	loop
   459	
   460	done:
   461		MOVD	R4, c+56(FP)
   462		RET
   463	
   464	// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
   465	TEXT ·divWVW(SB),NOSPLIT,$0
   466		B ·divWVW_g(SB)

View as plain text