...
Run Format

Text file src/math/big/arith_amd64.s

Documentation: math/big

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// func mulWW(x, y Word) (z1, z0 Word)
      	//
      	// mulWW forms the full double-word product of x and y with one MULQ.
      	// The high word of the product (DX) is returned in z1 and the low
      	// word (AX) in z0.
    13	TEXT ·mulWW(SB),NOSPLIT,$0
    14		MOVQ y+8(FP), AX	// AX = y
    15		MULQ x+0(FP)		// DX:AX = y * x
    16		MOVQ AX, z0+24(FP)	// z0 = low word of the product
    17		MOVQ DX, z1+16(FP)	// z1 = high word of the product
    18		RET
    19	
    20	
    21	// func divWW(x1, x0, y Word) (q, r Word)
      	//
      	// divWW divides the double word x1:x0 by y with one DIVQ and returns
      	// the quotient in q (taken from AX) and the remainder in r (taken from DX).
      	// NOTE(review): DIVQ traps when y == 0 or when the quotient does not fit
      	// in one word; presumably callers guarantee x1 < y - confirm at the call sites.
    22	TEXT ·divWW(SB),NOSPLIT,$0
    23		MOVQ x0+8(FP), AX	// AX = low word of the dividend
    24		MOVQ x1+0(FP), DX	// DX = high word of the dividend
    25		DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
    26		MOVQ DX, r+32(FP)	// r = remainder
    27		MOVQ AX, q+24(FP)	// q = quotient
    28		RET
    29	
    30	// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31	// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32	// This is faster than using rotate instructions.
    33	//
    34	// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
    35	
    36	// func addVV(z, x, y []Word) (c Word)
      	//
      	// addVV stores x[i] + y[i] + carry into z[i] for i = 0 .. len(z)-1 and
      	// returns the carry out of the last word (0 or 1) in c.
      	// Only len(z) is consulted; NOTE(review): x and y are presumably at
      	// least as long as z - confirm against the Go callers.
      	//
      	// Register use:
      	//	DI = n (words still to process)	SI = i (word index)
      	//	R8 = &x[0]   R9 = &y[0]   R10 = &z[0]
      	//	CX = saved carry (0 or -1)	R11-R14 = scratch words
    37	TEXT ·addVV(SB),NOSPLIT,$0
    38		MOVQ z_len+8(FP), DI	// n = len(z)
    39		MOVQ x+24(FP), R8
    40		MOVQ y+48(FP), R9
    41		MOVQ z+0(FP), R10
    42	
    43		MOVQ $0, CX		// c = 0
    44		MOVQ $0, SI		// i = 0
    45	
    46		// s/JL/JMP/ below to disable the unrolled loop
    47		SUBQ $4, DI		// n -= 4
    48		JL V1			// if n < 0 goto V1 (fewer than 4 words in total)
    49	
    50	U1:	// n >= 0: at least 4 more words to process
    51		// regular loop body unrolled 4x
    52		ADDQ CX, CX		// restore CF (CX is 0 or -1)
    53		MOVQ 0(R8)(SI*8), R11
    54		MOVQ 8(R8)(SI*8), R12
    55		MOVQ 16(R8)(SI*8), R13
    56		MOVQ 24(R8)(SI*8), R14
    57		ADCQ 0(R9)(SI*8), R11	// x[i+k] + y[i+k] + CF, carry rippling through k = 0..3
    58		ADCQ 8(R9)(SI*8), R12
    59		ADCQ 16(R9)(SI*8), R13
    60		ADCQ 24(R9)(SI*8), R14
    61		MOVQ R11, 0(R10)(SI*8)
    62		MOVQ R12, 8(R10)(SI*8)
    63		MOVQ R13, 16(R10)(SI*8)
    64		MOVQ R14, 24(R10)(SI*8)
    65		SBBQ CX, CX		// save CF before the counter updates below overwrite it
    66	
    67		ADDQ $4, SI		// i += 4
    68		SUBQ $4, DI		// n -= 4
    69		JGE U1			// if n >= 0 goto U1
    70	
    71	V1:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
    72		JLE E1			// if n <= 0 goto E1
    73	
    74	L1:	// n > 0: one word per iteration
    75		ADDQ CX, CX		// restore CF
    76		MOVQ 0(R8)(SI*8), R11
    77		ADCQ 0(R9)(SI*8), R11
    78		MOVQ R11, 0(R10)(SI*8)
    79		SBBQ CX, CX		// save CF
    80	
    81		ADDQ $1, SI		// i++
    82		SUBQ $1, DI		// n--
    83		JG L1			// if n > 0 goto L1
    84	
    85	E1:	NEGQ CX			// turn the saved 0/-1 into a carry of 0/1
    86		MOVQ CX, c+72(FP)	// return c
    87		RET
    88	
    89	
    90	// func subVV(z, x, y []Word) (c Word)
    91	// (same as addVV except for SBBQ instead of ADCQ and label names)
      	//
      	// subVV stores x[i] - y[i] - borrow into z[i] for i = 0 .. len(z)-1 and
      	// returns the borrow out of the last word (0 or 1) in c.
      	// Only len(z) is consulted; NOTE(review): x and y are presumably at
      	// least as long as z - confirm against the Go callers.
      	//
      	// Register use:
      	//	DI = n (words still to process)	SI = i (word index)
      	//	R8 = &x[0]   R9 = &y[0]   R10 = &z[0]
      	//	CX = saved borrow (0 or -1)	R11-R14 = scratch words
    92	TEXT ·subVV(SB),NOSPLIT,$0
    93		MOVQ z_len+8(FP), DI	// n = len(z)
    94		MOVQ x+24(FP), R8
    95		MOVQ y+48(FP), R9
    96		MOVQ z+0(FP), R10
    97	
    98		MOVQ $0, CX		// c = 0
    99		MOVQ $0, SI		// i = 0
   100	
   101		// s/JL/JMP/ below to disable the unrolled loop
   102		SUBQ $4, DI		// n -= 4
   103		JL V2			// if n < 0 goto V2 (fewer than 4 words in total)
   104	
   105	U2:	// n >= 0: at least 4 more words to process
   106		// regular loop body unrolled 4x
   107		ADDQ CX, CX		// restore CF (CX is 0 or -1)
   108		MOVQ 0(R8)(SI*8), R11
   109		MOVQ 8(R8)(SI*8), R12
   110		MOVQ 16(R8)(SI*8), R13
   111		MOVQ 24(R8)(SI*8), R14
   112		SBBQ 0(R9)(SI*8), R11	// x[i+k] - y[i+k] - CF, borrow rippling through k = 0..3
   113		SBBQ 8(R9)(SI*8), R12
   114		SBBQ 16(R9)(SI*8), R13
   115		SBBQ 24(R9)(SI*8), R14
   116		MOVQ R11, 0(R10)(SI*8)
   117		MOVQ R12, 8(R10)(SI*8)
   118		MOVQ R13, 16(R10)(SI*8)
   119		MOVQ R14, 24(R10)(SI*8)
   120		SBBQ CX, CX		// save CF before the counter updates below overwrite it
   121	
   122		ADDQ $4, SI		// i += 4
   123		SUBQ $4, DI		// n -= 4
   124		JGE U2			// if n >= 0 goto U2
   125	
   126	V2:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
   127		JLE E2			// if n <= 0 goto E2
   128	
   129	L2:	// n > 0: one word per iteration
   130		ADDQ CX, CX		// restore CF
   131		MOVQ 0(R8)(SI*8), R11
   132		SBBQ 0(R9)(SI*8), R11
   133		MOVQ R11, 0(R10)(SI*8)
   134		SBBQ CX, CX		// save CF
   135	
   136		ADDQ $1, SI		// i++
   137		SUBQ $1, DI		// n--
   138		JG L2			// if n > 0 goto L2
   139	
   140	E2:	NEGQ CX			// turn the saved 0/-1 into a borrow of 0/1
   141		MOVQ CX, c+72(FP)	// return c
   142		RET
   143	
   144	
   145	// func addVW(z, x []Word, y Word) (c Word)
      	//
      	// addVW adds the single word y to the vector x, storing the sum in z:
      	// y is added to x[0] and the carry is propagated through the
      	// remaining len(z)-1 words. The carry out of the last word (0 or 1) is
      	// returned in c. When len(z) == 0 no word is touched and c is y itself.
      	//
      	// Register use:
      	//	DI = n (words still to process)	SI = i (word index)
      	//	R8 = &x[0]   R10 = &z[0]
      	//	CX = c (y on entry, afterwards the running carry as 0 or 1)
      	//	R11-R14 = scratch words
   146	TEXT ·addVW(SB),NOSPLIT,$0
   147		MOVQ z_len+8(FP), DI	// n = len(z)
   148		MOVQ x+24(FP), R8
   149		MOVQ y+48(FP), CX	// c = y
   150		MOVQ z+0(FP), R10
   151	
   152		MOVQ $0, SI		// i = 0
   153	
   154		// s/JL/JMP/ below to disable the unrolled loop
   155		SUBQ $4, DI		// n -= 4
   156		JL V3			// if n < 0 goto V3 (fewer than 4 words in total)
   157	
   158	U3:	// n >= 0: at least 4 more words to process
   159		// regular loop body unrolled 4x
   160		MOVQ 0(R8)(SI*8), R11
   161		MOVQ 8(R8)(SI*8), R12
   162		MOVQ 16(R8)(SI*8), R13
   163		MOVQ 24(R8)(SI*8), R14
   164		ADDQ CX, R11		// x[i] + c
   165		ADCQ $0, R12		// ripple the carry through the next three words
   166		ADCQ $0, R13
   167		ADCQ $0, R14
   168		SBBQ CX, CX		// save CF
   169		NEGQ CX			// c = carry out as 0 or 1
   170		MOVQ R11, 0(R10)(SI*8)
   171		MOVQ R12, 8(R10)(SI*8)
   172		MOVQ R13, 16(R10)(SI*8)
   173		MOVQ R14, 24(R10)(SI*8)
   174	
   175		ADDQ $4, SI		// i += 4
   176		SUBQ $4, DI		// n -= 4
   177		JGE U3			// if n >= 0 goto U3
   178	
   179	V3:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
   180		JLE E3			// if n <= 0 goto E3
   181	
   182	L3:	// n > 0: one word per iteration
   183		ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
   184		MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
   185		SBBQ CX, CX		// save CF
   186		NEGQ CX			// c = carry out as 0 or 1
   187	
   188		ADDQ $1, SI		// i++
   189		SUBQ $1, DI		// n--
   190		JG L3			// if n > 0 goto L3
   191	
   192	E3:	MOVQ CX, c+56(FP)	// return c
   193		RET
   194	
   195	
   196	// func subVW(z, x []Word, y Word) (c Word)
   197	// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
      	//
      	// subVW subtracts the single word y from the vector x, storing the
      	// difference in z: y is subtracted from x[0] and the borrow is propagated
      	// through the remaining len(z)-1 words. The borrow out of the last word
      	// (0 or 1) is returned in c. When len(z) == 0 no word is touched and c
      	// is y itself.
      	//
      	// Register use:
      	//	DI = n (words still to process)	SI = i (word index)
      	//	R8 = &x[0]   R10 = &z[0]
      	//	CX = c (y on entry, afterwards the running borrow as 0 or 1)
      	//	R11-R14 = scratch words
   198	TEXT ·subVW(SB),NOSPLIT,$0
   199		MOVQ z_len+8(FP), DI	// n = len(z)
   200		MOVQ x+24(FP), R8
   201		MOVQ y+48(FP), CX	// c = y
   202		MOVQ z+0(FP), R10
   203	
   204		MOVQ $0, SI		// i = 0
   205	
   206		// s/JL/JMP/ below to disable the unrolled loop
   207		SUBQ $4, DI		// n -= 4
   208		JL V4			// if n < 0 goto V4 (fewer than 4 words in total)
   209	
   210	U4:	// n >= 0: at least 4 more words to process
   211		// regular loop body unrolled 4x
   212		MOVQ 0(R8)(SI*8), R11
   213		MOVQ 8(R8)(SI*8), R12
   214		MOVQ 16(R8)(SI*8), R13
   215		MOVQ 24(R8)(SI*8), R14
   216		SUBQ CX, R11		// x[i] - c
   217		SBBQ $0, R12		// ripple the borrow through the next three words
   218		SBBQ $0, R13
   219		SBBQ $0, R14
   220		SBBQ CX, CX		// save CF
   221		NEGQ CX			// c = borrow out as 0 or 1
   222		MOVQ R11, 0(R10)(SI*8)
   223		MOVQ R12, 8(R10)(SI*8)
   224		MOVQ R13, 16(R10)(SI*8)
   225		MOVQ R14, 24(R10)(SI*8)
   226	
   227		ADDQ $4, SI		// i += 4
   228		SUBQ $4, DI		// n -= 4
   229		JGE U4			// if n >= 0 goto U4
   230	
   231	V4:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
   232		JLE E4			// if n <= 0 goto E4
   233	
   234	L4:	// n > 0: one word per iteration
   235		MOVQ 0(R8)(SI*8), R11
   236		SUBQ CX, R11		// x[i] - c
   237		MOVQ R11, 0(R10)(SI*8)
   238		SBBQ CX, CX		// save CF
   239		NEGQ CX			// c = borrow out as 0 or 1
   240	
   241		ADDQ $1, SI		// i++
   242		SUBQ $1, DI		// n--
   243		JG L4			// if n > 0 goto L4
   244	
   245	E4:	MOVQ CX, c+56(FP)	// return c
   246		RET
   247	
   248	
   249	// func shlVU(z, x []Word, s uint) (c Word)
      	//
      	// shlVU shifts the vector x left by s bits into z and returns in c the
      	// bits shifted out of the top word x[n-1], where n = len(z). Words are
      	// processed from the highest index down to 0. For n <= 0 nothing is
      	// stored and c is 0.
      	//
      	// In the comments below ŝ is the complementary shift count (64 - s for
      	// these 64-bit words): SHLQ CX, DX:AX shifts DX left by CX and fills the
      	// vacated low bits of DX from the high bits of AX.
      	// NOTE(review): presumably s is smaller than the word size - confirm
      	// against the Go callers.
      	//
      	// Register use: BX = i, R8 = &x[0], R10 = &z[0], CX = s,
      	// AX = w1 (lower neighbour word), DX = w (word being produced).
   250	TEXT ·shlVU(SB),NOSPLIT,$0
   251		MOVQ z_len+8(FP), BX	// i = len(z)
   252		SUBQ $1, BX		// i--
   253		JL X8b			// i < 0 (n <= 0): nothing to shift
   254	
   255		// n > 0
   256		MOVQ z+0(FP), R10
   257		MOVQ x+24(FP), R8
   258		MOVQ s+48(FP), CX
   259		MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   260		MOVQ $0, DX
   261		SHLQ CX, DX:AX		// w1>>ŝ
   262		MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   263	
   264		CMPQ BX, $0
   265		JLE X8a			// i <= 0: single word, only z[0] remains
   266	
   267		// i > 0
   268	L8:	MOVQ AX, DX		// w = w1
   269		MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   270		SHLQ CX, DX:AX		// w<<s | w1>>ŝ
   271		MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   272		SUBQ $1, BX		// i--
   273		JG L8			// i > 0
   274	
   275		// i <= 0
   276	X8a:	SHLQ CX, AX		// w1<<s
   277		MOVQ AX, (R10)		// z[0] = w1<<s
   278		RET
   279	
   280	X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   281		RET
   282	
   283	
   284	// func shrVU(z, x []Word, s uint) (c Word)
      	//
      	// shrVU shifts the vector x right by s bits into z and returns in c the
      	// bits shifted out of the bottom word x[0] (placed at the top of c).
      	// Words are processed from index 0 up to n-1, where n = len(z). For
      	// n <= 0 nothing is stored and c is 0.
      	//
      	// In the comments below ŝ is the complementary shift count (64 - s for
      	// these 64-bit words): SHRQ CX, DX:AX shifts DX right by CX and fills the
      	// vacated high bits of DX from the low bits of AX.
      	// NOTE(review): presumably s is smaller than the word size - confirm
      	// against the Go callers.
      	//
      	// Register use: R11 = n-1, BX = i, R8 = &x[0], R10 = &z[0], CX = s,
      	// AX = w1 (upper neighbour word), DX = w (word being produced).
   285	TEXT ·shrVU(SB),NOSPLIT,$0
   286		MOVQ z_len+8(FP), R11	// n = len(z)
   287		SUBQ $1, R11		// n--
   288		JL X9b			// n < 0 (len(z) <= 0): nothing to shift
   289	
   290		// n > 0
   291		MOVQ z+0(FP), R10
   292		MOVQ x+24(FP), R8
   293		MOVQ s+48(FP), CX
   294		MOVQ (R8), AX		// w1 = x[0]
   295		MOVQ $0, DX
   296		SHRQ CX, DX:AX		// w1<<ŝ
   297		MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   298	
   299		MOVQ $0, BX		// i = 0
   300		JMP E9			// test the loop condition first
   301	
   302		// i < n-1
   303	L9:	MOVQ AX, DX		// w = w1
   304		MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   305		SHRQ CX, DX:AX		// w>>s | w1<<ŝ
   306		MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   307		ADDQ $1, BX		// i++
   308	
   309	E9:	CMPQ BX, R11
   310		JL L9			// i < n-1
   311	
   312		// i >= n-1
   313	X9a:	SHRQ CX, AX		// w1>>s
   314		MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   315		RET
   316	
   317	X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   318		RET
   319	
   320	
   321	// func mulAddVWW(z, x []Word, y, r Word) (c Word)
      	//
      	// mulAddVWW computes x[i]*y + c for i = 0 .. len(z)-1, stores the low
      	// word of each result in z[i] and carries the high word into the next
      	// step. The carry starts as r; the final carry is returned in c.
      	// When len(z) == 0 no word is touched and c is r.
      	//
      	// Register use: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n,
      	// BX = i, DX:AX = product (MULQ multiplies AX by its operand).
   322	TEXT ·mulAddVWW(SB),NOSPLIT,$0
   323		MOVQ z+0(FP), R10
   324		MOVQ x+24(FP), R8
   325		MOVQ y+48(FP), R9
   326		MOVQ r+56(FP), CX	// c = r
   327		MOVQ z_len+8(FP), R11	// n = len(z)
   328		MOVQ $0, BX		// i = 0
   329		
   330		CMPQ R11, $4
   331		JL E5			// n < 4: skip the unrolled loop
   332		
   333	U5:	// i+4 <= n
   334		// regular loop body unrolled 4x
   335		MOVQ (0*8)(R8)(BX*8), AX
   336		MULQ R9			// DX:AX = x[i] * y
   337		ADDQ CX, AX		// low word += c
   338		ADCQ $0, DX		// fold the carry of that addition into the high word
   339		MOVQ AX, (0*8)(R10)(BX*8)
   340		MOVQ DX, CX		// c = high word
   341		MOVQ (1*8)(R8)(BX*8), AX
   342		MULQ R9
   343		ADDQ CX, AX
   344		ADCQ $0, DX
   345		MOVQ AX, (1*8)(R10)(BX*8)
   346		MOVQ DX, CX
   347		MOVQ (2*8)(R8)(BX*8), AX
   348		MULQ R9
   349		ADDQ CX, AX
   350		ADCQ $0, DX
   351		MOVQ AX, (2*8)(R10)(BX*8)
   352		MOVQ DX, CX
   353		MOVQ (3*8)(R8)(BX*8), AX
   354		MULQ R9
   355		ADDQ CX, AX
   356		ADCQ $0, DX
   357		MOVQ AX, (3*8)(R10)(BX*8)
   358		MOVQ DX, CX
   359		ADDQ $4, BX		// i += 4
   360		
   361		LEAQ 4(BX), DX		// DX = i + 4 (DX is free: the carry already lives in CX)
   362		CMPQ DX, R11
   363		JLE U5			// i+4 <= n: another full group of 4
   364		JMP E5
   365	
   366	L5:	MOVQ (R8)(BX*8), AX	// one word per iteration
   367		MULQ R9
   368		ADDQ CX, AX
   369		ADCQ $0, DX
   370		MOVQ AX, (R10)(BX*8)
   371		MOVQ DX, CX
   372		ADDQ $1, BX		// i++
   373	
   374	E5:	CMPQ BX, R11		// i < n
   375		JL L5
   376	
   377		MOVQ CX, c+64(FP)	// return c
   378		RET
   379	
   380	
   381	// func addMulVVW(z, x []Word, y Word) (c Word)
      	//
      	// addMulVVW computes z[i] + x[i]*y + c for i = 0 .. len(z)-1, stores
      	// the low word of each result back into z[i] and carries the high word
      	// into the next step. The carry starts at 0; the final carry is
      	// returned in c.
      	//
      	// Register use: R10 = &z[0], R8 = &x[0], R9 = y, R11 = n,
      	// R12 = n rounded down to an even count, BX = i, CX = c,
      	// DX:AX = product (MULQ multiplies AX by its operand).
   382	TEXT ·addMulVVW(SB),NOSPLIT,$0
   383		MOVQ z+0(FP), R10
   384		MOVQ x+24(FP), R8
   385		MOVQ y+48(FP), R9
   386		MOVQ z_len+8(FP), R11	// n = len(z)
   387		MOVQ $0, BX		// i = 0
   388		MOVQ $0, CX		// c = 0
   389		MOVQ R11, R12
   390		ANDQ $-2, R12		// R12 = n with the lowest bit cleared
   391		CMPQ R11, $2
   392		JAE A6			// n >= 2 (unsigned): run the 2x unrolled loop
   393		JMP E6
   394	
   395	A6:	// loop body unrolled 2x, runs while i < R12
   396		MOVQ (R8)(BX*8), AX
   397		MULQ R9			// DX:AX = x[i] * y
   398		ADDQ (R10)(BX*8), AX	// low word += z[i]
   399		ADCQ $0, DX
   400		ADDQ CX, AX		// low word += c
   401		ADCQ $0, DX
   402		MOVQ DX, CX		// c = high word
   403		MOVQ AX, (R10)(BX*8)	// z[i] = low word
   404	
   405		MOVQ (8)(R8)(BX*8), AX
   406		MULQ R9			// DX:AX = x[i+1] * y
   407		ADDQ (8)(R10)(BX*8), AX
   408		ADCQ $0, DX
   409		ADDQ CX, AX
   410		ADCQ $0, DX
   411		MOVQ DX, CX
   412		MOVQ AX, (8)(R10)(BX*8)
   413	
   414		ADDQ $2, BX		// i += 2
   415		CMPQ BX, R12
   416		JL A6
   417		JMP E6
   418	
   419	L6:	MOVQ (R8)(BX*8), AX	// one word per iteration
   420		MULQ R9
   421		ADDQ CX, AX		// low word += c
   422		ADCQ $0, DX
   423		ADDQ AX, (R10)(BX*8)	// z[i] += low word (added directly in memory)
   424		ADCQ $0, DX
   425		MOVQ DX, CX		// c = high word
   426		ADDQ $1, BX		// i++
   427	
   428	E6:	CMPQ BX, R11		// i < n
   429		JL L6
   430	
   431		MOVQ CX, c+56(FP)	// return c
   432		RET
   433	
   434	
   435	// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
      	//
      	// divWVW divides the multi-word value x, with xn as the incoming high
      	// word, by the single word y. Going from i = len(z)-1 down to 0 it
      	// divides r:x[i] by y, stores the quotient in z[i] and keeps the
      	// remainder as the high word of the next step. The last remainder is
      	// returned in r; when len(z) == 0 that is xn unchanged.
      	// NOTE(review): DIVQ traps when y == 0 or a quotient does not fit in one
      	// word; presumably callers guarantee xn < y - confirm at the call sites.
   436	TEXT ·divWVW(SB),NOSPLIT,$0
   437		MOVQ z_len+8(FP), BX	// i = len(z)
   438		MOVQ y+56(FP), CX	// CX = divisor y
   439		MOVQ x+32(FP), SI	// SI = &x[0]
   440		MOVQ z+0(FP), DI	// DI = &z[0]
   441		MOVQ xn+24(FP), DX	// r = xn
   442		JMP E7			// decrement and test before the first division
   443	
   444	L7:	MOVQ (SI)(BX*8), AX	// DX:AX = r:x[i]
   445		DIVQ CX			// AX = quotient, DX = remainder r
   446		MOVQ AX, (DI)(BX*8)	// z[i] = quotient
   447	
   448	E7:	SUBQ $1, BX		// i--
   449		JGE L7			// keep going while i >= 0
   450	
   451		MOVQ DX, r+64(FP)	// return r
   452		RET

View as plain text