...
Run Format

Text file src/math/big/arith_amd64.s

Documentation: math/big

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// func mulWW(x, y Word) (z1, z0 Word) -- z1:z0 = x*y, the full double-word product
    13	TEXT ·mulWW(SB),NOSPLIT,$0
    14		MOVQ y+8(FP), AX	// MULQ multiplies AX by its operand
    15		MULQ x+0(FP)		// DX:AX = y*x
    16		MOVQ AX, z0+24(FP)	// low word of the product
    17		MOVQ DX, z1+16(FP)	// high word of the product
    18		RET
    19	
    20	
    21	// func divWW(x1, x0, y Word) (q, r Word) -- q, r = quotient and remainder of x1:x0 divided by y
    22	TEXT ·divWW(SB),NOSPLIT,$0
    23		MOVQ x0+8(FP), AX	// low word of the dividend
    24		MOVQ x1+0(FP), DX	// high word of the dividend
    25		DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
    26		MOVQ DX, r+32(FP)
    27		MOVQ AX, q+24(FP)
    28		RET
    29	
    30	// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31	// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32	// This is faster than using rotate instructions.
    33	
    34	// func addVV(z, x, y []Word) (c Word) -- z = x + y over len(z) words; c is the carry out (0 or 1)
    35	TEXT ·addVV(SB),NOSPLIT,$0
    36		MOVQ z+0(FP), DI	// DI = &z[0]
    37		MOVQ x+24(FP), SI	// SI = &x[0]
    38		MOVQ y+48(FP), DX	// DX = &y[0]
    39		MOVQ z_len+8(FP), BX	// BX = n = len(z)
    40	
    41		XORQ CX, CX		// i = 0
    42		XORQ R8, R8		// saved carry: 0 = clear, -1 = set
    43	
    44		// Change JL to JMP below to bypass the 4x unrolled loop.
    45		SUBQ $4, BX		// n -= 4
    46		JL addVV_rest		// fewer than 4 words: skip the unrolled loop
    47	
    48	addVV_x4:	// n >= 0: at least 4 words remain
    49		// four words per iteration on a single carry chain
    50		ADDQ R8, R8		// CF = saved carry
    51		MOVQ 0(SI)(CX*8), AX
    52		MOVQ 8(SI)(CX*8), R9
    53		MOVQ 16(SI)(CX*8), R10
    54		MOVQ 24(SI)(CX*8), R11
    55		ADCQ 0(DX)(CX*8), AX
    56		ADCQ 8(DX)(CX*8), R9
    57		ADCQ 16(DX)(CX*8), R10
    58		ADCQ 24(DX)(CX*8), R11
    59		MOVQ AX, 0(DI)(CX*8)
    60		MOVQ R9, 8(DI)(CX*8)
    61		MOVQ R10, 16(DI)(CX*8)
    62		MOVQ R11, 24(DI)(CX*8)
    63		SBBQ R8, R8		// saved carry = -CF
    64	
    65		ADDQ $4, CX		// i += 4
    66		SUBQ $4, BX		// n -= 4
    67		JGE addVV_x4		// loop while n >= 0
    68	
    69	addVV_rest:	ADDQ $4, BX		// undo the bias: n = words still to do
    70		JLE addVV_ret		// nothing left
    71	
    72	addVV_x1:	// n > 0: one word per iteration
    73		ADDQ R8, R8		// CF = saved carry
    74		MOVQ 0(SI)(CX*8), AX
    75		ADCQ 0(DX)(CX*8), AX
    76		MOVQ AX, 0(DI)(CX*8)
    77		SBBQ R8, R8		// saved carry = -CF
    78	
    79		ADDQ $1, CX		// i++
    80		SUBQ $1, BX		// n--
    81		JG addVV_x1		// loop while n > 0
    82	
    83	addVV_ret:	NEGQ R8			// -1/0 -> 1/0
    84		MOVQ R8, c+72(FP)	// return c
    85		RET
    86	
    87	
    88	// func subVV(z, x, y []Word) (c Word) -- z = x - y over len(z) words; c is the borrow out (0 or 1)
    89	// (same algorithm as addVV, with SBBQ in place of ADCQ)
    90	TEXT ·subVV(SB),NOSPLIT,$0
    91		MOVQ z+0(FP), DI	// DI = &z[0]
    92		MOVQ x+24(FP), SI	// SI = &x[0]
    93		MOVQ y+48(FP), DX	// DX = &y[0]
    94		MOVQ z_len+8(FP), BX	// BX = n = len(z)
    95	
    96		XORQ CX, CX		// i = 0
    97		XORQ R8, R8		// saved borrow: 0 = clear, -1 = set
    98	
    99		// Change JL to JMP below to bypass the 4x unrolled loop.
   100		SUBQ $4, BX		// n -= 4
   101		JL subVV_rest		// fewer than 4 words: skip the unrolled loop
   102	
   103	subVV_x4:	// n >= 0: at least 4 words remain
   104		// four words per iteration on a single borrow chain
   105		ADDQ R8, R8		// CF = saved borrow
   106		MOVQ 0(SI)(CX*8), AX
   107		MOVQ 8(SI)(CX*8), R9
   108		MOVQ 16(SI)(CX*8), R10
   109		MOVQ 24(SI)(CX*8), R11
   110		SBBQ 0(DX)(CX*8), AX
   111		SBBQ 8(DX)(CX*8), R9
   112		SBBQ 16(DX)(CX*8), R10
   113		SBBQ 24(DX)(CX*8), R11
   114		MOVQ AX, 0(DI)(CX*8)
   115		MOVQ R9, 8(DI)(CX*8)
   116		MOVQ R10, 16(DI)(CX*8)
   117		MOVQ R11, 24(DI)(CX*8)
   118		SBBQ R8, R8		// saved borrow = -CF
   119	
   120		ADDQ $4, CX		// i += 4
   121		SUBQ $4, BX		// n -= 4
   122		JGE subVV_x4		// loop while n >= 0
   123	
   124	subVV_rest:	ADDQ $4, BX		// undo the bias: n = words still to do
   125		JLE subVV_ret		// nothing left
   126	
   127	subVV_x1:	// n > 0: one word per iteration
   128		ADDQ R8, R8		// CF = saved borrow
   129		MOVQ 0(SI)(CX*8), AX
   130		SBBQ 0(DX)(CX*8), AX
   131		MOVQ AX, 0(DI)(CX*8)
   132		SBBQ R8, R8		// saved borrow = -CF
   133	
   134		ADDQ $1, CX		// i++
   135		SUBQ $1, BX		// n--
   136		JG subVV_x1		// loop while n > 0
   137	
   138	subVV_ret:	NEGQ R8			// -1/0 -> 1/0
   139		MOVQ R8, c+72(FP)	// return c
   140		RET
   141	
   142	
   143	// func addVW(z, x []Word, y Word) (c Word) -- z = x + y for a single word y; c = final carry (c = y when len(z) == 0)
   144	TEXT ·addVW(SB),NOSPLIT,$0
   145		MOVQ z+0(FP), DI	// DI = &z[0]
   146		MOVQ x+24(FP), SI	// SI = &x[0]
   147		MOVQ z_len+8(FP), BX	// BX = n = len(z)
   148		MOVQ y+48(FP), R8	// running carry c, seeded with y
   149	
   150		XORQ CX, CX		// i = 0
   151	
   152		// Change JL to JMP below to bypass the 4x unrolled loop.
   153		SUBQ $4, BX		// n -= 4
   154		JL addVW_rest		// fewer than 4 words: skip the unrolled loop
   155	
   156	addVW_x4:	// n >= 0: at least 4 words remain
   157		// four words per iteration: add c to the first, ripple the carry through the rest
   158		MOVQ 0(SI)(CX*8), AX
   159		MOVQ 8(SI)(CX*8), R9
   160		MOVQ 16(SI)(CX*8), R10
   161		MOVQ 24(SI)(CX*8), R11
   162		ADDQ R8, AX
   163		ADCQ $0, R9
   164		ADCQ $0, R10
   165		ADCQ $0, R11
   166		MOVQ AX, 0(DI)(CX*8)	// MOVQ leaves CF untouched
   167		MOVQ R9, 8(DI)(CX*8)
   168		MOVQ R10, 16(DI)(CX*8)
   169		MOVQ R11, 24(DI)(CX*8)
   170		SBBQ R8, R8		// c = -CF
   171		NEGQ R8			// c = CF (0 or 1)
   172	
   173		ADDQ $4, CX		// i += 4
   174		SUBQ $4, BX		// n -= 4
   175		JGE addVW_x4		// loop while n >= 0
   176	
   177	addVW_rest:	ADDQ $4, BX		// undo the bias: n = words still to do
   178		JLE addVW_ret		// nothing left
   179	
   180	addVW_x1:	// n > 0: one word per iteration
   181		ADDQ 0(SI)(CX*8), R8	// c += x[i]
   182		MOVQ R8, 0(DI)(CX*8)	// z[i] = low word of the sum
   183		SBBQ R8, R8		// c = -CF
   184		NEGQ R8			// c = CF (0 or 1)
   185	
   186		ADDQ $1, CX		// i++
   187		SUBQ $1, BX		// n--
   188		JG addVW_x1		// loop while n > 0
   189	
   190	addVW_ret:	MOVQ R8, c+56(FP)	// return c
   191		RET
   192	
   193	
   194	// func subVW(z, x []Word, y Word) (c Word) -- z = x - y for a single word y; c = final borrow (c = y when len(z) == 0)
   195	// (same algorithm as addVW, with SUBQ/SBBQ in place of ADDQ/ADCQ)
   196	TEXT ·subVW(SB),NOSPLIT,$0
   197		MOVQ z+0(FP), DI	// DI = &z[0]
   198		MOVQ x+24(FP), SI	// SI = &x[0]
   199		MOVQ z_len+8(FP), BX	// BX = n = len(z)
   200		MOVQ y+48(FP), R8	// running borrow c, seeded with y
   201	
   202		XORQ CX, CX		// i = 0
   203	
   204		// Change JL to JMP below to bypass the 4x unrolled loop.
   205		SUBQ $4, BX		// n -= 4
   206		JL subVW_rest		// fewer than 4 words: skip the unrolled loop
   207	
   208	subVW_x4:	// n >= 0: at least 4 words remain
   209		// four words per iteration: subtract c from the first, ripple the borrow through the rest
   210		MOVQ 0(SI)(CX*8), AX
   211		MOVQ 8(SI)(CX*8), R9
   212		MOVQ 16(SI)(CX*8), R10
   213		MOVQ 24(SI)(CX*8), R11
   214		SUBQ R8, AX
   215		SBBQ $0, R9
   216		SBBQ $0, R10
   217		SBBQ $0, R11
   218		MOVQ AX, 0(DI)(CX*8)	// MOVQ leaves CF untouched
   219		MOVQ R9, 8(DI)(CX*8)
   220		MOVQ R10, 16(DI)(CX*8)
   221		MOVQ R11, 24(DI)(CX*8)
   222		SBBQ R8, R8		// c = -CF
   223		NEGQ R8			// c = CF (0 or 1)
   224	
   225		ADDQ $4, CX		// i += 4
   226		SUBQ $4, BX		// n -= 4
   227		JGE subVW_x4		// loop while n >= 0
   228	
   229	subVW_rest:	ADDQ $4, BX		// undo the bias: n = words still to do
   230		JLE subVW_ret		// nothing left
   231	
   232	subVW_x1:	// n > 0: one word per iteration
   233		MOVQ 0(SI)(CX*8), AX
   234		SUBQ R8, AX		// x[i] - c
   235		MOVQ AX, 0(DI)(CX*8)
   236		SBBQ R8, R8		// c = -CF
   237		NEGQ R8			// c = CF (0 or 1)
   238	
   239		ADDQ $1, CX		// i++
   240		SUBQ $1, BX		// n--
   241		JG subVW_x1		// loop while n > 0
   242	
   243	subVW_ret:	MOVQ R8, c+56(FP)	// return c
   244		RET
   245	
   246	
   247	// func shlVU(z, x []Word, s uint) (c Word) -- z = x << s, from the top word down; c = bits shifted out of x[len(z)-1]
   248	TEXT ·shlVU(SB),NOSPLIT,$0
   249		MOVQ z_len+8(FP), R9	// i = len(z)
   250		SUBQ $1, R9		// i = n-1
   251		JL shlVU_empty		// n <= 0: nothing to shift
   252	
   253		// n > 0: the bits pushed out of x[n-1] become the result c
   254		MOVQ s+48(FP), CX	// CX = shift count
   255		MOVQ x+24(FP), SI	// SI = &x[0]
   256		MOVQ z+0(FP), DI	// DI = &z[0]
   257		MOVQ (SI)(R9*8), AX	// w1 = x[n-1]
   258		XORQ DX, DX
   259		SHLQ CX, DX:AX		// DX = w1 >> (64-s)
   260		MOVQ DX, c+56(FP)
   261	
   262		TESTQ R9, R9
   263		JLE shlVU_last		// i <= 0: only one word
   264	
   265		// i > 0
   266	shlVU_loop:	MOVQ AX, DX		// w = w1
   267		MOVQ -8(SI)(R9*8), AX	// w1 = x[i-1]
   268		SHLQ CX, DX:AX		// DX = w<<s | w1>>(64-s)
   269		MOVQ DX, (DI)(R9*8)	// z[i] = DX
   270		SUBQ $1, R9		// i--
   271		JG shlVU_loop		// loop while i > 0
   272	
   273		// i <= 0: the bottom word has nothing below it to shift in
   274	shlVU_last:	SHLQ CX, AX		// w1<<s
   275		MOVQ AX, (DI)		// z[0] = w1<<s
   276		RET
   277	
   278	shlVU_empty:	MOVQ $0, c+56(FP)
   279		RET
   280	
   281	
   282	// func shrVU(z, x []Word, s uint) (c Word) -- z = x >> s, from the bottom word up; c = bits shifted out of x[0]
   283	TEXT ·shrVU(SB),NOSPLIT,$0
   284		MOVQ z_len+8(FP), R9
   285		SUBQ $1, R9		// R9 = n-1, index of the top word
   286		JL shrVU_empty		// n <= 0: nothing to shift
   287	
   288		// n > 0: the bits pushed out of x[0] become the result c
   289		MOVQ s+48(FP), CX	// CX = shift count
   290		MOVQ x+24(FP), SI	// SI = &x[0]
   291		MOVQ z+0(FP), DI	// DI = &z[0]
   292		MOVQ (SI), AX		// w1 = x[0]
   293		XORQ DX, DX
   294		SHRQ CX, DX:AX		// DX = w1 << (64-s)
   295		MOVQ DX, c+56(FP)
   296	
   297		XORQ R10, R10		// i = 0
   298		JMP shrVU_test
   299	
   300		// i < n-1
   301	shrVU_loop:	MOVQ AX, DX		// w = w1
   302		MOVQ 8(SI)(R10*8), AX	// w1 = x[i+1]
   303		SHRQ CX, DX:AX		// DX = w>>s | w1<<(64-s)
   304		MOVQ DX, (DI)(R10*8)	// z[i] = DX
   305		ADDQ $1, R10		// i++
   306	
   307	shrVU_test:	CMPQ R10, R9
   308		JL shrVU_loop		// loop while i < n-1
   309	
   310		// i >= n-1: the top word has nothing above it to shift in
   311	shrVU_last:	SHRQ CX, AX		// w1>>s
   312		MOVQ AX, (DI)(R9*8)	// z[n-1] = w1>>s
   313		RET
   314	
   315	shrVU_empty:	MOVQ $0, c+56(FP)
   316		RET
   317	
   318	
   319	// func mulAddVWW(z, x []Word, y, r Word) (c Word) -- z = x*y + r over len(z) words; c is the word carried out of the top
   320	TEXT ·mulAddVWW(SB),NOSPLIT,$0
   321		MOVQ z+0(FP), DI	// DI = &z[0]
   322		MOVQ x+24(FP), SI	// SI = &x[0]
   323		MOVQ z_len+8(FP), BX	// BX = n = len(z)
   324		MOVQ y+48(FP), R8	// R8 = multiplier y
   325		MOVQ r+56(FP), R9	// running carry c, seeded with r
   326		XORQ CX, CX		// i = 0
   327	
   328		CMPQ BX, $4
   329		JL mulAddVWW_test	// n < 4: word-at-a-time loop only
   330	
   331	mulAddVWW_x4:	// i+4 <= n
   332		// four words per iteration; each step: z[i] = lo(x[i]*y + c), c = hi(x[i]*y + c)
   333		MOVQ 0(SI)(CX*8), AX
   334		MULQ R8
   335		ADDQ R9, AX
   336		ADCQ $0, DX
   337		MOVQ DX, R9
   338		MOVQ AX, 0(DI)(CX*8)
   339		MOVQ 8(SI)(CX*8), AX
   340		MULQ R8
   341		ADDQ R9, AX
   342		ADCQ $0, DX
   343		MOVQ DX, R9
   344		MOVQ AX, 8(DI)(CX*8)
   345		MOVQ 16(SI)(CX*8), AX
   346		MULQ R8
   347		ADDQ R9, AX
   348		ADCQ $0, DX
   349		MOVQ DX, R9
   350		MOVQ AX, 16(DI)(CX*8)
   351		MOVQ 24(SI)(CX*8), AX
   352		MULQ R8
   353		ADDQ R9, AX
   354		ADCQ $0, DX
   355		MOVQ DX, R9
   356		MOVQ AX, 24(DI)(CX*8)
   357		ADDQ $4, CX		// i += 4
   358	
   359		LEAQ 4(CX), AX		// AX = i+4 (AX is dead here)
   360		CMPQ AX, BX
   361		JLE mulAddVWW_x4	// another full group of 4 fits
   362		JMP mulAddVWW_test
   363	
   364	mulAddVWW_x1:	MOVQ (SI)(CX*8), AX
   365		MULQ R8			// DX:AX = x[i]*y
   366		ADDQ R9, AX
   367		ADCQ $0, DX		// DX:AX += c
   368		MOVQ DX, R9		// c = high word
   369		MOVQ AX, (DI)(CX*8)	// z[i] = low word
   370		ADDQ $1, CX		// i++
   371	
   372	mulAddVWW_test:	CMPQ CX, BX		// i < n
   373		JL mulAddVWW_x1
   374	
   375		MOVQ R9, c+64(FP)
   376		RET
   377	
   378	
   379	// func addMulVVW(z, x []Word, y Word) (c Word) -- z += x*y over len(z) words; c is the word carried out of the top
   380	TEXT ·addMulVVW(SB),NOSPLIT,$0
   381		MOVQ z+0(FP), DI	// DI = &z[0]
   382		MOVQ x+24(FP), SI	// SI = &x[0]
   383		MOVQ z_len+8(FP), BX	// BX = n = len(z)
   384		MOVQ y+48(FP), R8	// R8 = multiplier y
   385		XORQ R9, R9		// c = 0
   386		XORQ CX, CX		// i = 0
   387		MOVQ BX, R10
   388		ANDQ $-2, R10		// R10 = n with the low bit cleared (largest even count <= n)
   389		CMPQ BX, $2
   390		JB addMulVVW_test	// n < 2 (unsigned): word-at-a-time loop only
   391		// n >= 2: fall through into the two-words-per-iteration loop
   392	
   393	addMulVVW_x2:
   394		MOVQ (SI)(CX*8), AX
   395		MULQ R8			// DX:AX = x[i]*y
   396		ADDQ (DI)(CX*8), AX
   397		ADCQ $0, DX		// DX:AX += z[i]
   398		ADDQ R9, AX
   399		ADCQ $0, DX		// DX:AX += c
   400		MOVQ AX, (DI)(CX*8)	// z[i] = low word
   401		MOVQ DX, R9		// c = high word
   402	
   403		MOVQ 8(SI)(CX*8), AX
   404		MULQ R8			// DX:AX = x[i+1]*y
   405		ADDQ 8(DI)(CX*8), AX
   406		ADCQ $0, DX		// DX:AX += z[i+1]
   407		ADDQ R9, AX
   408		ADCQ $0, DX		// DX:AX += c
   409		MOVQ AX, 8(DI)(CX*8)	// z[i+1] = low word
   410		MOVQ DX, R9		// c = high word
   411	
   412		ADDQ $2, CX		// i += 2
   413		CMPQ CX, R10
   414		JL addMulVVW_x2		// loop while i is below the even bound
   415		JMP addMulVVW_test
   416	
   417	addMulVVW_x1:	MOVQ (SI)(CX*8), AX
   418		MULQ R8			// DX:AX = x[i]*y
   419		ADDQ R9, AX
   420		ADCQ $0, DX		// DX:AX += c
   421		ADDQ AX, (DI)(CX*8)	// z[i] += low word, directly in memory
   422		ADCQ $0, DX		// fold that carry into the high word
   423		MOVQ DX, R9		// c = high word
   424		ADDQ $1, CX		// i++
   425	
   426	addMulVVW_test:	CMPQ CX, BX		// i < n
   427		JL addMulVVW_x1
   428	
   429		MOVQ R9, c+56(FP)
   430		RET
   431	
   432	
   433	// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -- z = (xn:x) / y by long division from the top word down; r is the remainder
   434	TEXT ·divWVW(SB),NOSPLIT,$0
   435		MOVQ z+0(FP), DI	// DI = &z[0]
   436		MOVQ x+32(FP), SI	// SI = &x[0]
   437		MOVQ y+56(FP), R8	// R8 = divisor y
   438		MOVQ xn+24(FP), DX	// r = xn
   439		MOVQ z_len+8(FP), CX	// i = len(z)
   440		JMP divWVW_next
   441	
   442	divWVW_step:	MOVQ (SI)(CX*8), AX
   443		DIVQ R8			// AX = (r:x[i]) / y, DX = r = (r:x[i]) % y
   444		MOVQ AX, (DI)(CX*8)	// z[i] = quotient word
   445	
   446	divWVW_next:	SUBQ $1, CX		// i--
   447		JGE divWVW_step		// loop while i >= 0
   448	
   449		MOVQ DX, r+64(FP)
   450		RET

View as plain text