...
Run Format

Text file src/math/big/arith_amd64.s

Documentation: math/big

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// func mulWW(x, y Word) (z1, z0 Word)
      	// mulWW forms the full double-word unsigned product of x and y.
      	// The high word of the product is returned in z1 and the low word in z0.
    13	TEXT ·mulWW(SB),NOSPLIT,$0
    14		MOVQ y+8(FP), AX	// AX = y
    15		MULQ x+0(FP)		// DX:AX = y * x
    16		MOVQ AX, z0+24(FP)	// z0 = low word
    17		MOVQ DX, z1+16(FP)	// z1 = high word
    18		RET
    19	
    20	
    21	// func divWW(x1, x0, y Word) (q, r Word)
      	// divWW divides the double word x1:x0 by y with a single DIVQ and returns
      	// the quotient in q and the remainder in r.
      	// The routine performs no check of its own for y == 0 or for a quotient
      	// that does not fit in one word.
    22	TEXT ·divWW(SB),NOSPLIT,$0
    23		MOVQ x0+8(FP), AX	// AX = low word of the dividend
    24		MOVQ x1+0(FP), DX	// DX = high word of the dividend
    25		DIVQ y+16(FP)		// AX = quotient, DX = remainder
    26		MOVQ DX, r+32(FP)	// r = remainder
    27		MOVQ AX, q+24(FP)	// q = quotient
    28		RET
    29	
    30	// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31	// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32	// This is faster than using rotate instructions.
    33	
    34	// func addVV(z, x, y []Word) (c Word)
      	// addVV adds two vectors word by word, z[i] = x[i] + y[i] + carry, for
      	// i = 0 .. len(z)-1, and returns the final carry (0 or 1) in c.
      	// Only len(z) is read. x and y are assumed to be at least that long
      	// (TODO confirm at the callers).
      	// Register use: DI = n (words still to do), SI = i, R8 = &x[0],
      	// R9 = &y[0], R10 = &z[0], CX = saved carry (0 or -1), R11-R14 = scratch.
    35	TEXT ·addVV(SB),NOSPLIT,$0
    36		MOVQ z_len+8(FP), DI	// n = len(z)
    37		MOVQ x+24(FP), R8
    38		MOVQ y+48(FP), R9
    39		MOVQ z+0(FP), R10
    40	
    41		MOVQ $0, CX		// c = 0
    42		MOVQ $0, SI		// i = 0
    43	
    44		// s/JL/JMP/ below to disable the unrolled loop
    45		SUBQ $4, DI		// n -= 4
    46		JL V1			// if n < 0 (fewer than 4 words) goto V1
    47	
    48	U1:	// n >= 0
    49		// regular loop body unrolled 4x
    50		ADDQ CX, CX		// restore CF
    51		MOVQ 0(R8)(SI*8), R11	// load x[i .. i+3], MOVQ leaves CF alone
    52		MOVQ 8(R8)(SI*8), R12
    53		MOVQ 16(R8)(SI*8), R13
    54		MOVQ 24(R8)(SI*8), R14
    55		ADCQ 0(R9)(SI*8), R11	// add y[i .. i+3], the carry ripples through
    56		ADCQ 8(R9)(SI*8), R12
    57		ADCQ 16(R9)(SI*8), R13
    58		ADCQ 24(R9)(SI*8), R14
    59		MOVQ R11, 0(R10)(SI*8)	// store z[i .. i+3]
    60		MOVQ R12, 8(R10)(SI*8)
    61		MOVQ R13, 16(R10)(SI*8)
    62		MOVQ R14, 24(R10)(SI*8)
    63		SBBQ CX, CX		// save CF, the index updates below overwrite it
    64	
    65		ADDQ $4, SI		// i += 4
    66		SUBQ $4, DI		// n -= 4
    67		JGE U1			// if n >= 0 goto U1
    68	
    69	V1:	ADDQ $4, DI		// n += 4
    70		JLE E1			// if n <= 0 goto E1
    71	
    72	L1:	// n > 0
      		// tail loop, one word per iteration
    73		ADDQ CX, CX		// restore CF
    74		MOVQ 0(R8)(SI*8), R11
    75		ADCQ 0(R9)(SI*8), R11
    76		MOVQ R11, 0(R10)(SI*8)
    77		SBBQ CX, CX		// save CF
    78	
    79		ADDQ $1, SI		// i++
    80		SUBQ $1, DI		// n--
    81		JG L1			// if n > 0 goto L1
    82	
    83	E1:	NEGQ CX			// CX is 0 or -1 here, negate to get 0 or 1
    84		MOVQ CX, c+72(FP)	// return c
    85		RET
    86	
    87	
    88	// func subVV(z, x, y []Word) (c Word)
    89	// (same as addVV except for SBBQ instead of ADCQ and label names)
      	// subVV subtracts word by word, z[i] = x[i] - y[i] - borrow, for
      	// i = 0 .. len(z)-1, and returns the final borrow (0 or 1) in c.
      	// Only len(z) is read. x and y are assumed to be at least that long
      	// (TODO confirm at the callers).
      	// Register use: DI = n (words still to do), SI = i, R8 = &x[0],
      	// R9 = &y[0], R10 = &z[0], CX = saved borrow (0 or -1), R11-R14 = scratch.
    90	TEXT ·subVV(SB),NOSPLIT,$0
    91		MOVQ z_len+8(FP), DI	// n = len(z)
    92		MOVQ x+24(FP), R8
    93		MOVQ y+48(FP), R9
    94		MOVQ z+0(FP), R10
    95	
    96		MOVQ $0, CX		// c = 0
    97		MOVQ $0, SI		// i = 0
    98	
    99		// s/JL/JMP/ below to disable the unrolled loop
   100		SUBQ $4, DI		// n -= 4
   101		JL V2			// if n < 0 (fewer than 4 words) goto V2
   102	
   103	U2:	// n >= 0
   104		// regular loop body unrolled 4x
   105		ADDQ CX, CX		// restore CF
   106		MOVQ 0(R8)(SI*8), R11	// load x[i .. i+3], MOVQ leaves CF alone
   107		MOVQ 8(R8)(SI*8), R12
   108		MOVQ 16(R8)(SI*8), R13
   109		MOVQ 24(R8)(SI*8), R14
   110		SBBQ 0(R9)(SI*8), R11	// subtract y[i .. i+3], the borrow ripples through
   111		SBBQ 8(R9)(SI*8), R12
   112		SBBQ 16(R9)(SI*8), R13
   113		SBBQ 24(R9)(SI*8), R14
   114		MOVQ R11, 0(R10)(SI*8)	// store z[i .. i+3]
   115		MOVQ R12, 8(R10)(SI*8)
   116		MOVQ R13, 16(R10)(SI*8)
   117		MOVQ R14, 24(R10)(SI*8)
   118		SBBQ CX, CX		// save CF, the index updates below overwrite it
   119	
   120		ADDQ $4, SI		// i += 4
   121		SUBQ $4, DI		// n -= 4
   122		JGE U2			// if n >= 0 goto U2
   123	
   124	V2:	ADDQ $4, DI		// n += 4
   125		JLE E2			// if n <= 0 goto E2
   126	
   127	L2:	// n > 0
      		// tail loop, one word per iteration
   128		ADDQ CX, CX		// restore CF
   129		MOVQ 0(R8)(SI*8), R11
   130		SBBQ 0(R9)(SI*8), R11
   131		MOVQ R11, 0(R10)(SI*8)
   132		SBBQ CX, CX		// save CF
   133	
   134		ADDQ $1, SI		// i++
   135		SUBQ $1, DI		// n--
   136		JG L2			// if n > 0 goto L2
   137	
   138	E2:	NEGQ CX			// CX is 0 or -1 here, negate to get 0 or 1
   139		MOVQ CX, c+72(FP)	// return c
   140		RET
   141	
   142	
   143	// func addVW(z, x []Word, y Word) (c Word)
      	// addVW adds the single word y to the vector x: z[i] = x[i] + c for
      	// i = 0 .. len(z)-1, where c starts out as y and afterwards holds the
      	// carry out of the previous word. The final c is returned. For
      	// len(z) == 0 no word is touched and c is returned as y.
      	// Only len(z) is read. x is assumed to be at least that long
      	// (TODO confirm at the callers).
      	// Register use: DI = n (words still to do), SI = i, R8 = &x[0],
      	// R10 = &z[0], CX = c, R11-R14 = scratch.
   144	TEXT ·addVW(SB),NOSPLIT,$0
   145		MOVQ z_len+8(FP), DI	// n = len(z)
   146		MOVQ x+24(FP), R8
   147		MOVQ y+48(FP), CX	// c = y
   148		MOVQ z+0(FP), R10
   149	
   150		MOVQ $0, SI		// i = 0
   151	
   152		// s/JL/JMP/ below to disable the unrolled loop
   153		SUBQ $4, DI		// n -= 4
   154		JL V3			// if n < 0 (fewer than 4 words) goto V3
   155	
   156	U3:	// n >= 0
   157		// regular loop body unrolled 4x
   158		MOVQ 0(R8)(SI*8), R11	// load x[i .. i+3]
   159		MOVQ 8(R8)(SI*8), R12
   160		MOVQ 16(R8)(SI*8), R13
   161		MOVQ 24(R8)(SI*8), R14
   162		ADDQ CX, R11		// x[i] + c
   163		ADCQ $0, R12		// propagate the carry through the next three words
   164		ADCQ $0, R13
   165		ADCQ $0, R14
   166		SBBQ CX, CX		// save CF
   167		NEGQ CX			// c = carry out as 0 or 1
   168		MOVQ R11, 0(R10)(SI*8)	// store z[i .. i+3]
   169		MOVQ R12, 8(R10)(SI*8)
   170		MOVQ R13, 16(R10)(SI*8)
   171		MOVQ R14, 24(R10)(SI*8)
   172	
   173		ADDQ $4, SI		// i += 4
   174		SUBQ $4, DI		// n -= 4
   175		JGE U3			// if n >= 0 goto U3
   176	
   177	V3:	ADDQ $4, DI		// n += 4
   178		JLE E3			// if n <= 0 goto E3
   179	
   180	L3:	// n > 0
      		// tail loop, one word per iteration
   181		ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
   182		MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
   183		SBBQ CX, CX		// save CF
   184		NEGQ CX			// c = carry out as 0 or 1
   185	
   186		ADDQ $1, SI		// i++
   187		SUBQ $1, DI		// n--
   188		JG L3			// if n > 0 goto L3
   189	
   190	E3:	MOVQ CX, c+56(FP)	// return c
   191		RET
   192	
   193	
   194	// func subVW(z, x []Word, y Word) (c Word)
   195	// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
      	// subVW subtracts the single word y from the vector x: z[i] = x[i] - c
      	// for i = 0 .. len(z)-1, where c starts out as y and afterwards holds the
      	// borrow out of the previous word. The final c is returned. For
      	// len(z) == 0 no word is touched and c is returned as y.
      	// Only len(z) is read. x is assumed to be at least that long
      	// (TODO confirm at the callers).
      	// Register use: DI = n (words still to do), SI = i, R8 = &x[0],
      	// R10 = &z[0], CX = c, R11-R14 = scratch.
   196	TEXT ·subVW(SB),NOSPLIT,$0
   197		MOVQ z_len+8(FP), DI	// n = len(z)
   198		MOVQ x+24(FP), R8
   199		MOVQ y+48(FP), CX	// c = y
   200		MOVQ z+0(FP), R10
   201	
   202		MOVQ $0, SI		// i = 0
   203	
   204		// s/JL/JMP/ below to disable the unrolled loop
   205		SUBQ $4, DI		// n -= 4
   206		JL V4			// if n < 0 (fewer than 4 words) goto V4
   207	
   208	U4:	// n >= 0
   209		// regular loop body unrolled 4x
   210		MOVQ 0(R8)(SI*8), R11	// load x[i .. i+3]
   211		MOVQ 8(R8)(SI*8), R12
   212		MOVQ 16(R8)(SI*8), R13
   213		MOVQ 24(R8)(SI*8), R14
   214		SUBQ CX, R11		// x[i] - c
   215		SBBQ $0, R12		// propagate the borrow through the next three words
   216		SBBQ $0, R13
   217		SBBQ $0, R14
   218		SBBQ CX, CX		// save CF
   219		NEGQ CX			// c = borrow out as 0 or 1
   220		MOVQ R11, 0(R10)(SI*8)	// store z[i .. i+3]
   221		MOVQ R12, 8(R10)(SI*8)
   222		MOVQ R13, 16(R10)(SI*8)
   223		MOVQ R14, 24(R10)(SI*8)
   224	
   225		ADDQ $4, SI		// i += 4
   226		SUBQ $4, DI		// n -= 4
   227		JGE U4			// if n >= 0 goto U4
   228	
   229	V4:	ADDQ $4, DI		// n += 4
   230		JLE E4			// if n <= 0 goto E4
   231	
   232	L4:	// n > 0
      		// tail loop, one word per iteration
   233		MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
   234		SUBQ CX, R11		// R11 = x[i] - c
   235		MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
   236		SBBQ CX, CX		// save CF
   237		NEGQ CX			// c = borrow out as 0 or 1
   238	
   239		ADDQ $1, SI		// i++
   240		SUBQ $1, DI		// n--
   241		JG L4			// if n > 0 goto L4
   242	
   243	E4:	MOVQ CX, c+56(FP)	// return c
   244		RET
   245	
   246	
   247	// func shlVU(z, x []Word, s uint) (c Word)
      	// shlVU shifts the vector x left by s bits into z and returns in c the
      	// bits shifted out of the top word x[n-1], where n = len(z). The words
      	// are processed from index n-1 down to 0. For n == 0 it returns c = 0.
      	// In the comments below, ŝ is the complementary shift count (64 - s on
      	// this 64-bit target). "SHLQ CX, DX:AX" is the double shift: DX is
      	// shifted left by CX and the vacated low bits are filled from the top of AX.
      	// s is presumably expected to be smaller than the word size - confirm at
      	// the callers.
      	// Register use: BX = i, CX = s, R8 = &x[0], R10 = &z[0], AX = w1, DX = w.
   248	TEXT ·shlVU(SB),NOSPLIT,$0
   249		MOVQ z_len+8(FP), BX	// i = len(z)
   250		SUBQ $1, BX		// i--
   251		JL X8b			// i < 0	(n <= 0)
   252	
   253		// n > 0
   254		MOVQ z+0(FP), R10
   255		MOVQ x+24(FP), R8
   256		MOVQ s+48(FP), CX
   257		MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   258		MOVQ $0, DX
   259		SHLQ CX, DX:AX		// w1>>ŝ
   260		MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   261	
   262		CMPQ BX, $0
   263		JLE X8a			// i <= 0
   264	
   265		// i > 0
   266	L8:	MOVQ AX, DX		// w = w1
   267		MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   268		SHLQ CX, DX:AX		// w<<s | w1>>ŝ
   269		MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   270		SUBQ $1, BX		// i--
   271		JG L8			// i > 0
   272	
   273		// i <= 0
   274	X8a:	SHLQ CX, AX		// w1<<s
   275		MOVQ AX, (R10)		// z[0] = w1<<s
   276		RET
   277	
   278	X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   279		RET
   280	
   281	
   282	// func shrVU(z, x []Word, s uint) (c Word)
      	// shrVU shifts the vector x right by s bits into z and returns in c the
      	// bits shifted out of the bottom word x[0], placed at the top of the
      	// word. The words are processed from index 0 up to n-1, where
      	// n = len(z). For n == 0 it returns c = 0.
      	// In the comments below, ŝ is the complementary shift count (64 - s on
      	// this 64-bit target). "SHRQ CX, DX:AX" is the double shift: DX is
      	// shifted right by CX and the vacated high bits are filled from the
      	// bottom of AX.
      	// s is presumably expected to be smaller than the word size - confirm at
      	// the callers.
      	// Register use: R11 = n-1, BX = i, CX = s, R8 = &x[0], R10 = &z[0],
      	// AX = w1, DX = w.
   283	TEXT ·shrVU(SB),NOSPLIT,$0
   284		MOVQ z_len+8(FP), R11	// n = len(z)
   285		SUBQ $1, R11		// n--
   286		JL X9b			// n < 0	(n <= 0)
   287	
   288		// n > 0
   289		MOVQ z+0(FP), R10
   290		MOVQ x+24(FP), R8
   291		MOVQ s+48(FP), CX
   292		MOVQ (R8), AX		// w1 = x[0]
   293		MOVQ $0, DX
   294		SHRQ CX, DX:AX		// w1<<ŝ
   295		MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   296	
   297		MOVQ $0, BX		// i = 0
   298		JMP E9			// test the loop condition first
   299	
   300		// i < n-1
   301	L9:	MOVQ AX, DX		// w = w1
   302		MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   303		SHRQ CX, DX:AX		// w>>s | w1<<ŝ
   304		MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   305		ADDQ $1, BX		// i++
   306	
   307	E9:	CMPQ BX, R11
   308		JL L9			// i < n-1
   309	
   310		// i >= n-1
   311	X9a:	SHRQ CX, AX		// w1>>s
   312		MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   313		RET
   314	
   315	X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   316		RET
   317	
   318	
   319	// func mulAddVWW(z, x []Word, y, r Word) (c Word)
      	// mulAddVWW multiplies the vector x by the single word y and adds a
      	// running carry: for i = 0 .. len(z)-1 the double word x[i]*y + c is
      	// formed, its low word is stored in z[i] and its high word becomes the
      	// next c. c starts out as r and its final value is returned.
      	// Only len(z) is read. x is assumed to be at least that long
      	// (TODO confirm at the callers).
      	// Register use: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n,
      	// BX = i, DX:AX = current product.
   320	TEXT ·mulAddVWW(SB),NOSPLIT,$0
   321		MOVQ z+0(FP), R10
   322		MOVQ x+24(FP), R8
   323		MOVQ y+48(FP), R9
   324		MOVQ r+56(FP), CX	// c = r
   325		MOVQ z_len+8(FP), R11	// n = len(z)
   326		MOVQ $0, BX		// i = 0
   327		
   328		CMPQ R11, $4
   329		JL E5			// fewer than 4 words: skip the unrolled loop
   330		
   331	U5:	// i+4 <= n
   332		// regular loop body unrolled 4x
   333		MOVQ (0*8)(R8)(BX*8), AX
   334		MULQ R9			// DX:AX = x[i] * y
   335		ADDQ CX, AX		// low word += c
   336		ADCQ $0, DX		// carry into the high word
   337		MOVQ AX, (0*8)(R10)(BX*8)	// z[i] = low word
   338		MOVQ DX, CX		// c = high word
   339		MOVQ (1*8)(R8)(BX*8), AX
   340		MULQ R9
   341		ADDQ CX, AX
   342		ADCQ $0, DX
   343		MOVQ AX, (1*8)(R10)(BX*8)
   344		MOVQ DX, CX
   345		MOVQ (2*8)(R8)(BX*8), AX
   346		MULQ R9
   347		ADDQ CX, AX
   348		ADCQ $0, DX
   349		MOVQ AX, (2*8)(R10)(BX*8)
   350		MOVQ DX, CX
   351		MOVQ (3*8)(R8)(BX*8), AX
   352		MULQ R9
   353		ADDQ CX, AX
   354		ADCQ $0, DX
   355		MOVQ AX, (3*8)(R10)(BX*8)
   356		MOVQ DX, CX
   357		ADDQ $4, BX		// i += 4
   358		
   359		LEAQ 4(BX), DX		// DX = i+4 (DX is free, c already lives in CX)
   360		CMPQ DX, R11
   361		JLE U5			// if i+4 <= n goto U5
   362		JMP E5
   363	
      		// tail loop, one word per iteration
   364	L5:	MOVQ (R8)(BX*8), AX
   365		MULQ R9			// DX:AX = x[i] * y
   366		ADDQ CX, AX		// low word += c
   367		ADCQ $0, DX		// carry into the high word
   368		MOVQ AX, (R10)(BX*8)	// z[i] = low word
   369		MOVQ DX, CX		// c = high word
   370		ADDQ $1, BX		// i++
   371	
   372	E5:	CMPQ BX, R11		// i < n
   373		JL L5
   374	
   375		MOVQ CX, c+64(FP)	// return c
   376		RET
   377	
   378	
   379	// func addMulVVW(z, x []Word, y Word) (c Word)
      	// addMulVVW adds y times the vector x into z: for i = 0 .. len(z)-1 the
      	// double word x[i]*y + z[i] + c is formed, its low word is stored back
      	// into z[i] and its high word becomes the next c. c starts at 0 and its
      	// final value is returned.
      	// Only len(z) is read. x is assumed to be at least that long
      	// (TODO confirm at the callers).
      	// Two code paths exist. The byte ·support_adx is defined outside this
      	// view. When it equals 1 the MULX/ADCX/ADOX path at label adx is taken,
      	// presumably because the CPU offers those instructions - confirm where
      	// the flag is set. Otherwise the plain MULQ path below runs.
   380	TEXT ·addMulVVW(SB),NOSPLIT,$0
   381		CMPB    ·support_adx(SB), $1
   382		JEQ adx
      		// MULQ path. R10 = &z[0], R8 = &x[0], R9 = y, R11 = n, BX = i,
      		// CX = c, R12 = n rounded down to an even count.
   383		MOVQ z+0(FP), R10
   384		MOVQ x+24(FP), R8
   385		MOVQ y+48(FP), R9
   386		MOVQ z_len+8(FP), R11
   387		MOVQ $0, BX		// i = 0
   388		MOVQ $0, CX		// c = 0
   389		MOVQ R11, R12
   390		ANDQ $-2, R12		// R12 = n with the lowest bit cleared
   391		CMPQ R11, $2
   392		JAE A6			// at least 2 words: run the 2x unrolled loop
   393		JMP E6
   394	
   395	A6:
      		// loop body unrolled 2x
   396		MOVQ (R8)(BX*8), AX
   397		MULQ R9			// DX:AX = x[i] * y
   398		ADDQ (R10)(BX*8), AX	// low word += z[i]
   399		ADCQ $0, DX
   400		ADDQ CX, AX		// low word += c
   401		ADCQ $0, DX
   402		MOVQ DX, CX		// c = high word
   403		MOVQ AX, (R10)(BX*8)	// z[i] = low word
   404	
   405		MOVQ (8)(R8)(BX*8), AX
   406		MULQ R9			// same steps for word i+1
   407		ADDQ (8)(R10)(BX*8), AX
   408		ADCQ $0, DX
   409		ADDQ CX, AX
   410		ADCQ $0, DX
   411		MOVQ DX, CX
   412		MOVQ AX, (8)(R10)(BX*8)
   413	
   414		ADDQ $2, BX		// i += 2
   415		CMPQ BX, R12
   416		JL A6			// while i < even part of n
   417		JMP E6
   418	
      		// tail loop, one word per iteration
   419	L6:	MOVQ (R8)(BX*8), AX
   420		MULQ R9			// DX:AX = x[i] * y
   421		ADDQ CX, AX		// low word += c
   422		ADCQ $0, DX
   423		ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
   424		ADCQ $0, DX
   425		MOVQ DX, CX		// c = high word
   426		ADDQ $1, BX		// i++
   427	
   428	E6:	CMPQ BX, R11		// i < n
   429		JL L6
   430	
   431		MOVQ CX, c+56(FP)	// return c
   432		RET
   433	
   434	adx:
      		// MULX/ADCX/ADOX path. R11 = n, R10 = &z[0], R8 = &x[0],
      		// DX = y (the operand MULXQ multiplies by), BX = i, CX = c.
   435		MOVQ z_len+8(FP), R11
   436		MOVQ z+0(FP), R10
   437		MOVQ x+24(FP), R8
   438		MOVQ y+48(FP), DX
   439		MOVQ $0, BX   // i = 0
   440		MOVQ $0, CX   // carry
   441		CMPQ R11, $8
   442		JAE  adx_loop_header	// at least 8 words: run the 8x unrolled loop
   443		CMPQ BX, R11
   444		JL adx_short		// 0 < n < 8: word-at-a-time loop only
   445		MOVQ CX, c+56(FP)	// n == 0: return c = 0
   446		RET
   447	
   448	adx_loop_header:
   449		MOVQ  R11, R13
   450		ANDQ  $-8, R13		// R13 = n rounded down to a multiple of 8
   451	adx_loop:
      		// Eight words per iteration. Each MULXQ leaves the low word of
      		// x[k]*y in its first destination and the high word in its second,
      		// without touching the flags. The CF chain (ADCXQ) adds the high
      		// word of the previous product, the OF chain (ADOXQ) adds z[k].
      		// SI/DI and AX/CX alternate as low/high word pairs.
   452		XORQ  R9, R9  // R9 = 0, also clears CF and OF
   453		MULXQ (R8), SI, DI
   454		ADCXQ CX,SI		// low word += c from the previous iteration
   455		ADOXQ (R10), SI
   456		MOVQ  SI,(R10)
   457	
   458		MULXQ 8(R8), AX, CX
   459		ADCXQ DI, AX
   460		ADOXQ 8(R10), AX
   461		MOVQ  AX, 8(R10)
   462	
   463		MULXQ 16(R8), SI, DI
   464		ADCXQ CX, SI
   465		ADOXQ 16(R10), SI
   466		MOVQ  SI, 16(R10)
   467	
   468		MULXQ 24(R8), AX, CX
   469		ADCXQ DI, AX
   470		ADOXQ 24(R10), AX
   471		MOVQ  AX, 24(R10)
   472	
   473		MULXQ 32(R8), SI, DI
   474		ADCXQ CX, SI
   475		ADOXQ 32(R10), SI
   476		MOVQ  SI, 32(R10)
   477	
   478		MULXQ 40(R8), AX, CX
   479		ADCXQ DI, AX
   480		ADOXQ 40(R10), AX
   481		MOVQ  AX, 40(R10)
   482	
   483		MULXQ 48(R8), SI, DI
   484		ADCXQ CX, SI
   485		ADOXQ 48(R10), SI
   486		MOVQ  SI, 48(R10)
   487	
   488		MULXQ 56(R8), AX, CX
   489		ADCXQ DI, AX
   490		ADOXQ 56(R10), AX
   491		MOVQ  AX, 56(R10)
   492	
   493		ADCXQ R9, CX		// fold the pending CF into c (R9 is 0)
   494		ADOXQ R9, CX		// fold the pending OF into c
   495	
   496		ADDQ $64, R8		// advance the x pointer by 8 words
   497		ADDQ $64, R10		// advance the z pointer by 8 words
   498		ADDQ $8, BX		// i += 8
   499	
   500		CMPQ BX, R13
   501		JL adx_loop
   502		MOVQ z+0(FP), R10	// reload the base pointers: adx_short indexes by BX
   503		MOVQ x+24(FP), R8
   504		CMPQ BX, R11
   505		JL adx_short		// leftover words remain
   506		MOVQ CX, c+56(FP)	// return c
   507		RET
   508	
   509	adx_short:
      		// tail loop, one word per iteration
   510		MULXQ (R8)(BX*8), SI, DI	// SI = low word, DI = high word of x[i]*y
   511		ADDQ CX, SI		// low word += c
   512		ADCQ $0, DI
   513		ADDQ SI, (R10)(BX*8)	// z[i] += low word, directly in memory
   514		ADCQ $0, DI
   515		MOVQ DI, CX		// c = high word
   516		ADDQ $1, BX		// i++
   517	
   518		CMPQ BX, R11
   519		JL adx_short
   520	
   521		MOVQ CX, c+56(FP)	// return c
   522		RET
   523	
   524	
   525	
   526	// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
      	// divWVW divides the vector x by the single word y, working from the
      	// highest index len(z)-1 down to 0. The running remainder r starts as
      	// xn. In each step the double word r:x[i] is divided by y, the quotient
      	// is stored in z[i] and the remainder becomes the new r. The final r is
      	// returned.
      	// Register use: CX = i, R11 = y, SI = &x[0], DI = &z[0], DX = r.
   527	TEXT ·divWVW(SB),NOSPLIT,$0
   528		MOVQ z_len+8(FP), CX	// i = len(z)
   529		MOVQ y+56(FP), R11	// divisor
   530		MOVQ x+32(FP), SI
   531		MOVQ z+0(FP), DI
   532		MOVQ xn+24(FP), DX	// r = xn
   533		JMP next7		// test the loop condition first
   534	
   535	loop7:	MOVQ (SI)(CX*8), AX	// DX:AX = r:x[i]
   536		DIVQ R11		// AX = quotient, DX = new r
   537		MOVQ AX, (DI)(CX*8)	// z[i] = quotient
   538	
   539	next7:	SUBQ $1, CX		// i--
   540		JGE loop7		// while i >= 0
   541	
   542		MOVQ DX, r+64(FP)	// return r
   543		RET

View as plain text