Text file src/math/big/arith_amd64.s

Documentation: math/big

     1// Copyright 2009 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !math_big_pure_go
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// func mulWW(x, y Word) (z1, z0 Word)
      // mulWW multiplies x by y with a single MULQ and returns the two-word
      // product: the high word z1 is taken from DX and the low word z0 from AX.
    13TEXT ·mulWW(SB),NOSPLIT,$0
      	// MULQ takes its first factor implicitly from AX.
    14	MOVQ x+0(FP), AX
    15	MULQ y+8(FP)
      	// DX:AX now holds the full product.
    16	MOVQ DX, z1+16(FP)
    17	MOVQ AX, z0+24(FP)
    18	RET
    19
    20
    21// func divWW(x1, x0, y Word) (q, r Word)
      // divWW divides the two-word value x1:x0 by y with a single DIVQ and
      // returns the quotient q (from AX) and the remainder r (from DX).
      // NOTE(review): DIVQ faults when y is 0 or when the quotient does not fit
      // in one word; nothing here checks for that, so callers presumably
      // guarantee x1 < y - confirm.
    22TEXT ·divWW(SB),NOSPLIT,$0
      	// DIVQ takes its dividend implicitly from DX (high word) and AX (low word).
    23	MOVQ x1+0(FP), DX
    24	MOVQ x0+8(FP), AX
    25	DIVQ y+16(FP)
    26	MOVQ AX, q+24(FP)
    27	MOVQ DX, r+32(FP)
    28	RET
    29
    30// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32// This is faster than using rotate instructions.
    33
    34// func addVV(z, x, y []Word) (c Word)
      // addVV sets z[i] = x[i] + y[i] + carry for every i in [0, len(z)) and
      // returns the final carry (0 or 1) in c. Only len(z) is read, so x and y
      // are assumed to hold at least len(z) words (not checked here).
      //
      // Register use: DI = words remaining (n), SI = index i, R8/R9/R10 = base
      // of x/y/z, CX = carry kept between iterations as 0 or -1.
    35TEXT ·addVV(SB),NOSPLIT,$0
    36	MOVQ z_len+8(FP), DI
    37	MOVQ x+24(FP), R8
    38	MOVQ y+48(FP), R9
    39	MOVQ z+0(FP), R10
    40
    41	MOVQ $0, CX		// c = 0
    42	MOVQ $0, SI		// i = 0
    43
    44	// s/JL/JMP/ below to disable the unrolled loop
    45	SUBQ $4, DI		// n -= 4
    46	JL V1			// if n < 0 goto V1
    47
    48U1:	// n >= 0
    49	// regular loop body unrolled 4x
    50	ADDQ CX, CX		// restore CF
    51	MOVQ 0(R8)(SI*8), R11
    52	MOVQ 8(R8)(SI*8), R12
    53	MOVQ 16(R8)(SI*8), R13
    54	MOVQ 24(R8)(SI*8), R14
      	// The four ADCQs form one carry chain; the MOVQ stores that follow do
      	// not modify the flags, so CF is still valid at the SBBQ below.
    55	ADCQ 0(R9)(SI*8), R11
    56	ADCQ 8(R9)(SI*8), R12
    57	ADCQ 16(R9)(SI*8), R13
    58	ADCQ 24(R9)(SI*8), R14
    59	MOVQ R11, 0(R10)(SI*8)
    60	MOVQ R12, 8(R10)(SI*8)
    61	MOVQ R13, 16(R10)(SI*8)
    62	MOVQ R14, 24(R10)(SI*8)
    63	SBBQ CX, CX		// save CF
    64
      	// The counter updates below overwrite CF, which is why it was saved in CX.
    65	ADDQ $4, SI		// i += 4
    66	SUBQ $4, DI		// n -= 4
    67	JGE U1			// if n >= 0 goto U1
    68
      	// Undo the bias of 4 so that n is the number of leftover words (0..3).
    69V1:	ADDQ $4, DI		// n += 4
    70	JLE E1			// if n <= 0 goto E1
    71
    72L1:	// n > 0
      	// one word per iteration for the leftover words
    73	ADDQ CX, CX		// restore CF
    74	MOVQ 0(R8)(SI*8), R11
    75	ADCQ 0(R9)(SI*8), R11
    76	MOVQ R11, 0(R10)(SI*8)
    77	SBBQ CX, CX		// save CF
    78
    79	ADDQ $1, SI		// i++
    80	SUBQ $1, DI		// n--
    81	JG L1			// if n > 0 goto L1
    82
      	// CX is 0 or -1 here; negate it so that the returned carry is 0 or 1.
    83E1:	NEGQ CX
    84	MOVQ CX, c+72(FP)	// return c
    85	RET
    86
    87
    88// func subVV(z, x, y []Word) (c Word)
    89// (same as addVV except for SBBQ instead of ADCQ and label names)
      // subVV sets z[i] = x[i] - y[i] - borrow for every i in [0, len(z)) and
      // returns the final borrow (0 or 1) in c. Only len(z) is read, so x and y
      // are assumed to hold at least len(z) words (not checked here).
      //
      // Register use: DI = words remaining (n), SI = index i, R8/R9/R10 = base
      // of x/y/z, CX = borrow kept between iterations as 0 or -1.
    90TEXT ·subVV(SB),NOSPLIT,$0
    91	MOVQ z_len+8(FP), DI
    92	MOVQ x+24(FP), R8
    93	MOVQ y+48(FP), R9
    94	MOVQ z+0(FP), R10
    95
    96	MOVQ $0, CX		// c = 0
    97	MOVQ $0, SI		// i = 0
    98
    99	// s/JL/JMP/ below to disable the unrolled loop
   100	SUBQ $4, DI		// n -= 4
   101	JL V2			// if n < 0 goto V2
   102
   103U2:	// n >= 0
   104	// regular loop body unrolled 4x
   105	ADDQ CX, CX		// restore CF
   106	MOVQ 0(R8)(SI*8), R11
   107	MOVQ 8(R8)(SI*8), R12
   108	MOVQ 16(R8)(SI*8), R13
   109	MOVQ 24(R8)(SI*8), R14
      	// The four SBBQs form one borrow chain; the MOVQ stores that follow do
      	// not modify the flags, so CF is still valid at the final SBBQ CX, CX.
   110	SBBQ 0(R9)(SI*8), R11
   111	SBBQ 8(R9)(SI*8), R12
   112	SBBQ 16(R9)(SI*8), R13
   113	SBBQ 24(R9)(SI*8), R14
   114	MOVQ R11, 0(R10)(SI*8)
   115	MOVQ R12, 8(R10)(SI*8)
   116	MOVQ R13, 16(R10)(SI*8)
   117	MOVQ R14, 24(R10)(SI*8)
   118	SBBQ CX, CX		// save CF
   119
      	// The counter updates below overwrite CF, which is why it was saved in CX.
   120	ADDQ $4, SI		// i += 4
   121	SUBQ $4, DI		// n -= 4
   122	JGE U2			// if n >= 0 goto U2
   123
      	// Undo the bias of 4 so that n is the number of leftover words (0..3).
   124V2:	ADDQ $4, DI		// n += 4
   125	JLE E2			// if n <= 0 goto E2
   126
   127L2:	// n > 0
      	// one word per iteration for the leftover words
   128	ADDQ CX, CX		// restore CF
   129	MOVQ 0(R8)(SI*8), R11
   130	SBBQ 0(R9)(SI*8), R11
   131	MOVQ R11, 0(R10)(SI*8)
   132	SBBQ CX, CX		// save CF
   133
   134	ADDQ $1, SI		// i++
   135	SUBQ $1, DI		// n--
   136	JG L2			// if n > 0 goto L2
   137
      	// CX is 0 or -1 here; negate it so that the returned borrow is 0 or 1.
   138E2:	NEGQ CX
   139	MOVQ CX, c+72(FP)	// return c
   140	RET
   141
   142
   143// func addVW(z, x []Word, y Word) (c Word)
      // addVW sets z[i] = x[i] + carry for every i in [0, len(z)), where the
      // carry into word 0 is y, and returns the final carry in c.
      // When len(z) > 32 the work is handed to addVWlarge (not defined in this
      // part of the file) with a tail jump, before any other argument is loaded.
      //
      // Register use: DI = words remaining (n), SI = index i, R8/R10 = base of
      // x/z, CX = carry into the next word (y at first, then 0 or 1).
   144TEXT ·addVW(SB),NOSPLIT,$0
   145	MOVQ z_len+8(FP), DI
   146	CMPQ DI, $32
   147	JG large
   148	MOVQ x+24(FP), R8
   149	MOVQ y+48(FP), CX	// c = y
   150	MOVQ z+0(FP), R10
   151
   152	MOVQ $0, SI		// i = 0
   153
   154	// s/JL/JMP/ below to disable the unrolled loop
   155	SUBQ $4, DI		// n -= 4
   156	JL V3			// if n < 4 goto V3
   157
   158U3:	// n >= 0
   159	// regular loop body unrolled 4x
   160	MOVQ 0(R8)(SI*8), R11
   161	MOVQ 8(R8)(SI*8), R12
   162	MOVQ 16(R8)(SI*8), R13
   163	MOVQ 24(R8)(SI*8), R14
      	// Add the incoming carry to the first word and ripple CF through the
      	// other three words.
   164	ADDQ CX, R11
   165	ADCQ $0, R12
   166	ADCQ $0, R13
   167	ADCQ $0, R14
   168	SBBQ CX, CX		// save CF
      	// turn the saved 0/-1 into the 0/1 carry added to the next word
   169	NEGQ CX
   170	MOVQ R11, 0(R10)(SI*8)
   171	MOVQ R12, 8(R10)(SI*8)
   172	MOVQ R13, 16(R10)(SI*8)
   173	MOVQ R14, 24(R10)(SI*8)
   174
   175	ADDQ $4, SI		// i += 4
   176	SUBQ $4, DI		// n -= 4
   177	JGE U3			// if n >= 0 goto U3
   178
      	// Undo the bias of 4 so that n is the number of leftover words (0..3).
   179V3:	ADDQ $4, DI		// n += 4
   180	JLE E3			// if n <= 0 goto E3
   181
   182L3:	// n > 0
      	// CX = x[i] + carry; store it, then rebuild the carry (0 or 1) in CX
   183	ADDQ 0(R8)(SI*8), CX
   184	MOVQ CX, 0(R10)(SI*8)
   185	SBBQ CX, CX		// save CF
   186	NEGQ CX
   187
   188	ADDQ $1, SI		// i++
   189	SUBQ $1, DI		// n--
   190	JG L3			// if n > 0 goto L3
   191
   192E3:	MOVQ CX, c+56(FP)	// return c
   193	RET
      // len(z) > 32: continue in addVWlarge, which sees the same argument frame
   194large:
   195	JMP ·addVWlarge(SB)
   196
   197
   198// func subVW(z, x []Word, y Word) (c Word)
   199// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
      // subVW sets z[i] = x[i] - borrow for every i in [0, len(z)), where the
      // borrow taken from word 0 is y, and returns the final borrow in c.
      // When len(z) > 32 the work is handed to subVWlarge (not defined in this
      // part of the file) with a tail jump, before any other argument is loaded.
      //
      // Register use: DI = words remaining (n), SI = index i, R8/R10 = base of
      // x/z, CX = borrow taken from the next word (y at first, then 0 or 1).
   200TEXT ·subVW(SB),NOSPLIT,$0
   201	MOVQ z_len+8(FP), DI
   202	CMPQ DI, $32
   203	JG large
   204	MOVQ x+24(FP), R8
   205	MOVQ y+48(FP), CX	// c = y
   206	MOVQ z+0(FP), R10
   207
   208	MOVQ $0, SI		// i = 0
   209
   210	// s/JL/JMP/ below to disable the unrolled loop
   211	SUBQ $4, DI		// n -= 4
   212	JL V4			// if n < 4 goto V4
   213
   214U4:	// n >= 0
   215	// regular loop body unrolled 4x
   216	MOVQ 0(R8)(SI*8), R11
   217	MOVQ 8(R8)(SI*8), R12
   218	MOVQ 16(R8)(SI*8), R13
   219	MOVQ 24(R8)(SI*8), R14
      	// Subtract the incoming borrow from the first word and ripple CF
      	// through the other three words.
   220	SUBQ CX, R11
   221	SBBQ $0, R12
   222	SBBQ $0, R13
   223	SBBQ $0, R14
   224	SBBQ CX, CX		// save CF
      	// turn the saved 0/-1 into the 0/1 borrow taken from the next word
   225	NEGQ CX
   226	MOVQ R11, 0(R10)(SI*8)
   227	MOVQ R12, 8(R10)(SI*8)
   228	MOVQ R13, 16(R10)(SI*8)
   229	MOVQ R14, 24(R10)(SI*8)
   230
   231	ADDQ $4, SI		// i += 4
   232	SUBQ $4, DI		// n -= 4
   233	JGE U4			// if n >= 0 goto U4
   234
      	// Undo the bias of 4 so that n is the number of leftover words (0..3).
   235V4:	ADDQ $4, DI		// n += 4
   236	JLE E4			// if n <= 0 goto E4
   237
   238L4:	// n > 0
      	// z[i] = x[i] - borrow, then rebuild the borrow (0 or 1) in CX
   239	MOVQ 0(R8)(SI*8), R11
   240	SUBQ CX, R11
   241	MOVQ R11, 0(R10)(SI*8)
   242	SBBQ CX, CX		// save CF
   243	NEGQ CX
   244
   245	ADDQ $1, SI		// i++
   246	SUBQ $1, DI		// n--
   247	JG L4			// if n > 0 goto L4
   248
   249E4:	MOVQ CX, c+56(FP)	// return c
   250	RET
      // len(z) > 32: continue in subVWlarge, which sees the same argument frame
   251large:
   252	JMP ·subVWlarge(SB)
   253
   254
   255// func shlVU(z, x []Word, s uint) (c Word)
      // shlVU stores x shifted left by s bits into z, working from the highest
      // word (index len(z)-1) down to word 0, and returns in c the bits shifted
      // out of the top word. For len(z) == 0 it only sets c = 0.
      //
      // The three-operand SHLQ CX, AX, DX is a double-word shift: DX is shifted
      // left by CX and its vacated low bits are filled from the top of AX.
      // In the comments below ŝ appears to denote the complementary shift count
      // (word size - s). NOTE(review): presumably callers keep s below the word
      // size - confirm.
   256TEXT ·shlVU(SB),NOSPLIT,$0
   257	MOVQ z_len+8(FP), BX	// i = len(z)
   258	SUBQ $1, BX		// i--
   259	JL X8b			// i < 0	(n <= 0)
   260
   261	// n > 0
   262	MOVQ z+0(FP), R10
   263	MOVQ x+24(FP), R8
   264	MOVQ s+48(FP), CX
   265	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
      	// With DX = 0 the double shift leaves only the bits pushed out of w1.
   266	MOVQ $0, DX
   267	SHLQ CX, AX, DX		// w1>>ŝ
   268	MOVQ DX, c+56(FP)
   269
   270	CMPQ BX, $0
   271	JLE X8a			// i <= 0
   272
   273	// i > 0
   274L8:	MOVQ AX, DX		// w = w1
   275	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   276	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
   277	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   278	SUBQ $1, BX		// i--
   279	JG L8			// i > 0
   280
   281	// i <= 0
      	// the lowest word has no lower neighbour, so a plain shift is used
   282X8a:	SHLQ CX, AX		// w1<<s
   283	MOVQ AX, (R10)		// z[0] = w1<<s
   284	RET
   285
      // empty z: nothing to shift, return c = 0
   286X8b:	MOVQ $0, c+56(FP)
   287	RET
   288
   289
   290// func shrVU(z, x []Word, s uint) (c Word)
      // shrVU stores x shifted right by s bits into z, working from word 0 up
      // to word len(z)-1, and returns in c the bits shifted out of word 0.
      // For len(z) == 0 it only sets c = 0.
      //
      // The three-operand SHRQ CX, AX, DX is a double-word shift: DX is shifted
      // right by CX and its vacated high bits are filled from the bottom of AX.
      // In the comments below ŝ appears to denote the complementary shift count
      // (word size - s). NOTE(review): presumably callers keep s below the word
      // size - confirm.
   291TEXT ·shrVU(SB),NOSPLIT,$0
   292	MOVQ z_len+8(FP), R11
   293	SUBQ $1, R11		// n--
   294	JL X9b			// n < 0	(n <= 0)
   295
   296	// n > 0
      	// from here on R11 holds n-1, the index of the last word
   297	MOVQ z+0(FP), R10
   298	MOVQ x+24(FP), R8
   299	MOVQ s+48(FP), CX
   300	MOVQ (R8), AX		// w1 = x[0]
      	// With DX = 0 the double shift leaves only the bits pushed out of w1.
   301	MOVQ $0, DX
   302	SHRQ CX, AX, DX		// w1<<ŝ
   303	MOVQ DX, c+56(FP)
   304
   305	MOVQ $0, BX		// i = 0
   306	JMP E9
   307
   308	// i < n-1
   309L9:	MOVQ AX, DX		// w = w1
   310	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   311	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
   312	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   313	ADDQ $1, BX		// i++
   314
   315E9:	CMPQ BX, R11
   316	JL L9			// i < n-1
   317
   318	// i >= n-1
      	// the highest word has no upper neighbour, so a plain shift is used
   319X9a:	SHRQ CX, AX		// w1>>s
   320	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   321	RET
   322
      // empty z: nothing to shift, return c = 0
   323X9b:	MOVQ $0, c+56(FP)
   324	RET
   325
   326
   327// func mulAddVWW(z, x []Word, y, r Word) (c Word)
      // mulAddVWW sets z[i] to the low word of x[i]*y + c for every i in
      // [0, len(z)), where c starts as r and is replaced after each word by the
      // high word of that sum. The final c is returned.
      //
      // Register use: R10/R8 = base of z/x, R9 = y, CX = c, R11 = len(z),
      // BX = index i. AX and DX are the implicit operands/results of MULQ.
   328TEXT ·mulAddVWW(SB),NOSPLIT,$0
   329	MOVQ z+0(FP), R10
   330	MOVQ x+24(FP), R8
   331	MOVQ y+48(FP), R9
   332	MOVQ r+56(FP), CX	// c = r
   333	MOVQ z_len+8(FP), R11
   334	MOVQ $0, BX		// i = 0
   335
      	// fewer than 4 words: skip the unrolled loop
   336	CMPQ R11, $4
   337	JL E5
   338
   339U5:	// i+4 <= n
   340	// regular loop body unrolled 4x
      	// Each step: DX:AX = x[k]*y, add c into the low word and fold the
      	// resulting carry into the high word, store the low word, c = high word.
   341	MOVQ (0*8)(R8)(BX*8), AX
   342	MULQ R9
   343	ADDQ CX, AX
   344	ADCQ $0, DX
   345	MOVQ AX, (0*8)(R10)(BX*8)
   346	MOVQ DX, CX
   347	MOVQ (1*8)(R8)(BX*8), AX
   348	MULQ R9
   349	ADDQ CX, AX
   350	ADCQ $0, DX
   351	MOVQ AX, (1*8)(R10)(BX*8)
   352	MOVQ DX, CX
   353	MOVQ (2*8)(R8)(BX*8), AX
   354	MULQ R9
   355	ADDQ CX, AX
   356	ADCQ $0, DX
   357	MOVQ AX, (2*8)(R10)(BX*8)
   358	MOVQ DX, CX
   359	MOVQ (3*8)(R8)(BX*8), AX
   360	MULQ R9
   361	ADDQ CX, AX
   362	ADCQ $0, DX
   363	MOVQ AX, (3*8)(R10)(BX*8)
   364	MOVQ DX, CX
   365	ADDQ $4, BX		// i += 4
   366
      	// repeat while another full group of 4 fits (i+4 <= n); DX is free
      	// here because the carry has already been moved to CX
   367	LEAQ 4(BX), DX
   368	CMPQ DX, R11
   369	JLE U5
   370	JMP E5
   371
      // one word per iteration for the leftover words
   372L5:	MOVQ (R8)(BX*8), AX
   373	MULQ R9
   374	ADDQ CX, AX
   375	ADCQ $0, DX
   376	MOVQ AX, (R10)(BX*8)
   377	MOVQ DX, CX
   378	ADDQ $1, BX		// i++
   379
   380E5:	CMPQ BX, R11		// i < n
   381	JL L5
   382
   383	MOVQ CX, c+64(FP)
   384	RET
   385
   386
   387// func addMulVVW(z, x []Word, y Word) (c Word)
      // addMulVVW sets z[i] to the low word of z[i] + x[i]*y + c for every i in
      // [0, len(z)), where c starts at 0 and is replaced after each word by the
      // high word of that sum. The final c is returned.
      //
      // When the byte flag support_adx (defined elsewhere) equals 1, the
      // MULXQ/ADCXQ/ADOXQ implementation at label adx is used; otherwise the
      // MULQ-based code that follows the check runs.
   388TEXT ·addMulVVW(SB),NOSPLIT,$0
   389	CMPB    ·support_adx(SB), $1
   390	JEQ adx
      	// MULQ path. R10/R8 = base of z/x, R9 = y, R11 = len(z), BX = index i,
      	// CX = c, R12 = len(z) rounded down to an even count.
   391	MOVQ z+0(FP), R10
   392	MOVQ x+24(FP), R8
   393	MOVQ y+48(FP), R9
   394	MOVQ z_len+8(FP), R11
   395	MOVQ $0, BX		// i = 0
   396	MOVQ $0, CX		// c = 0
   397	MOVQ R11, R12
   398	ANDQ $-2, R12
   399	CMPQ R11, $2
   400	JAE A6
   401	JMP E6
   402
      // loop body unrolled 2x: runs while i < R12
   403A6:
      	// DX:AX = x[i]*y; add z[i] and then c into the low word, folding each
      	// carry into the high word, which becomes the new c.
   404	MOVQ (R8)(BX*8), AX
   405	MULQ R9
   406	ADDQ (R10)(BX*8), AX
   407	ADCQ $0, DX
   408	ADDQ CX, AX
   409	ADCQ $0, DX
   410	MOVQ DX, CX
   411	MOVQ AX, (R10)(BX*8)
   412
      	// same step for word i+1
   413	MOVQ (8)(R8)(BX*8), AX
   414	MULQ R9
   415	ADDQ (8)(R10)(BX*8), AX
   416	ADCQ $0, DX
   417	ADDQ CX, AX
   418	ADCQ $0, DX
   419	MOVQ DX, CX
   420	MOVQ AX, (8)(R10)(BX*8)
   421
   422	ADDQ $2, BX
   423	CMPQ BX, R12
   424	JL A6
   425	JMP E6
   426
      // one word per iteration; here the sum is added to z[i] directly in memory
   427L6:	MOVQ (R8)(BX*8), AX
   428	MULQ R9
   429	ADDQ CX, AX
   430	ADCQ $0, DX
   431	ADDQ AX, (R10)(BX*8)
   432	ADCQ $0, DX
   433	MOVQ DX, CX
   434	ADDQ $1, BX		// i++
   435
   436E6:	CMPQ BX, R11		// i < n
   437	JL L6
   438
   439	MOVQ CX, c+56(FP)
   440	RET
   441
      // ADX path. R11 = len(z), R10/R8 = pointers into z/x, DX = y (the implicit
      // MULXQ factor), BX = index i, CX = c.
   442adx:
   443	MOVQ z_len+8(FP), R11
   444	MOVQ z+0(FP), R10
   445	MOVQ x+24(FP), R8
   446	MOVQ y+48(FP), DX
   447	MOVQ $0, BX   // i = 0
   448	MOVQ $0, CX   // carry
   449	CMPQ R11, $8
   450	JAE  adx_loop_header
      	// fewer than 8 words: use the one-word loop if there is any work
      	// (BX is 0 here), otherwise return c = 0
   451	CMPQ BX, R11
   452	JL adx_short
   453	MOVQ CX, c+56(FP)
   454	RET
   455
   456adx_loop_header:
      	// R13 = len(z) rounded down to a multiple of 8
   457	MOVQ  R11, R13
   458	ANDQ  $-8, R13
      // 8 words per iteration. Two independent carry chains run through the
      // block: ADCXQ uses CF to add the high word of the previous product
      // (alternating between CX and DI) into the low word of the current one,
      // and ADOXQ uses OF to add z[i]. The instructions in between do not
      // disturb either flag, so the order inside the block must be kept.
   459adx_loop:
      	// R9 = 0, and both CF and OF start out clear
   460	XORQ  R9, R9  // unset flags
   461	MULXQ (R8), SI, DI
   462	ADCXQ CX,SI
   463	ADOXQ (R10), SI
   464	MOVQ  SI,(R10)
   465
   466	MULXQ 8(R8), AX, CX
   467	ADCXQ DI, AX
   468	ADOXQ 8(R10), AX
   469	MOVQ  AX, 8(R10)
   470
   471	MULXQ 16(R8), SI, DI
   472	ADCXQ CX, SI
   473	ADOXQ 16(R10), SI
   474	MOVQ  SI, 16(R10)
   475
   476	MULXQ 24(R8), AX, CX
   477	ADCXQ DI, AX
   478	ADOXQ 24(R10), AX
   479	MOVQ  AX, 24(R10)
   480
   481	MULXQ 32(R8), SI, DI
   482	ADCXQ CX, SI
   483	ADOXQ 32(R10), SI
   484	MOVQ  SI, 32(R10)
   485
   486	MULXQ 40(R8), AX, CX
   487	ADCXQ DI, AX
   488	ADOXQ 40(R10), AX
   489	MOVQ  AX, 40(R10)
   490
   491	MULXQ 48(R8), SI, DI
   492	ADCXQ CX, SI
   493	ADOXQ 48(R10), SI
   494	MOVQ  SI, 48(R10)
   495
   496	MULXQ 56(R8), AX, CX
   497	ADCXQ DI, AX
   498	ADOXQ 56(R10), AX
   499	MOVQ  AX, 56(R10)
   500
      	// R9 is 0, so these two adds only fold the pending CF and OF carries
      	// into the high word of the last product, giving the carry-out in CX
   501	ADCXQ R9, CX
   502	ADOXQ R9, CX
   503
      	// advance both pointers by 8 words (64 bytes) and the index by 8
   504	ADDQ $64, R8
   505	ADDQ $64, R10
   506	ADDQ $8, BX
   507
   508	CMPQ BX, R13
   509	JL adx_loop
      	// reload the base pointers: adx_short addresses z and x by index BX
   510	MOVQ z+0(FP), R10
   511	MOVQ x+24(FP), R8
   512	CMPQ BX, R11
   513	JL adx_short
   514	MOVQ CX, c+56(FP)
   515	RET
   516
      // one word per iteration: DI:SI = x[i]*y, add c to the low word, add the
      // low word to z[i] in memory, and collect both carries in DI as the new c
   517adx_short:
   518	MULXQ (R8)(BX*8), SI, DI
   519	ADDQ CX, SI
   520	ADCQ $0, DI
   521	ADDQ SI, (R10)(BX*8)
   522	ADCQ $0, DI
   523	MOVQ DI, CX
   524	ADDQ $1, BX		// i++
   525
   526	CMPQ BX, R11
   527	JL adx_short
   528
   529	MOVQ CX, c+56(FP)
   530	RET
   531
   532
   533
   534// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
      // divWVW divides the multi-word value made of xn (most significant word)
      // followed by x[len(z)-1] ... x[0] by the single word y, one DIVQ per word
      // from the highest index down to 0. Each quotient word is stored in z[i];
      // the remainder stays in DX, where it serves as the high word of the next
      // dividend. The final remainder is returned in r.
      // NOTE(review): DIVQ faults when y is 0 or when a quotient does not fit
      // in one word; nothing here checks for that, so callers presumably
      // guarantee xn < y - confirm.
   535TEXT ·divWVW(SB),NOSPLIT,$0
   536	MOVQ z+0(FP), R10
   537	MOVQ xn+24(FP), DX	// r = xn
   538	MOVQ x+32(FP), R8
   539	MOVQ y+56(FP), R9
   540	MOVQ z_len+8(FP), BX	// i = len(z)
   541	JMP E7
   542
      // DX:AX = r:x[i]; after DIVQ, AX = quotient word and DX = new remainder
   543L7:	MOVQ (R8)(BX*8), AX
   544	DIVQ R9
   545	MOVQ AX, (R10)(BX*8)
   546
   547E7:	SUBQ $1, BX		// i--
   548	JGE L7			// i >= 0
   549
   550	MOVQ DX, r+64(FP)
   551	RET

View as plain text