...

Text file src/math/big/arith_amd64.s

Documentation: math/big

     1// Copyright 2009 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !math_big_pure_go
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// func mulWW(x, y Word) (z1, z0 Word)
      // mulWW returns the full 128-bit unsigned product x*y as two words:
      // z1 receives the high 64 bits and z0 the low 64 bits.
    13TEXT ·mulWW(SB),NOSPLIT,$0
    14	MOVQ x+0(FP), AX	// AX = x (implicit operand of MULQ)
    15	MULQ y+8(FP)		// DX:AX = AX * y (unsigned)
    16	MOVQ DX, z1+16(FP)	// z1 = high word of the product
    17	MOVQ AX, z0+24(FP)	// z0 = low word of the product
    18	RET
    19
    20
    21// func divWW(x1, x0, y Word) (q, r Word)
      // divWW divides the 128-bit unsigned value x1:x0 by y and returns the
      // quotient in q and the remainder in r.
      // NOTE(review): DIVQ raises a hardware divide error when y == 0 or when
      // the quotient does not fit in 64 bits (x1 >= y). Nothing here guards
      // against that, so callers presumably guarantee x1 < y; confirm.
    22TEXT ·divWW(SB),NOSPLIT,$0
    23	MOVQ x1+0(FP), DX	// DX = high word of the dividend
    24	MOVQ x0+8(FP), AX	// AX = low word of the dividend
    25	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y (unsigned)
    26	MOVQ AX, q+24(FP)
    27	MOVQ DX, r+32(FP)
    28	RET
    29
    30// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32// This is faster than using rotate instructions.
    33
    34// func addVV(z, x, y []Word) (c Word)
      // addVV sets z[i] = x[i] + y[i] + carry for every i in [0, len(z)) and
      // returns the carry out of the last word in c (0 or 1; 0 when len(z) == 0).
      // Only len(z) is read; x and y are indexed up to len(z) without looking at
      // their own lengths (assumes callers pass slices at least that long;
      // TODO confirm).
      // Register use: DI = n (words remaining), SI = i, R8 = &x[0], R9 = &y[0],
      // R10 = &z[0], CX = saved carry (0 or -1 between iterations).
    35TEXT ·addVV(SB),NOSPLIT,$0
    36	MOVQ z_len+8(FP), DI
    37	MOVQ x+24(FP), R8
    38	MOVQ y+48(FP), R9
    39	MOVQ z+0(FP), R10
    40
    41	MOVQ $0, CX		// c = 0
    42	MOVQ $0, SI		// i = 0
    43
    44	// s/JL/JMP/ below to disable the unrolled loop
    45	SUBQ $4, DI		// n -= 4
    46	JL V1			// if n < 0 goto V1
    47
    48U1:	// n >= 0
    49	// regular loop body unrolled 4x
      	// The loads and stores are MOVQs, which leave the flags alone, so CF
      	// restored from CX flows through all four ADCQs as one carry chain.
    50	ADDQ CX, CX		// restore CF
    51	MOVQ 0(R8)(SI*8), R11
    52	MOVQ 8(R8)(SI*8), R12
    53	MOVQ 16(R8)(SI*8), R13
    54	MOVQ 24(R8)(SI*8), R14
    55	ADCQ 0(R9)(SI*8), R11
    56	ADCQ 8(R9)(SI*8), R12
    57	ADCQ 16(R9)(SI*8), R13
    58	ADCQ 24(R9)(SI*8), R14
    59	MOVQ R11, 0(R10)(SI*8)
    60	MOVQ R12, 8(R10)(SI*8)
    61	MOVQ R13, 16(R10)(SI*8)
    62	MOVQ R14, 24(R10)(SI*8)
    63	SBBQ CX, CX		// save CF
    64
      	// The index/counter updates below overwrite CF, which is why it was
      	// parked in CX first.
    65	ADDQ $4, SI		// i += 4
    66	SUBQ $4, DI		// n -= 4
    67	JGE U1			// if n >= 0 goto U1
    68
    69V1:	ADDQ $4, DI		// n += 4
    70	JLE E1			// if n <= 0 goto E1
    71
      	// Tail loop: handles the remaining 1..3 words one at a time.
    72L1:	// n > 0
    73	ADDQ CX, CX		// restore CF
    74	MOVQ 0(R8)(SI*8), R11
    75	ADCQ 0(R9)(SI*8), R11
    76	MOVQ R11, 0(R10)(SI*8)
    77	SBBQ CX, CX		// save CF
    78
    79	ADDQ $1, SI		// i++
    80	SUBQ $1, DI		// n--
    81	JG L1			// if n > 0 goto L1
    82
    83E1:	NEGQ CX	// saved carry is 0 or -1; negate to return 0 or 1
    84	MOVQ CX, c+72(FP)	// return c
    85	RET
    86
    87
    88// func subVV(z, x, y []Word) (c Word)
    89// (same as addVV except for SBBQ instead of ADCQ and label names)
      // subVV sets z[i] = x[i] - y[i] - borrow for every i in [0, len(z)) and
      // returns the borrow out of the last word in c (0 or 1; 0 when len(z) == 0).
      // Only len(z) is read; x and y are indexed up to len(z) without looking at
      // their own lengths (assumes callers pass slices at least that long;
      // TODO confirm).
      // Register use: DI = n (words remaining), SI = i, R8 = &x[0], R9 = &y[0],
      // R10 = &z[0], CX = saved borrow (0 or -1 between iterations).
    90TEXT ·subVV(SB),NOSPLIT,$0
    91	MOVQ z_len+8(FP), DI
    92	MOVQ x+24(FP), R8
    93	MOVQ y+48(FP), R9
    94	MOVQ z+0(FP), R10
    95
    96	MOVQ $0, CX		// c = 0
    97	MOVQ $0, SI		// i = 0
    98
    99	// s/JL/JMP/ below to disable the unrolled loop
   100	SUBQ $4, DI		// n -= 4
   101	JL V2			// if n < 0 goto V2
   102
   103U2:	// n >= 0
   104	// regular loop body unrolled 4x
      	// The loads and stores are MOVQs, which leave the flags alone, so CF
      	// restored from CX flows through all four SBBQs as one borrow chain.
   105	ADDQ CX, CX		// restore CF
   106	MOVQ 0(R8)(SI*8), R11
   107	MOVQ 8(R8)(SI*8), R12
   108	MOVQ 16(R8)(SI*8), R13
   109	MOVQ 24(R8)(SI*8), R14
   110	SBBQ 0(R9)(SI*8), R11
   111	SBBQ 8(R9)(SI*8), R12
   112	SBBQ 16(R9)(SI*8), R13
   113	SBBQ 24(R9)(SI*8), R14
   114	MOVQ R11, 0(R10)(SI*8)
   115	MOVQ R12, 8(R10)(SI*8)
   116	MOVQ R13, 16(R10)(SI*8)
   117	MOVQ R14, 24(R10)(SI*8)
   118	SBBQ CX, CX		// save CF
   119
      	// The index/counter updates below overwrite CF, which is why it was
      	// parked in CX first.
   120	ADDQ $4, SI		// i += 4
   121	SUBQ $4, DI		// n -= 4
   122	JGE U2			// if n >= 0 goto U2
   123
   124V2:	ADDQ $4, DI		// n += 4
   125	JLE E2			// if n <= 0 goto E2
   126
      	// Tail loop: handles the remaining 1..3 words one at a time.
   127L2:	// n > 0
   128	ADDQ CX, CX		// restore CF
   129	MOVQ 0(R8)(SI*8), R11
   130	SBBQ 0(R9)(SI*8), R11
   131	MOVQ R11, 0(R10)(SI*8)
   132	SBBQ CX, CX		// save CF
   133
   134	ADDQ $1, SI		// i++
   135	SUBQ $1, DI		// n--
   136	JG L2			// if n > 0 goto L2
   137
   138E2:	NEGQ CX	// saved borrow is 0 or -1; negate to return 0 or 1
   139	MOVQ CX, c+72(FP)	// return c
   140	RET
   141
   142
   143// func addVW(z, x []Word, y Word) (c Word)
      // addVW adds the single word y to the vector x: starting with c = y, it
      // sets z[i] = x[i] + c and replaces c with the carry out of that word,
      // for every i in [0, len(z)). The final c is returned; when len(z) == 0
      // the result is y itself, because CX is stored unchanged.
      // Only len(z) is read; x is indexed up to len(z) without looking at its
      // own length (assumes len(x) >= len(z); TODO confirm).
      // Register use: DI = n (words remaining), SI = i, R8 = &x[0],
      // R10 = &z[0], CX = c.
   144TEXT ·addVW(SB),NOSPLIT,$0
   145	MOVQ z_len+8(FP), DI
   146	MOVQ x+24(FP), R8
   147	MOVQ y+48(FP), CX	// c = y
   148	MOVQ z+0(FP), R10
   149
   150	MOVQ $0, SI		// i = 0
   151
   152	// s/JL/JMP/ below to disable the unrolled loop
   153	SUBQ $4, DI		// n -= 4
   154	JL V3			// if n < 0 goto V3 (fewer than 4 words)
   155
   156U3:	// n >= 0
   157	// regular loop body unrolled 4x
   158	MOVQ 0(R8)(SI*8), R11
   159	MOVQ 8(R8)(SI*8), R12
   160	MOVQ 16(R8)(SI*8), R13
   161	MOVQ 24(R8)(SI*8), R14
   162	ADDQ CX, R11		// x[i] + c
   163	ADCQ $0, R12		// ripple the carry through the next three words
   164	ADCQ $0, R13
   165	ADCQ $0, R14
   166	SBBQ CX, CX		// save CF
   167	NEGQ CX			// c = carry out of the 4-word group (0 or 1)
   168	MOVQ R11, 0(R10)(SI*8)
   169	MOVQ R12, 8(R10)(SI*8)
   170	MOVQ R13, 16(R10)(SI*8)
   171	MOVQ R14, 24(R10)(SI*8)
   172
   173	ADDQ $4, SI		// i += 4
   174	SUBQ $4, DI		// n -= 4
   175	JGE U3			// if n >= 0 goto U3
   176
   177V3:	ADDQ $4, DI		// n += 4
   178	JLE E3			// if n <= 0 goto E3
   179
      	// Tail loop: handles the remaining 1..3 words one at a time.
   180L3:	// n > 0
   181	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
   182	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c (MOVQ leaves CF intact)
   183	SBBQ CX, CX		// save CF
   184	NEGQ CX			// c = carry (0 or 1)
   185
   186	ADDQ $1, SI		// i++
   187	SUBQ $1, DI		// n--
   188	JG L3			// if n > 0 goto L3
   189
   190E3:	MOVQ CX, c+56(FP)	// return c
   191	RET
   192
   193
   194// func subVW(z, x []Word, y Word) (c Word)
   195// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
      // subVW subtracts the single word y from the vector x: starting with
      // c = y, it sets z[i] = x[i] - c and replaces c with the borrow out of that
      // word, for every i in [0, len(z)). The final c is returned; when
      // len(z) == 0 the result is y itself, because CX is stored unchanged.
      // Only len(z) is read; x is indexed up to len(z) without looking at its
      // own length (assumes len(x) >= len(z); TODO confirm).
      // Register use: DI = n (words remaining), SI = i, R8 = &x[0],
      // R10 = &z[0], CX = c.
   196TEXT ·subVW(SB),NOSPLIT,$0
   197	MOVQ z_len+8(FP), DI
   198	MOVQ x+24(FP), R8
   199	MOVQ y+48(FP), CX	// c = y
   200	MOVQ z+0(FP), R10
   201
   202	MOVQ $0, SI		// i = 0
   203
   204	// s/JL/JMP/ below to disable the unrolled loop
   205	SUBQ $4, DI		// n -= 4
   206	JL V4			// if n < 0 goto V4 (fewer than 4 words)
   207
   208U4:	// n >= 0
   209	// regular loop body unrolled 4x
   210	MOVQ 0(R8)(SI*8), R11
   211	MOVQ 8(R8)(SI*8), R12
   212	MOVQ 16(R8)(SI*8), R13
   213	MOVQ 24(R8)(SI*8), R14
   214	SUBQ CX, R11		// x[i] - c
   215	SBBQ $0, R12		// ripple the borrow through the next three words
   216	SBBQ $0, R13
   217	SBBQ $0, R14
   218	SBBQ CX, CX		// save CF
   219	NEGQ CX			// c = borrow out of the 4-word group (0 or 1)
   220	MOVQ R11, 0(R10)(SI*8)
   221	MOVQ R12, 8(R10)(SI*8)
   222	MOVQ R13, 16(R10)(SI*8)
   223	MOVQ R14, 24(R10)(SI*8)
   224
   225	ADDQ $4, SI		// i += 4
   226	SUBQ $4, DI		// n -= 4
   227	JGE U4			// if n >= 0 goto U4
   228
   229V4:	ADDQ $4, DI		// n += 4
   230	JLE E4			// if n <= 0 goto E4
   231
      	// Tail loop: handles the remaining 1..3 words one at a time.
   232L4:	// n > 0
   233	MOVQ 0(R8)(SI*8), R11
   234	SUBQ CX, R11		// x[i] - c
   235	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c (MOVQ leaves CF intact)
   236	SBBQ CX, CX		// save CF
   237	NEGQ CX			// c = borrow (0 or 1)
   238
   239	ADDQ $1, SI		// i++
   240	SUBQ $1, DI		// n--
   241	JG L4			// if n > 0 goto L4
   242
   243E4:	MOVQ CX, c+56(FP)	// return c
   244	RET
   245
   246
   247// func shlVU(z, x []Word, s uint) (c Word)
      // shlVU shifts the multi-word value x left by s bits into z and returns in
      // c the bits shifted out of the most significant word, x[n-1], where
      // n = len(z). Words are processed from index n-1 down to 0. When n == 0
      // nothing is written and c = 0.
      // In the comments below, ŝ denotes the complementary shift count
      // (64 - s on this platform).
      // "SHLQ CX, DX:AX" is the double-precision shift: DX is shifted left by
      // CX and the vacated low bits are filled from the high bits of AX; AX
      // itself is unchanged.
      // NOTE(review): the hardware uses only the low 6 bits of CX as the count,
      // so s >= 64 is not handled here; presumably callers keep s below the
      // word size. Confirm.
      // Register use: BX = i, R8 = &x[0], R10 = &z[0], CX = s.
   248TEXT ·shlVU(SB),NOSPLIT,$0
   249	MOVQ z_len+8(FP), BX	// i = len(z)
   250	SUBQ $1, BX		// i--
   251	JL X8b			// i < 0	(n <= 0)
   252
   253	// n > 0
   254	MOVQ z+0(FP), R10
   255	MOVQ x+24(FP), R8
   256	MOVQ s+48(FP), CX
   257	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   258	MOVQ $0, DX
   259	SHLQ CX, DX:AX		// w1>>ŝ
   260	MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   261
   262	CMPQ BX, $0
   263	JLE X8a			// i <= 0
   264
   265	// i > 0
   266L8:	MOVQ AX, DX		// w = w1
   267	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   268	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
   269	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   270	SUBQ $1, BX		// i--
   271	JG L8			// i > 0
   272
   273	// i <= 0
   274X8a:	SHLQ CX, AX		// w1<<s
   275	MOVQ AX, (R10)		// z[0] = w1<<s
   276	RET
   277
      	// Empty vector: nothing to shift, return c = 0.
   278X8b:	MOVQ $0, c+56(FP)
   279	RET
   280
   281
   282// func shrVU(z, x []Word, s uint) (c Word)
      // shrVU shifts the multi-word value x right by s bits into z and returns
      // in c the bits shifted out of the least significant word, x[0], placed
      // in the high bits of c. Words are processed from index 0 up to n-1,
      // where n = len(z). When n == 0 nothing is written and c = 0.
      // In the comments below, ŝ denotes the complementary shift count
      // (64 - s on this platform).
      // "SHRQ CX, DX:AX" is the double-precision shift: DX is shifted right by
      // CX and the vacated high bits are filled from the low bits of AX; AX
      // itself is unchanged.
      // NOTE(review): the hardware uses only the low 6 bits of CX as the count,
      // so s >= 64 is not handled here; presumably callers keep s below the
      // word size. Confirm.
      // Register use: R11 = n-1, BX = i, R8 = &x[0], R10 = &z[0], CX = s.
   283TEXT ·shrVU(SB),NOSPLIT,$0
   284	MOVQ z_len+8(FP), R11
   285	SUBQ $1, R11		// n--
   286	JL X9b			// n < 0	(n <= 0)
   287
   288	// n > 0
   289	MOVQ z+0(FP), R10
   290	MOVQ x+24(FP), R8
   291	MOVQ s+48(FP), CX
   292	MOVQ (R8), AX		// w1 = x[0]
   293	MOVQ $0, DX
   294	SHRQ CX, DX:AX		// w1<<ŝ
   295	MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   296
   297	MOVQ $0, BX		// i = 0
   298	JMP E9			// test the loop condition first
   299
   300	// i < n-1
   301L9:	MOVQ AX, DX		// w = w1
   302	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   303	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
   304	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   305	ADDQ $1, BX		// i++
   306
   307E9:	CMPQ BX, R11
   308	JL L9			// i < n-1
   309
   310	// i >= n-1
      	// X9a is not the target of any jump in this function; it is reached
      	// only by falling through from the loop test above.
   311X9a:	SHRQ CX, AX		// w1>>s
   312	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   313	RET
   314
      	// Empty vector: nothing to shift, return c = 0.
   315X9b:	MOVQ $0, c+56(FP)
   316	RET
   317
   318
   319// func mulAddVWW(z, x []Word, y, r Word) (c Word)
      // mulAddVWW multiplies the vector x by the word y and adds the word r:
      // starting with c = r, for every i in [0, len(z)) it computes the 128-bit
      // value x[i]*y + c, stores the low word in z[i] and keeps the high word as
      // the new c. The final c is returned; when len(z) == 0 the result is r.
      // Only len(z) is read; x is indexed up to len(z) without looking at its
      // own length (assumes len(x) >= len(z); TODO confirm).
      // Register use: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n,
      // BX = i; AX and DX are the implicit MULQ operand/result registers.
   320TEXT ·mulAddVWW(SB),NOSPLIT,$0
   321	MOVQ z+0(FP), R10
   322	MOVQ x+24(FP), R8
   323	MOVQ y+48(FP), R9
   324	MOVQ r+56(FP), CX	// c = r
   325	MOVQ z_len+8(FP), R11
   326	MOVQ $0, BX		// i = 0
   327
   328	CMPQ R11, $4
   329	JL E5			// fewer than 4 words: skip the unrolled loop
   330
   331U5:	// i+4 <= n
   332	// regular loop body unrolled 4x
      	// Each of the four steps below is the same sequence as the scalar
      	// loop at L5, applied to x[i+0] .. x[i+3].
   333	MOVQ (0*8)(R8)(BX*8), AX
   334	MULQ R9			// DX:AX = x[i] * y
   335	ADDQ CX, AX		// low word += c
   336	ADCQ $0, DX		// propagate the carry into the high word
   337	MOVQ AX, (0*8)(R10)(BX*8)
   338	MOVQ DX, CX		// c = high word
   339	MOVQ (1*8)(R8)(BX*8), AX
   340	MULQ R9
   341	ADDQ CX, AX
   342	ADCQ $0, DX
   343	MOVQ AX, (1*8)(R10)(BX*8)
   344	MOVQ DX, CX
   345	MOVQ (2*8)(R8)(BX*8), AX
   346	MULQ R9
   347	ADDQ CX, AX
   348	ADCQ $0, DX
   349	MOVQ AX, (2*8)(R10)(BX*8)
   350	MOVQ DX, CX
   351	MOVQ (3*8)(R8)(BX*8), AX
   352	MULQ R9
   353	ADDQ CX, AX
   354	ADCQ $0, DX
   355	MOVQ AX, (3*8)(R10)(BX*8)
   356	MOVQ DX, CX
   357	ADDQ $4, BX		// i += 4
   358
   359	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was copied to CX)
   360	CMPQ DX, R11
   361	JLE U5			// another full group of 4 fits: i+4 <= n
   362	JMP E5
   363
      	// Scalar loop: one word per iteration, used for the remaining words.
   364L5:	MOVQ (R8)(BX*8), AX
   365	MULQ R9			// DX:AX = x[i] * y
   366	ADDQ CX, AX		// low word += c
   367	ADCQ $0, DX		// propagate the carry into the high word
   368	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   369	MOVQ DX, CX		// c = high word
   370	ADDQ $1, BX		// i++
   371
   372E5:	CMPQ BX, R11		// i < n
   373	JL L5
   374
   375	MOVQ CX, c+64(FP)	// return c
   376	RET
   377
   378
   379// func addMulVVW(z, x []Word, y Word) (c Word)
      // addMulVVW multiplies the vector x by the word y and accumulates the
      // result into z: starting with c = 0, for every i in [0, len(z)) it
      // computes z[i] + x[i]*y + c, stores the low word back in z[i] and keeps
      // the high word as the new c. The final c is returned.
      // Only len(z) is read; x is indexed up to len(z) without looking at its
      // own length (assumes len(x) >= len(z); TODO confirm).
      //
      // Two implementations are provided, selected at run time by the byte
      // variable ·support_adx (defined outside this file):
      //   - the generic path (labels A6/L6/E6), built on MULQ and unrolled 2x;
      //   - the "adx" path (labels adx*), built on MULXQ/ADCXQ/ADOXQ and
      //     unrolled 8x, with a one-word-at-a-time loop for the remainder.
   380TEXT ·addMulVVW(SB),NOSPLIT,$0
   381	CMPB    ·support_adx(SB), $1
   382	JEQ adx			// ·support_adx == 1: take the MULX/ADX path
      	// Generic path. R10 = &z[0], R8 = &x[0], R9 = y, R11 = n, BX = i,
      	// CX = c, R12 = n rounded down to a multiple of 2.
   383	MOVQ z+0(FP), R10
   384	MOVQ x+24(FP), R8
   385	MOVQ y+48(FP), R9
   386	MOVQ z_len+8(FP), R11
   387	MOVQ $0, BX		// i = 0
   388	MOVQ $0, CX		// c = 0
   389	MOVQ R11, R12
   390	ANDQ $-2, R12		// R12 = n with the lowest bit cleared
   391	CMPQ R11, $2
   392	JAE A6			// n >= 2 (unsigned): run the 2x unrolled loop
   393	JMP E6
   394
      	// 2x unrolled loop over i in [0, R12).
   395A6:
   396	MOVQ (R8)(BX*8), AX
   397	MULQ R9			// DX:AX = x[i] * y
   398	ADDQ (R10)(BX*8), AX	// low word += z[i]
   399	ADCQ $0, DX
   400	ADDQ CX, AX		// low word += c
   401	ADCQ $0, DX
   402	MOVQ DX, CX		// c = high word
   403	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   404
   405	MOVQ (8)(R8)(BX*8), AX
   406	MULQ R9			// DX:AX = x[i+1] * y
   407	ADDQ (8)(R10)(BX*8), AX	// low word += z[i+1]
   408	ADCQ $0, DX
   409	ADDQ CX, AX		// low word += c
   410	ADCQ $0, DX
   411	MOVQ DX, CX		// c = high word
   412	MOVQ AX, (8)(R10)(BX*8)	// z[i+1] = low word
   413
   414	ADDQ $2, BX		// i += 2
   415	CMPQ BX, R12
   416	JL A6
   417	JMP E6
   418
      	// Scalar loop: one word per iteration, for the odd leftover word.
   419L6:	MOVQ (R8)(BX*8), AX
   420	MULQ R9			// DX:AX = x[i] * y
   421	ADDQ CX, AX		// low word += c
   422	ADCQ $0, DX
   423	ADDQ AX, (R10)(BX*8)	// z[i] += low word (read-modify-write)
   424	ADCQ $0, DX
   425	MOVQ DX, CX		// c = high word
   426	ADDQ $1, BX		// i++
   427
   428E6:	CMPQ BX, R11		// i < n
   429	JL L6
   430
   431	MOVQ CX, c+56(FP)	// return c
   432	RET
   433
      	// ADX path. MULXQ takes its multiplicand implicitly from DX, so y is
      	// kept there for the whole routine. R11 = n, R10/R8 = running pointers
      	// into z/x, BX = i, CX = c, R13 = n rounded down to a multiple of 8.
   434adx:
   435	MOVQ z_len+8(FP), R11
   436	MOVQ z+0(FP), R10
   437	MOVQ x+24(FP), R8
   438	MOVQ y+48(FP), DX
   439	MOVQ $0, BX   // i = 0
   440	MOVQ $0, CX   // carry
   441	CMPQ R11, $8
   442	JAE  adx_loop_header	// n >= 8 (unsigned): run the 8x unrolled loop
   443	CMPQ BX, R11
   444	JL adx_short		// 0 < n < 8: scalar loop only
   445	MOVQ CX, c+56(FP)	// n == 0: return c = 0
   446	RET
   447
   448adx_loop_header:
   449	MOVQ  R11, R13
   450	ANDQ  $-8, R13		// R13 = n with the low three bits cleared
      	// Each iteration handles 8 words using two independent carry chains:
      	//   - CF (ADCXQ) adds the high word of the previous product into the
      	//     low word of the current one;
      	//   - OF (ADOXQ) adds the existing z word.
      	// MULXQ and MOVQ do not modify flags, so both chains stay live across
      	// the whole group. SI/DI and AX/CX alternate as the (low, high) result
      	// pair so that each high word is available to the following step.
   451adx_loop:
   452	XORQ  R9, R9  // R9 = 0; also clears CF and OF to start both chains
   453	MULXQ (R8), SI, DI	// DI:SI = x[i+0] * y
   454	ADCXQ CX,SI		// low word += carry-in c (CF chain)
   455	ADOXQ (R10), SI		// low word += z[i+0] (OF chain)
   456	MOVQ  SI,(R10)
   457
   458	MULXQ 8(R8), AX, CX	// CX:AX = x[i+1] * y
   459	ADCXQ DI, AX		// low word += previous high word
   460	ADOXQ 8(R10), AX
   461	MOVQ  AX, 8(R10)
   462
   463	MULXQ 16(R8), SI, DI
   464	ADCXQ CX, SI
   465	ADOXQ 16(R10), SI
   466	MOVQ  SI, 16(R10)
   467
   468	MULXQ 24(R8), AX, CX
   469	ADCXQ DI, AX
   470	ADOXQ 24(R10), AX
   471	MOVQ  AX, 24(R10)
   472
   473	MULXQ 32(R8), SI, DI
   474	ADCXQ CX, SI
   475	ADOXQ 32(R10), SI
   476	MOVQ  SI, 32(R10)
   477
   478	MULXQ 40(R8), AX, CX
   479	ADCXQ DI, AX
   480	ADOXQ 40(R10), AX
   481	MOVQ  AX, 40(R10)
   482
   483	MULXQ 48(R8), SI, DI
   484	ADCXQ CX, SI
   485	ADOXQ 48(R10), SI
   486	MOVQ  SI, 48(R10)
   487
   488	MULXQ 56(R8), AX, CX
   489	ADCXQ DI, AX
   490	ADOXQ 56(R10), AX
   491	MOVQ  AX, 56(R10)
   492
      	// Fold the pending CF and OF carries into the last high word (R9 is
      	// zero); CX becomes the carry into the next group.
   493	ADCXQ R9, CX
   494	ADOXQ R9, CX
   495
      	// Advance both pointers by 8 words. These ADDQs overwrite the flags,
      	// which is safe because both carries were folded into CX above.
   496	ADDQ $64, R8
   497	ADDQ $64, R10
   498	ADDQ $8, BX		// i += 8
   499
   500	CMPQ BX, R13
   501	JL adx_loop
      	// Reload the original base pointers: adx_short indexes with BX from the
      	// start of the slices, while R10/R8 were advanced by the loop above.
   502	MOVQ z+0(FP), R10
   503	MOVQ x+24(FP), R8
   504	CMPQ BX, R11
   505	JL adx_short		// leftover words (n is not a multiple of 8)
   506	MOVQ CX, c+56(FP)	// return c
   507	RET
   508
      	// Scalar MULX loop: one word per iteration, for i in [BX, n).
   509adx_short:
   510	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
   511	ADDQ CX, SI		// low word += c
   512	ADCQ $0, DI
   513	ADDQ SI, (R10)(BX*8)	// z[i] += low word (read-modify-write)
   514	ADCQ $0, DI
   515	MOVQ DI, CX		// c = high word
   516	ADDQ $1, BX		// i++
   517
   518	CMPQ BX, R11
   519	JL adx_short
   520
   521	MOVQ CX, c+56(FP)	// return c
   522	RET
   523
   524
   525
   526// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
      // divWVW divides the multi-word value x, extended at the top by the word
      // xn, by the single word y. Starting with r = xn, for i from len(z)-1 down
      // to 0 it divides the 128-bit value r:x[i] by y, stores the quotient in
      // z[i] and keeps the remainder as the new r. The final r is returned; when
      // len(z) == 0 the result is xn.
      // Only len(z) is read; x is indexed up to len(z) without looking at its
      // own length (assumes len(x) >= len(z); TODO confirm).
      // NOTE(review): DIVQ raises a hardware divide error when y == 0 or when
      // the quotient does not fit in 64 bits (r >= y). After the first step r is
      // a remainder and therefore below y, so the open case is xn >= y;
      // presumably callers guarantee xn < y. Confirm.
      // Register use: R10 = &z[0], R8 = &x[0], R9 = y, BX = i, DX = r.
   527TEXT ·divWVW(SB),NOSPLIT,$0
   528	MOVQ z+0(FP), R10
   529	MOVQ xn+24(FP), DX	// r = xn
   530	MOVQ x+32(FP), R8
   531	MOVQ y+56(FP), R9
   532	MOVQ z_len+8(FP), BX	// i = len(z)
   533	JMP E7			// decrement and test before the first division
   534
   535L7:	MOVQ (R8)(BX*8), AX	// AX = x[i]; DX still holds r
   536	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
   537	MOVQ AX, (R10)(BX*8)	// z[i] = quotient
   538
   539E7:	SUBQ $1, BX		// i--
   540	JGE L7			// i >= 0
   541
   542	MOVQ DX, r+64(FP)	// return r
   543	RET

View as plain text