...
Run Format

Text file src/crypto/elliptic/p256_asm_amd64.s

Documentation: crypto/elliptic

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file contains constant-time, 64-bit assembly implementation of
     6	// P256. The optimizations performed here are described in detail in:
     7	// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8	//                          256-bit primes"
     9	// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10	// https://eprint.iacr.org/2013/816.pdf
    11	
    12	#include "textflag.h"
    13	
    14	#define res_ptr DI // pointer to the output limbs
    15	#define x_ptr SI // pointer to the first input; reused as a scratch limb by p256NegCond/p256Sqr/p256Mul
    16	#define y_ptr CX // pointer to the second input; likewise reused as a scratch limb
    17	
    18	#define acc0 R8 // acc0-acc5: 64-bit accumulator limbs
    19	#define acc1 R9
    20	#define acc2 R10
    21	#define acc3 R11
    22	#define acc4 R12
    23	#define acc5 R13
    24	#define t0 R14 // t0, t1: temporaries (current multiplier word, saved carries)
    25	#define t1 R15
    26	
    27	DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff // limb 1 of the field prime p (limb 0 is all ones, see p256NegCond)
    28	DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 // limb 3 of p (limb 2 is zero)
    29	DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f // per-step multiplier of the p256Ord* reductions; presumably -1/ord mod 2^64 (TODO confirm)
    30	DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 // p256ord: modulus of p256OrdMul/p256OrdSqr, least-significant limb first
    31	DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    32	DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff // = 2^64 - 1; p256OrdSqr exploits this form
    33	DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 // = 2^64 - 2^32; p256OrdSqr exploits this form
    34	DATA p256one<>+0x00(SB)/8, $0x0000000000000001 // p256one = 2^256 - p, least-significant limb first; not referenced in this part of the file
    35	DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    36	DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    37	DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    38	GLOBL p256const0<>(SB), 8, $8 // flag 8 is RODATA in textflag.h; last operand is the size in bytes
    39	GLOBL p256const1<>(SB), 8, $8
    40	GLOBL p256ordK0<>(SB), 8, $8
    41	GLOBL p256ord<>(SB), 8, $32
    42	GLOBL p256one<>(SB), 8, $32
    43	
    44	/* ---------------------------------------*/
    45	// func p256LittleToBig(res []byte, in []uint64)
    46	TEXT ·p256LittleToBig(SB),NOSPLIT,$0 // 32-byte conversion in the opposite direction of p256BigToLittle
    47		JMP ·p256BigToLittle(SB) // tail jump: same byte-swap + word-reversal, and both slice arguments sit at the same frame offsets
    48	/* ---------------------------------------*/
    49	// func p256BigToLittle(res []uint64, in []byte)
    50	TEXT ·p256BigToLittle(SB),NOSPLIT,$0 // reverses the byte order of the 32 bytes at in and stores them at res
    51		MOVQ res+0(FP), res_ptr // data pointer of res
    52		MOVQ in+24(FP), x_ptr // data pointer of in (each slice header occupies 24 bytes)
    53	
    54		MOVQ (8*0)(x_ptr), acc0 // load the four 8-byte words of the input
    55		MOVQ (8*1)(x_ptr), acc1
    56		MOVQ (8*2)(x_ptr), acc2
    57		MOVQ (8*3)(x_ptr), acc3
    58	
    59		BSWAPQ acc0 // reverse the bytes inside each word
    60		BSWAPQ acc1
    61		BSWAPQ acc2
    62		BSWAPQ acc3
    63	
    64		MOVQ acc3, (8*0)(res_ptr) // store the words in reverse order: last input word becomes word 0
    65		MOVQ acc2, (8*1)(res_ptr)
    66		MOVQ acc1, (8*2)(res_ptr)
    67		MOVQ acc0, (8*3)(res_ptr)
    68	
    69		RET
    70	/* ---------------------------------------*/
    71	// func p256MovCond(res, a, b []uint64, cond int)
    72	// If cond == 0 res=b, else res=a. Branch-free: 96 bytes (six 16-byte lanes) are blended through an SSE mask.
    73	TEXT ·p256MovCond(SB),NOSPLIT,$0
    74		MOVQ res+0(FP), res_ptr
    75		MOVQ a+24(FP), x_ptr
    76		MOVQ b+48(FP), y_ptr
    77		MOVQ cond+72(FP), X12
    78	
    79		PXOR X13, X13 // X13 = 0
    80		PSHUFD $0, X12, X12 // broadcast the low 32 bits of cond to all four dwords (upper 32 bits of cond are ignored)
    81		PCMPEQL X13, X12 // X12 = all ones if that dword is 0, else all zeros
    82	
    83		MOVOU X12, X0 // for each lane of a: Xn = ~mask & a (keeps a when cond != 0)
    84		MOVOU (16*0)(x_ptr), X6
    85		PANDN X6, X0
    86		MOVOU X12, X1
    87		MOVOU (16*1)(x_ptr), X7
    88		PANDN X7, X1
    89		MOVOU X12, X2
    90		MOVOU (16*2)(x_ptr), X8
    91		PANDN X8, X2
    92		MOVOU X12, X3
    93		MOVOU (16*3)(x_ptr), X9
    94		PANDN X9, X3
    95		MOVOU X12, X4
    96		MOVOU (16*4)(x_ptr), X10
    97		PANDN X10, X4
    98		MOVOU X12, X5
    99		MOVOU (16*5)(x_ptr), X11
   100		PANDN X11, X5
   101	
   102		MOVOU (16*0)(y_ptr), X6 // load the six lanes of b
   103		MOVOU (16*1)(y_ptr), X7
   104		MOVOU (16*2)(y_ptr), X8
   105		MOVOU (16*3)(y_ptr), X9
   106		MOVOU (16*4)(y_ptr), X10
   107		MOVOU (16*5)(y_ptr), X11
   108	
   109		PAND X12, X6 // mask & b (keeps b when cond == 0)
   110		PAND X12, X7
   111		PAND X12, X8
   112		PAND X12, X9
   113		PAND X12, X10
   114		PAND X12, X11
   115	
   116		PXOR X6, X0 // combine: exactly one of the two masked values is non-zero per lane
   117		PXOR X7, X1
   118		PXOR X8, X2
   119		PXOR X9, X3
   120		PXOR X10, X4
   121		PXOR X11, X5
   122	
   123		MOVOU X0, (16*0)(res_ptr) // write the 96 selected bytes
   124		MOVOU X1, (16*1)(res_ptr)
   125		MOVOU X2, (16*2)(res_ptr)
   126		MOVOU X3, (16*3)(res_ptr)
   127		MOVOU X4, (16*4)(res_ptr)
   128		MOVOU X5, (16*5)(res_ptr)
   129	
   130		RET
   131	/* ---------------------------------------*/
   132	// func p256NegCond(val []uint64, cond int)
   133	TEXT ·p256NegCond(SB),NOSPLIT,$0 // in place: val = p - val if cond != 0, otherwise val is rewritten unchanged; no branches
   134		MOVQ val+0(FP), res_ptr
   135		MOVQ cond+24(FP), t0
   136		// acc = poly (the four limbs of p, least-significant first)
   137		MOVQ $-1, acc0
   138		MOVQ p256const0<>(SB), acc1
   139		MOVQ $0, acc2
   140		MOVQ p256const1<>(SB), acc3
   141		// Load the original value (x_ptr and y_ptr serve as plain scratch registers here)
   142		MOVQ (8*0)(res_ptr), acc5
   143		MOVQ (8*1)(res_ptr), x_ptr
   144		MOVQ (8*2)(res_ptr), y_ptr
   145		MOVQ (8*3)(res_ptr), t1
   146		// Speculatively subtract: acc = p - val, computed regardless of cond
   147		SUBQ acc5, acc0
   148		SBBQ x_ptr, acc1
   149		SBBQ y_ptr, acc2
   150		SBBQ t1, acc3
   151		// If condition is 0, keep original value
   152		TESTQ t0, t0 // ZF = (cond == 0)
   153		CMOVQEQ acc5, acc0
   154		CMOVQEQ x_ptr, acc1
   155		CMOVQEQ y_ptr, acc2
   156		CMOVQEQ t1, acc3
   157		// Store result
   158		MOVQ acc0, (8*0)(res_ptr)
   159		MOVQ acc1, (8*1)(res_ptr)
   160		MOVQ acc2, (8*2)(res_ptr)
   161		MOVQ acc3, (8*3)(res_ptr)
   162	
   163		RET
   164	/* ---------------------------------------*/
   165	// func p256Sqr(res, in []uint64)
   166	TEXT ·p256Sqr(SB),NOSPLIT,$0 // squares the 4-limb value at in, applies four word-wise reduction steps by p and one conditional subtraction of p
   167		MOVQ res+0(FP), res_ptr
   168		MOVQ in+24(FP), x_ptr
   169		// y[1:] * y[0] (y denotes the input limbs; off-diagonal products are computed once and doubled below)
   170		MOVQ (8*0)(x_ptr), t0
   171	
   172		MOVQ (8*1)(x_ptr), AX
   173		MULQ t0
   174		MOVQ AX, acc1
   175		MOVQ DX, acc2
   176	
   177		MOVQ (8*2)(x_ptr), AX
   178		MULQ t0
   179		ADDQ AX, acc2
   180		ADCQ $0, DX
   181		MOVQ DX, acc3
   182	
   183		MOVQ (8*3)(x_ptr), AX
   184		MULQ t0
   185		ADDQ AX, acc3
   186		ADCQ $0, DX
   187		MOVQ DX, acc4
   188		// y[2:] * y[1]
   189		MOVQ (8*1)(x_ptr), t0
   190	
   191		MOVQ (8*2)(x_ptr), AX
   192		MULQ t0
   193		ADDQ AX, acc3
   194		ADCQ $0, DX
   195		MOVQ DX, t1 // carry word into the next column
   196	
   197		MOVQ (8*3)(x_ptr), AX
   198		MULQ t0
   199		ADDQ t1, acc4
   200		ADCQ $0, DX
   201		ADDQ AX, acc4
   202		ADCQ $0, DX
   203		MOVQ DX, acc5
   204		// y[3] * y[2]
   205		MOVQ (8*2)(x_ptr), t0
   206	
   207		MOVQ (8*3)(x_ptr), AX
   208		MULQ t0
   209		ADDQ AX, acc5
   210		ADCQ $0, DX
   211		MOVQ DX, y_ptr // y_ptr is used as limb 6 of the product from here on
   212		XORQ t1, t1 // t1 will become limb 7
   213		// *2 (double the off-diagonal sum held in limbs 1..6, carry out into t1)
   214		ADDQ acc1, acc1
   215		ADCQ acc2, acc2
   216		ADCQ acc3, acc3
   217		ADCQ acc4, acc4
   218		ADCQ acc5, acc5
   219		ADCQ y_ptr, y_ptr
   220		ADCQ $0, t1
   221		// Missing products (the diagonal squares y[i]*y[i])
   222		MOVQ (8*0)(x_ptr), AX
   223		MULQ AX
   224		MOVQ AX, acc0
   225		MOVQ DX, t0
   226	
   227		MOVQ (8*1)(x_ptr), AX
   228		MULQ AX
   229		ADDQ t0, acc1
   230		ADCQ AX, acc2
   231		ADCQ $0, DX
   232		MOVQ DX, t0
   233	
   234		MOVQ (8*2)(x_ptr), AX
   235		MULQ AX
   236		ADDQ t0, acc3
   237		ADCQ AX, acc4
   238		ADCQ $0, DX
   239		MOVQ DX, t0
   240	
   241		MOVQ (8*3)(x_ptr), AX
   242		MULQ AX
   243		ADDQ t0, acc5
   244		ADCQ AX, y_ptr
   245		ADCQ DX, t1
   246		MOVQ t1, x_ptr // input pointer no longer needed; x_ptr now holds limb 7 of the 512-bit square
   247		// First reduction step: add acc0*p so limb 0 cancels. Since p+1 = 2^96 + p256const1*2^192, this needs two shifts and one MULQ.
   248		MOVQ acc0, AX
   249		MOVQ acc0, t1
   250		SHLQ $32, acc0 // low half of acc0*2^96, lands in limb 1
   251		MULQ p256const1<>(SB) // DX:AX = acc0*p256const1, lands in limbs 3 and 4
   252		SHRQ $32, t1 // high half of acc0*2^96, lands in limb 2
   253		ADDQ acc0, acc1
   254		ADCQ t1, acc2
   255		ADCQ AX, acc3
   256		ADCQ $0, DX
   257		MOVQ DX, acc0 // acc0 is recycled as the new top limb of the reduced value
   258		// Second reduction step (same pattern, eliminating acc1)
   259		MOVQ acc1, AX
   260		MOVQ acc1, t1
   261		SHLQ $32, acc1
   262		MULQ p256const1<>(SB)
   263		SHRQ $32, t1
   264		ADDQ acc1, acc2
   265		ADCQ t1, acc3
   266		ADCQ AX, acc0
   267		ADCQ $0, DX
   268		MOVQ DX, acc1
   269		// Third reduction step (eliminating acc2)
   270		MOVQ acc2, AX
   271		MOVQ acc2, t1
   272		SHLQ $32, acc2
   273		MULQ p256const1<>(SB)
   274		SHRQ $32, t1
   275		ADDQ acc2, acc3
   276		ADCQ t1, acc0
   277		ADCQ AX, acc1
   278		ADCQ $0, DX
   279		MOVQ DX, acc2
   280		// Last reduction step (eliminating acc3)
   281		XORQ t0, t0 // t0 will collect the carry out of the final addition
   282		MOVQ acc3, AX
   283		MOVQ acc3, t1
   284		SHLQ $32, acc3
   285		MULQ p256const1<>(SB)
   286		SHRQ $32, t1
   287		ADDQ acc3, acc0
   288		ADCQ t1, acc1
   289		ADCQ AX, acc2
   290		ADCQ $0, DX
   291		MOVQ DX, acc3
   292		// Add bits [511:256] of the sqr result
   293		ADCQ acc4, acc0 // CF is 0 here: the preceding ADCQ $0, DX cannot overflow a high product word, so this behaves as ADDQ
   294		ADCQ acc5, acc1
   295		ADCQ y_ptr, acc2
   296		ADCQ x_ptr, acc3
   297		ADCQ $0, t0
   298	
   299		MOVQ acc0, acc4 // keep a copy of the unreduced value for the conditional move below
   300		MOVQ acc1, acc5
   301		MOVQ acc2, y_ptr
   302		MOVQ acc3, t1
   303		// Subtract p256
   304		SUBQ $-1, acc0
   305		SBBQ p256const0<>(SB) ,acc1
   306		SBBQ $0, acc2
   307		SBBQ p256const1<>(SB), acc3
   308		SBBQ $0, t0 // final borrow set means the value (including its carry bit) was smaller than p
   309	
   310		CMOVQCS acc4, acc0 // on borrow, restore the copy; otherwise keep value - p
   311		CMOVQCS acc5, acc1
   312		CMOVQCS y_ptr, acc2
   313		CMOVQCS t1, acc3
   314	
   315		MOVQ acc0, (8*0)(res_ptr)
   316		MOVQ acc1, (8*1)(res_ptr)
   317		MOVQ acc2, (8*2)(res_ptr)
   318		MOVQ acc3, (8*3)(res_ptr)
   319	
   320		RET
   321	/* ---------------------------------------*/
   322	// func p256Mul(res, in1, in2 []uint64)
   323	TEXT ·p256Mul(SB),NOSPLIT,$0 // multiplies the 4-limb values at in1 and in2, interleaving one reduction step by p after each in2 word, then conditionally subtracts p
   324		MOVQ res+0(FP), res_ptr
   325		MOVQ in1+24(FP), x_ptr
   326		MOVQ in2+48(FP), y_ptr
   327		// x * y[0]
   328		MOVQ (8*0)(y_ptr), t0
   329	
   330		MOVQ (8*0)(x_ptr), AX
   331		MULQ t0
   332		MOVQ AX, acc0
   333		MOVQ DX, acc1
   334	
   335		MOVQ (8*1)(x_ptr), AX
   336		MULQ t0
   337		ADDQ AX, acc1
   338		ADCQ $0, DX
   339		MOVQ DX, acc2
   340	
   341		MOVQ (8*2)(x_ptr), AX
   342		MULQ t0
   343		ADDQ AX, acc2
   344		ADCQ $0, DX
   345		MOVQ DX, acc3
   346	
   347		MOVQ (8*3)(x_ptr), AX
   348		MULQ t0
   349		ADDQ AX, acc3
   350		ADCQ $0, DX
   351		MOVQ DX, acc4
   352		XORQ acc5, acc5 // acc5 = next (initially empty) top limb
   353		// First reduction step: add acc0*p so limb 0 cancels (p+1 = 2^96 + p256const1*2^192)
   354		MOVQ acc0, AX
   355		MOVQ acc0, t1
   356		SHLQ $32, acc0 // low half of acc0*2^96
   357		MULQ p256const1<>(SB) // DX:AX = acc0*p256const1
   358		SHRQ $32, t1 // high half of acc0*2^96
   359		ADDQ acc0, acc1
   360		ADCQ t1, acc2
   361		ADCQ AX, acc3
   362		ADCQ DX, acc4
   363		ADCQ $0, acc5
   364		XORQ acc0, acc0 // limb 0 is eliminated; acc0 is recycled as a fresh top limb
   365		// x * y[1] (accumulated into acc1..acc5, carry into acc0)
   366		MOVQ (8*1)(y_ptr), t0
   367	
   368		MOVQ (8*0)(x_ptr), AX
   369		MULQ t0
   370		ADDQ AX, acc1
   371		ADCQ $0, DX
   372		MOVQ DX, t1 // t1 carries the high word into the next column
   373	
   374		MOVQ (8*1)(x_ptr), AX
   375		MULQ t0
   376		ADDQ t1, acc2
   377		ADCQ $0, DX
   378		ADDQ AX, acc2
   379		ADCQ $0, DX
   380		MOVQ DX, t1
   381	
   382		MOVQ (8*2)(x_ptr), AX
   383		MULQ t0
   384		ADDQ t1, acc3
   385		ADCQ $0, DX
   386		ADDQ AX, acc3
   387		ADCQ $0, DX
   388		MOVQ DX, t1
   389	
   390		MOVQ (8*3)(x_ptr), AX
   391		MULQ t0
   392		ADDQ t1, acc4
   393		ADCQ $0, DX
   394		ADDQ AX, acc4
   395		ADCQ DX, acc5
   396		ADCQ $0, acc0
   397		// Second reduction step (eliminating acc1)
   398		MOVQ acc1, AX
   399		MOVQ acc1, t1
   400		SHLQ $32, acc1
   401		MULQ p256const1<>(SB)
   402		SHRQ $32, t1
   403		ADDQ acc1, acc2
   404		ADCQ t1, acc3
   405		ADCQ AX, acc4
   406		ADCQ DX, acc5
   407		ADCQ $0, acc0
   408		XORQ acc1, acc1 // acc1 recycled as the next top limb
   409		// x * y[2] (accumulated into acc2..acc0, carry into acc1)
   410		MOVQ (8*2)(y_ptr), t0
   411	
   412		MOVQ (8*0)(x_ptr), AX
   413		MULQ t0
   414		ADDQ AX, acc2
   415		ADCQ $0, DX
   416		MOVQ DX, t1
   417	
   418		MOVQ (8*1)(x_ptr), AX
   419		MULQ t0
   420		ADDQ t1, acc3
   421		ADCQ $0, DX
   422		ADDQ AX, acc3
   423		ADCQ $0, DX
   424		MOVQ DX, t1
   425	
   426		MOVQ (8*2)(x_ptr), AX
   427		MULQ t0
   428		ADDQ t1, acc4
   429		ADCQ $0, DX
   430		ADDQ AX, acc4
   431		ADCQ $0, DX
   432		MOVQ DX, t1
   433	
   434		MOVQ (8*3)(x_ptr), AX
   435		MULQ t0
   436		ADDQ t1, acc5
   437		ADCQ $0, DX
   438		ADDQ AX, acc5
   439		ADCQ DX, acc0
   440		ADCQ $0, acc1
   441		// Third reduction step (eliminating acc2)
   442		MOVQ acc2, AX
   443		MOVQ acc2, t1
   444		SHLQ $32, acc2
   445		MULQ p256const1<>(SB)
   446		SHRQ $32, t1
   447		ADDQ acc2, acc3
   448		ADCQ t1, acc4
   449		ADCQ AX, acc5
   450		ADCQ DX, acc0
   451		ADCQ $0, acc1
   452		XORQ acc2, acc2 // acc2 recycled as the final carry limb
   453		// x * y[3] (accumulated into acc3..acc1, carry into acc2)
   454		MOVQ (8*3)(y_ptr), t0
   455	
   456		MOVQ (8*0)(x_ptr), AX
   457		MULQ t0
   458		ADDQ AX, acc3
   459		ADCQ $0, DX
   460		MOVQ DX, t1
   461	
   462		MOVQ (8*1)(x_ptr), AX
   463		MULQ t0
   464		ADDQ t1, acc4
   465		ADCQ $0, DX
   466		ADDQ AX, acc4
   467		ADCQ $0, DX
   468		MOVQ DX, t1
   469	
   470		MOVQ (8*2)(x_ptr), AX
   471		MULQ t0
   472		ADDQ t1, acc5
   473		ADCQ $0, DX
   474		ADDQ AX, acc5
   475		ADCQ $0, DX
   476		MOVQ DX, t1
   477	
   478		MOVQ (8*3)(x_ptr), AX
   479		MULQ t0
   480		ADDQ t1, acc0
   481		ADCQ $0, DX
   482		ADDQ AX, acc0
   483		ADCQ DX, acc1
   484		ADCQ $0, acc2
   485		// Last reduction step (eliminating acc3); the result is then acc4, acc5, acc0, acc1 with carry bit in acc2
   486		MOVQ acc3, AX
   487		MOVQ acc3, t1
   488		SHLQ $32, acc3
   489		MULQ p256const1<>(SB)
   490		SHRQ $32, t1
   491		ADDQ acc3, acc4
   492		ADCQ t1, acc5
   493		ADCQ AX, acc0
   494		ADCQ DX, acc1
   495		ADCQ $0, acc2
   496		// Copy result [255:0] (x_ptr and acc3 are free now and serve as scratch)
   497		MOVQ acc4, x_ptr
   498		MOVQ acc5, acc3
   499		MOVQ acc0, t0
   500		MOVQ acc1, t1
   501		// Subtract p256
   502		SUBQ $-1, acc4
   503		SBBQ p256const0<>(SB) ,acc5
   504		SBBQ $0, acc0
   505		SBBQ p256const1<>(SB), acc1
   506		SBBQ $0, acc2 // final borrow set means the value (including its carry limb) was smaller than p
   507	
   508		CMOVQCS x_ptr, acc4 // on borrow, restore the copy; otherwise keep value - p
   509		CMOVQCS acc3, acc5
   510		CMOVQCS t0, acc0
   511		CMOVQCS t1, acc1
   512	
   513		MOVQ acc4, (8*0)(res_ptr)
   514		MOVQ acc5, (8*1)(res_ptr)
   515		MOVQ acc0, (8*2)(res_ptr)
   516		MOVQ acc1, (8*3)(res_ptr)
   517	
   518		RET
   519	/* ---------------------------------------*/
   520	// func p256FromMont(res, in []uint64)
   521	TEXT ·p256FromMont(SB),NOSPLIT,$0 // applies the four word-wise reduction stages by p to the 4-limb input, then conditionally subtracts p
   522		MOVQ res+0(FP), res_ptr
   523		MOVQ in+24(FP), x_ptr
   524	
   525		MOVQ (8*0)(x_ptr), acc0
   526		MOVQ (8*1)(x_ptr), acc1
   527		MOVQ (8*2)(x_ptr), acc2
   528		MOVQ (8*3)(x_ptr), acc3
   529		XORQ acc4, acc4 // limb 4 starts empty
   530	
   531		// Only reduce, no multiplications are needed
   532		// First stage: add acc0*p so limb 0 cancels (p+1 = 2^96 + p256const1*2^192)
   533		MOVQ acc0, AX
   534		MOVQ acc0, t1
   535		SHLQ $32, acc0 // low half of acc0*2^96
   536		MULQ p256const1<>(SB) // DX:AX = acc0*p256const1
   537		SHRQ $32, t1 // high half of acc0*2^96
   538		ADDQ acc0, acc1
   539		ADCQ t1, acc2
   540		ADCQ AX, acc3
   541		ADCQ DX, acc4
   542		XORQ acc5, acc5 // next top limb
   543		// Second stage (eliminating acc1)
   544		MOVQ acc1, AX
   545		MOVQ acc1, t1
   546		SHLQ $32, acc1
   547		MULQ p256const1<>(SB)
   548		SHRQ $32, t1
   549		ADDQ acc1, acc2
   550		ADCQ t1, acc3
   551		ADCQ AX, acc4
   552		ADCQ DX, acc5
   553		XORQ acc0, acc0 // acc0 recycled as the next top limb
   554		// Third stage (eliminating acc2)
   555		MOVQ acc2, AX
   556		MOVQ acc2, t1
   557		SHLQ $32, acc2
   558		MULQ p256const1<>(SB)
   559		SHRQ $32, t1
   560		ADDQ acc2, acc3
   561		ADCQ t1, acc4
   562		ADCQ AX, acc5
   563		ADCQ DX, acc0
   564		XORQ acc1, acc1 // acc1 recycled as the next top limb
   565		// Last stage (eliminating acc3); the result is acc4, acc5, acc0, acc1
   566		MOVQ acc3, AX
   567		MOVQ acc3, t1
   568		SHLQ $32, acc3
   569		MULQ p256const1<>(SB)
   570		SHRQ $32, t1
   571		ADDQ acc3, acc4
   572		ADCQ t1, acc5
   573		ADCQ AX, acc0
   574		ADCQ DX, acc1
   575	
   576		MOVQ acc4, x_ptr // keep a copy of the unreduced value (x_ptr and acc3 serve as scratch)
   577		MOVQ acc5, acc3
   578		MOVQ acc0, t0
   579		MOVQ acc1, t1
   580	
   581		SUBQ $-1, acc4 // subtract p; unlike p256Mul no fifth carry limb is tracked here
   582		SBBQ p256const0<>(SB), acc5
   583		SBBQ $0, acc0
   584		SBBQ p256const1<>(SB), acc1
   585	
   586		CMOVQCS x_ptr, acc4 // on borrow (value < p), restore the copy
   587		CMOVQCS acc3, acc5
   588		CMOVQCS t0, acc0
   589		CMOVQCS t1, acc1
   590	
   591		MOVQ acc4, (8*0)(res_ptr)
   592		MOVQ acc5, (8*1)(res_ptr)
   593		MOVQ acc0, (8*2)(res_ptr)
   594		MOVQ acc1, (8*3)(res_ptr)
   595	
   596		RET
   597	/* ---------------------------------------*/
   598	// Constant time point access to arbitrary point table.
   599	// Indexed from 1 to 15, with -1 offset
   600	// (index 0 is implicitly point at infinity)
   601	// func p256Select(point, table []uint64, idx int)
   602	TEXT ·p256Select(SB),NOSPLIT,$0 // reads every 96-byte table entry and keeps the one whose 1-based position equals idx; no match leaves point all zero
   603		MOVQ idx+48(FP),AX
   604		MOVQ table+24(FP),DI
   605		MOVQ point+0(FP),DX
   606	
   607		PXOR X15, X15	// X15 = 0
   608		PCMPEQL X14, X14 // X14 = -1
   609		PSUBL X14, X15   // X15 = 1
   610		MOVL AX, X14 // only the low 32 bits of idx are used
   611		PSHUFD $0, X14, X14 // X14 = idx broadcast to all four dwords
   612	
   613		PXOR X0, X0 // X0-X5 accumulate the selected entry
   614		PXOR X1, X1
   615		PXOR X2, X2
   616		PXOR X3, X3
   617		PXOR X4, X4
   618		PXOR X5, X5
   619		MOVQ $16, AX // 16 iterations; NOTE(review): the header comment says indices 1 to 15 - confirm the table holds 16 entries
   620	
   621		MOVOU X15, X13 // X13 = running 1-based entry counter
   622	
   623	loop_select:
   624	
   625			MOVOU X13, X12
   626			PADDL X15, X13 // counter++
   627			PCMPEQL X14, X12 // X12 = all ones iff this entry's number equals idx
   628	
   629			MOVOU (16*0)(DI), X6 // load the whole entry unconditionally so the access pattern does not depend on idx
   630			MOVOU (16*1)(DI), X7
   631			MOVOU (16*2)(DI), X8
   632			MOVOU (16*3)(DI), X9
   633			MOVOU (16*4)(DI), X10
   634			MOVOU (16*5)(DI), X11
   635			ADDQ $(16*6), DI // advance to the next 96-byte entry
   636	
   637			PAND X12, X6 // zero the entry unless it is the selected one
   638			PAND X12, X7
   639			PAND X12, X8
   640			PAND X12, X9
   641			PAND X12, X10
   642			PAND X12, X11
   643	
   644			PXOR X6, X0 // fold into the accumulator
   645			PXOR X7, X1
   646			PXOR X8, X2
   647			PXOR X9, X3
   648			PXOR X10, X4
   649			PXOR X11, X5
   650	
   651			DECQ AX
   652			JNE loop_select
   653	
   654		MOVOU X0, (16*0)(DX) // store the 96-byte result
   655		MOVOU X1, (16*1)(DX)
   656		MOVOU X2, (16*2)(DX)
   657		MOVOU X3, (16*3)(DX)
   658		MOVOU X4, (16*4)(DX)
   659		MOVOU X5, (16*5)(DX)
   660	
   661		RET
   662	/* ---------------------------------------*/
   663	// Constant time point access to base point table.
   664	// func p256SelectBase(point, table []uint64, idx int)
   665	TEXT ·p256SelectBase(SB),NOSPLIT,$0 // reads every 64-byte table entry and keeps the one whose 1-based position equals idx; no match leaves point all zero
   666		MOVQ idx+48(FP),AX
   667		MOVQ table+24(FP),DI
   668		MOVQ point+0(FP),DX
   669	
   670		PXOR X15, X15	// X15 = 0
   671		PCMPEQL X14, X14 // X14 = -1
   672		PSUBL X14, X15   // X15 = 1
   673		MOVL AX, X14 // only the low 32 bits of idx are used
   674		PSHUFD $0, X14, X14 // X14 = idx broadcast to all four dwords
   675	
   676		PXOR X0, X0 // X0-X3 accumulate the selected 64-byte entry
   677		PXOR X1, X1
   678		PXOR X2, X2
   679		PXOR X3, X3
   680		MOVQ $32, AX // 32 iterations, two entries per iteration: positions 1..64 are scanned
   681	
   682		MOVOU X15, X13 // X13 = running 1-based entry counter
   683	
   684	loop_select_base:
   685	
   686			MOVOU X13, X12
   687			PADDL X15, X13 // counter++
   688			PCMPEQL X14, X12 // mask for the first entry of this pair
   689	
   690			MOVOU (16*0)(DI), X4 // first entry (64 bytes)
   691			MOVOU (16*1)(DI), X5
   692			MOVOU (16*2)(DI), X6
   693			MOVOU (16*3)(DI), X7
   694	
   695			MOVOU (16*4)(DI), X8 // second entry (64 bytes)
   696			MOVOU (16*5)(DI), X9
   697			MOVOU (16*6)(DI), X10
   698			MOVOU (16*7)(DI), X11
   699	
   700			ADDQ $(16*8), DI // advance past both entries
   701	
   702			PAND X12, X4 // zero the first entry unless selected
   703			PAND X12, X5
   704			PAND X12, X6
   705			PAND X12, X7
   706	
   707			MOVOU X13, X12
   708			PADDL X15, X13 // counter++
   709			PCMPEQL X14, X12 // mask for the second entry of this pair
   710	
   711			PAND X12, X8 // zero the second entry unless selected
   712			PAND X12, X9
   713			PAND X12, X10
   714			PAND X12, X11
   715	
   716			PXOR X4, X0 // fold both masked entries into the accumulator
   717			PXOR X5, X1
   718			PXOR X6, X2
   719			PXOR X7, X3
   720	
   721			PXOR X8, X0
   722			PXOR X9, X1
   723			PXOR X10, X2
   724			PXOR X11, X3
   725	
   726			DECQ AX
   727			JNE loop_select_base
   728	
   729		MOVOU X0, (16*0)(DX) // store the 64-byte result
   730		MOVOU X1, (16*1)(DX)
   731		MOVOU X2, (16*2)(DX)
   732		MOVOU X3, (16*3)(DX)
   733	
   734		RET
   735	/* ---------------------------------------*/
   736	// func p256OrdMul(res, in1, in2 []uint64)
   737	TEXT ·p256OrdMul(SB),NOSPLIT,$0 // multiplies the 4-limb values at in1 and in2 with one reduction step by p256ord after each in2 word, then conditionally subtracts p256ord
   738		MOVQ res+0(FP), res_ptr
   739		MOVQ in1+24(FP), x_ptr
   740		MOVQ in2+48(FP), y_ptr
   741		// x * y[0]
   742		MOVQ (8*0)(y_ptr), t0
   743	
   744		MOVQ (8*0)(x_ptr), AX
   745		MULQ t0
   746		MOVQ AX, acc0
   747		MOVQ DX, acc1
   748	
   749		MOVQ (8*1)(x_ptr), AX
   750		MULQ t0
   751		ADDQ AX, acc1
   752		ADCQ $0, DX
   753		MOVQ DX, acc2
   754	
   755		MOVQ (8*2)(x_ptr), AX
   756		MULQ t0
   757		ADDQ AX, acc2
   758		ADCQ $0, DX
   759		MOVQ DX, acc3
   760	
   761		MOVQ (8*3)(x_ptr), AX
   762		MULQ t0
   763		ADDQ AX, acc3
   764		ADCQ $0, DX
   765		MOVQ DX, acc4
   766		XORQ acc5, acc5 // acc5 = next (initially empty) top limb
   767		// First reduction step: t0 = low64(acc0 * p256ordK0), then add t0 * p256ord to the accumulator
   768		MOVQ acc0, AX
   769		MULQ p256ordK0<>(SB)
   770		MOVQ AX, t0 // only the low 64 bits of the product are used
   771	
   772		MOVQ p256ord<>+0x00(SB), AX
   773		MULQ t0
   774		ADDQ AX, acc0 // acc0 is reused below as a fresh carry limb, i.e. this sum is expected to leave it zero
   775		ADCQ $0, DX
   776		MOVQ DX, t1 // t1 carries the high word into the next column
   777	
   778		MOVQ p256ord<>+0x08(SB), AX
   779		MULQ t0
   780		ADDQ t1, acc1
   781		ADCQ $0, DX
   782		ADDQ AX, acc1
   783		ADCQ $0, DX
   784		MOVQ DX, t1
   785	
   786		MOVQ p256ord<>+0x10(SB), AX
   787		MULQ t0
   788		ADDQ t1, acc2
   789		ADCQ $0, DX
   790		ADDQ AX, acc2
   791		ADCQ $0, DX
   792		MOVQ DX, t1
   793	
   794		MOVQ p256ord<>+0x18(SB), AX
   795		MULQ t0
   796		ADDQ t1, acc3
   797		ADCQ $0, DX
   798		ADDQ AX, acc3
   799		ADCQ DX, acc4
   800		ADCQ $0, acc5
   801		// x * y[1] (accumulated into acc1..acc5, carry into acc0)
   802		MOVQ (8*1)(y_ptr), t0
   803	
   804		MOVQ (8*0)(x_ptr), AX
   805		MULQ t0
   806		ADDQ AX, acc1
   807		ADCQ $0, DX
   808		MOVQ DX, t1
   809	
   810		MOVQ (8*1)(x_ptr), AX
   811		MULQ t0
   812		ADDQ t1, acc2
   813		ADCQ $0, DX
   814		ADDQ AX, acc2
   815		ADCQ $0, DX
   816		MOVQ DX, t1
   817	
   818		MOVQ (8*2)(x_ptr), AX
   819		MULQ t0
   820		ADDQ t1, acc3
   821		ADCQ $0, DX
   822		ADDQ AX, acc3
   823		ADCQ $0, DX
   824		MOVQ DX, t1
   825	
   826		MOVQ (8*3)(x_ptr), AX
   827		MULQ t0
   828		ADDQ t1, acc4
   829		ADCQ $0, DX
   830		ADDQ AX, acc4
   831		ADCQ DX, acc5
   832		ADCQ $0, acc0
   833		// Second reduction step (eliminating acc1, which then serves as a carry limb)
   834		MOVQ acc1, AX
   835		MULQ p256ordK0<>(SB)
   836		MOVQ AX, t0
   837	
   838		MOVQ p256ord<>+0x00(SB), AX
   839		MULQ t0
   840		ADDQ AX, acc1
   841		ADCQ $0, DX
   842		MOVQ DX, t1
   843	
   844		MOVQ p256ord<>+0x08(SB), AX
   845		MULQ t0
   846		ADDQ t1, acc2
   847		ADCQ $0, DX
   848		ADDQ AX, acc2
   849		ADCQ $0, DX
   850		MOVQ DX, t1
   851	
   852		MOVQ p256ord<>+0x10(SB), AX
   853		MULQ t0
   854		ADDQ t1, acc3
   855		ADCQ $0, DX
   856		ADDQ AX, acc3
   857		ADCQ $0, DX
   858		MOVQ DX, t1
   859	
   860		MOVQ p256ord<>+0x18(SB), AX
   861		MULQ t0
   862		ADDQ t1, acc4
   863		ADCQ $0, DX
   864		ADDQ AX, acc4
   865		ADCQ DX, acc5
   866		ADCQ $0, acc0
   867		// x * y[2] (accumulated into acc2..acc0, carry into acc1)
   868		MOVQ (8*2)(y_ptr), t0
   869	
   870		MOVQ (8*0)(x_ptr), AX
   871		MULQ t0
   872		ADDQ AX, acc2
   873		ADCQ $0, DX
   874		MOVQ DX, t1
   875	
   876		MOVQ (8*1)(x_ptr), AX
   877		MULQ t0
   878		ADDQ t1, acc3
   879		ADCQ $0, DX
   880		ADDQ AX, acc3
   881		ADCQ $0, DX
   882		MOVQ DX, t1
   883	
   884		MOVQ (8*2)(x_ptr), AX
   885		MULQ t0
   886		ADDQ t1, acc4
   887		ADCQ $0, DX
   888		ADDQ AX, acc4
   889		ADCQ $0, DX
   890		MOVQ DX, t1
   891	
   892		MOVQ (8*3)(x_ptr), AX
   893		MULQ t0
   894		ADDQ t1, acc5
   895		ADCQ $0, DX
   896		ADDQ AX, acc5
   897		ADCQ DX, acc0
   898		ADCQ $0, acc1
   899		// Third reduction step (eliminating acc2, which then serves as a carry limb)
   900		MOVQ acc2, AX
   901		MULQ p256ordK0<>(SB)
   902		MOVQ AX, t0
   903	
   904		MOVQ p256ord<>+0x00(SB), AX
   905		MULQ t0
   906		ADDQ AX, acc2
   907		ADCQ $0, DX
   908		MOVQ DX, t1
   909	
   910		MOVQ p256ord<>+0x08(SB), AX
   911		MULQ t0
   912		ADDQ t1, acc3
   913		ADCQ $0, DX
   914		ADDQ AX, acc3
   915		ADCQ $0, DX
   916		MOVQ DX, t1
   917	
   918		MOVQ p256ord<>+0x10(SB), AX
   919		MULQ t0
   920		ADDQ t1, acc4
   921		ADCQ $0, DX
   922		ADDQ AX, acc4
   923		ADCQ $0, DX
   924		MOVQ DX, t1
   925	
   926		MOVQ p256ord<>+0x18(SB), AX
   927		MULQ t0
   928		ADDQ t1, acc5
   929		ADCQ $0, DX
   930		ADDQ AX, acc5
   931		ADCQ DX, acc0
   932		ADCQ $0, acc1
   933		// x * y[3] (accumulated into acc3..acc1, carry into acc2)
   934		MOVQ (8*3)(y_ptr), t0
   935	
   936		MOVQ (8*0)(x_ptr), AX
   937		MULQ t0
   938		ADDQ AX, acc3
   939		ADCQ $0, DX
   940		MOVQ DX, t1
   941	
   942		MOVQ (8*1)(x_ptr), AX
   943		MULQ t0
   944		ADDQ t1, acc4
   945		ADCQ $0, DX
   946		ADDQ AX, acc4
   947		ADCQ $0, DX
   948		MOVQ DX, t1
   949	
   950		MOVQ (8*2)(x_ptr), AX
   951		MULQ t0
   952		ADDQ t1, acc5
   953		ADCQ $0, DX
   954		ADDQ AX, acc5
   955		ADCQ $0, DX
   956		MOVQ DX, t1
   957	
   958		MOVQ (8*3)(x_ptr), AX
   959		MULQ t0
   960		ADDQ t1, acc0
   961		ADCQ $0, DX
   962		ADDQ AX, acc0
   963		ADCQ DX, acc1
   964		ADCQ $0, acc2
   965		// Last reduction step (eliminating acc3); the result is then acc4, acc5, acc0, acc1 with carry bit in acc2
   966		MOVQ acc3, AX
   967		MULQ p256ordK0<>(SB)
   968		MOVQ AX, t0
   969	
   970		MOVQ p256ord<>+0x00(SB), AX
   971		MULQ t0
   972		ADDQ AX, acc3
   973		ADCQ $0, DX
   974		MOVQ DX, t1
   975	
   976		MOVQ p256ord<>+0x08(SB), AX
   977		MULQ t0
   978		ADDQ t1, acc4
   979		ADCQ $0, DX
   980		ADDQ AX, acc4
   981		ADCQ $0, DX
   982		MOVQ DX, t1
   983	
   984		MOVQ p256ord<>+0x10(SB), AX
   985		MULQ t0
   986		ADDQ t1, acc5
   987		ADCQ $0, DX
   988		ADDQ AX, acc5
   989		ADCQ $0, DX
   990		MOVQ DX, t1
   991	
   992		MOVQ p256ord<>+0x18(SB), AX
   993		MULQ t0
   994		ADDQ t1, acc0
   995		ADCQ $0, DX
   996		ADDQ AX, acc0
   997		ADCQ DX, acc1
   998		ADCQ $0, acc2
   999		// Copy result [255:0] (x_ptr and acc3 are free now and serve as scratch)
  1000		MOVQ acc4, x_ptr
  1001		MOVQ acc5, acc3
  1002		MOVQ acc0, t0
  1003		MOVQ acc1, t1
  1004		// Subtract p256ord (the modulus of this routine, not the field prime)
  1005		SUBQ p256ord<>+0x00(SB), acc4
  1006		SBBQ p256ord<>+0x08(SB) ,acc5
  1007		SBBQ p256ord<>+0x10(SB), acc0
  1008		SBBQ p256ord<>+0x18(SB), acc1
  1009		SBBQ $0, acc2 // final borrow set means the value (including its carry limb) was smaller than p256ord
  1010	
  1011		CMOVQCS x_ptr, acc4 // on borrow, restore the copy; otherwise keep value - p256ord
  1012		CMOVQCS acc3, acc5
  1013		CMOVQCS t0, acc0
  1014		CMOVQCS t1, acc1
  1015	
  1016		MOVQ acc4, (8*0)(res_ptr)
  1017		MOVQ acc5, (8*1)(res_ptr)
  1018		MOVQ acc0, (8*2)(res_ptr)
  1019		MOVQ acc1, (8*3)(res_ptr)
  1020	
  1021		RET
  1022	/* ---------------------------------------*/
  1023	// func p256OrdSqr(res, in []uint64, n int)
  1024	TEXT ·p256OrdSqr(SB),NOSPLIT,$0 // performs n successive square-and-reduce passes modulo p256ord, starting from in; each pass stores to res and the next pass reads res
  1025		MOVQ res+0(FP), res_ptr
  1026		MOVQ in+24(FP), x_ptr
  1027		MOVQ n+48(FP), BX // loop counter; the body runs before the counter is tested, so n is expected to be >= 1
  1028	
  1029	ordSqrLoop:
  1030	
  1031		// y[1:] * y[0] (y denotes the limbs at x_ptr; off-diagonal products are computed once and doubled below)
  1032		MOVQ (8*0)(x_ptr), t0
  1033	
  1034		MOVQ (8*1)(x_ptr), AX
  1035		MULQ t0
  1036		MOVQ AX, acc1
  1037		MOVQ DX, acc2
  1038	
  1039		MOVQ (8*2)(x_ptr), AX
  1040		MULQ t0
  1041		ADDQ AX, acc2
  1042		ADCQ $0, DX
  1043		MOVQ DX, acc3
  1044	
  1045		MOVQ (8*3)(x_ptr), AX
  1046		MULQ t0
  1047		ADDQ AX, acc3
  1048		ADCQ $0, DX
  1049		MOVQ DX, acc4
  1050		// y[2:] * y[1]
  1051		MOVQ (8*1)(x_ptr), t0
  1052	
  1053		MOVQ (8*2)(x_ptr), AX
  1054		MULQ t0
  1055		ADDQ AX, acc3
  1056		ADCQ $0, DX
  1057		MOVQ DX, t1
  1058	
  1059		MOVQ (8*3)(x_ptr), AX
  1060		MULQ t0
  1061		ADDQ t1, acc4
  1062		ADCQ $0, DX
  1063		ADDQ AX, acc4
  1064		ADCQ $0, DX
  1065		MOVQ DX, acc5
  1066		// y[3] * y[2]
  1067		MOVQ (8*2)(x_ptr), t0
  1068	
  1069		MOVQ (8*3)(x_ptr), AX
  1070		MULQ t0
  1071		ADDQ AX, acc5
  1072		ADCQ $0, DX
  1073		MOVQ DX, y_ptr // y_ptr is used as limb 6 of the product from here on
  1074		XORQ t1, t1 // t1 will become limb 7
  1075		// *2 (double the off-diagonal sum held in limbs 1..6, carry out into t1)
  1076		ADDQ acc1, acc1
  1077		ADCQ acc2, acc2
  1078		ADCQ acc3, acc3
  1079		ADCQ acc4, acc4
  1080		ADCQ acc5, acc5
  1081		ADCQ y_ptr, y_ptr
  1082		ADCQ $0, t1
  1083		// Missing products (the diagonal squares y[i]*y[i])
  1084		MOVQ (8*0)(x_ptr), AX
  1085		MULQ AX
  1086		MOVQ AX, acc0
  1087		MOVQ DX, t0
  1088	
  1089		MOVQ (8*1)(x_ptr), AX
  1090		MULQ AX
  1091		ADDQ t0, acc1
  1092		ADCQ AX, acc2
  1093		ADCQ $0, DX
  1094		MOVQ DX, t0
  1095	
  1096		MOVQ (8*2)(x_ptr), AX
  1097		MULQ AX
  1098		ADDQ t0, acc3
  1099		ADCQ AX, acc4
  1100		ADCQ $0, DX
  1101		MOVQ DX, t0
  1102	
  1103		MOVQ (8*3)(x_ptr), AX
  1104		MULQ AX
  1105		ADDQ t0, acc5
  1106		ADCQ AX, y_ptr
  1107		ADCQ DX, t1
  1108		MOVQ t1, x_ptr // x_ptr now holds limb 7 of the 512-bit square; it is reloaded from res_ptr at the bottom of the loop
  1109		// First reduction step: t0 = low64(acc0 * p256ordK0), then add t0 * p256ord
  1110		MOVQ acc0, AX
  1111		MULQ p256ordK0<>(SB)
  1112		MOVQ AX, t0
  1113	
  1114		MOVQ p256ord<>+0x00(SB), AX
  1115		MULQ t0
  1116		ADDQ AX, acc0 // acc0 is overwritten below, i.e. this sum is expected to leave it zero
  1117		ADCQ $0, DX
  1118		MOVQ DX, t1
  1119	
  1120		MOVQ p256ord<>+0x08(SB), AX
  1121		MULQ t0
  1122		ADDQ t1, acc1
  1123		ADCQ $0, DX
  1124		ADDQ AX, acc1
  1125	
  1126		MOVQ t0, t1 // limb 2 of p256ord is 2^64-1, so t0*limb2 = t0*2^64 - t0: add t0 one column up (via t1) and subtract it here
  1127		ADCQ DX, acc2
  1128		ADCQ $0, t1
  1129		SUBQ t0, acc2
  1130		SBBQ $0, t1
  1131	
  1132		MOVQ t0, AX // limb 3 of p256ord is 2^64-2^32, so t0*limb3 = t0*2^64 - t0*2^32, formed from t0 and its two 32-bit shifts
  1133		MOVQ t0, DX
  1134		MOVQ t0, acc0 // acc0 becomes the new top limb
  1135		SHLQ $32, AX
  1136		SHRQ $32, DX
  1137	
  1138		ADDQ t1, acc3
  1139		ADCQ $0, acc0
  1140		SUBQ AX, acc3
  1141		SBBQ DX, acc0
  1142		// Second reduction step (same pattern, eliminating acc1)
  1143		MOVQ acc1, AX
  1144		MULQ p256ordK0<>(SB)
  1145		MOVQ AX, t0
  1146	
  1147		MOVQ p256ord<>+0x00(SB), AX
  1148		MULQ t0
  1149		ADDQ AX, acc1
  1150		ADCQ $0, DX
  1151		MOVQ DX, t1
  1152	
  1153		MOVQ p256ord<>+0x08(SB), AX
  1154		MULQ t0
  1155		ADDQ t1, acc2
  1156		ADCQ $0, DX
  1157		ADDQ AX, acc2
  1158	
  1159		MOVQ t0, t1
  1160		ADCQ DX, acc3
  1161		ADCQ $0, t1
  1162		SUBQ t0, acc3
  1163		SBBQ $0, t1
  1164	
  1165		MOVQ t0, AX
  1166		MOVQ t0, DX
  1167		MOVQ t0, acc1
  1168		SHLQ $32, AX
  1169		SHRQ $32, DX
  1170	
  1171		ADDQ t1, acc0
  1172		ADCQ $0, acc1
  1173		SUBQ AX, acc0
  1174		SBBQ DX, acc1
  1175		// Third reduction step (eliminating acc2)
  1176		MOVQ acc2, AX
  1177		MULQ p256ordK0<>(SB)
  1178		MOVQ AX, t0
  1179	
  1180		MOVQ p256ord<>+0x00(SB), AX
  1181		MULQ t0
  1182		ADDQ AX, acc2
  1183		ADCQ $0, DX
  1184		MOVQ DX, t1
  1185	
  1186		MOVQ p256ord<>+0x08(SB), AX
  1187		MULQ t0
  1188		ADDQ t1, acc3
  1189		ADCQ $0, DX
  1190		ADDQ AX, acc3
  1191	
  1192		MOVQ t0, t1
  1193		ADCQ DX, acc0
  1194		ADCQ $0, t1
  1195		SUBQ t0, acc0
  1196		SBBQ $0, t1
  1197	
  1198		MOVQ t0, AX
  1199		MOVQ t0, DX
  1200		MOVQ t0, acc2
  1201		SHLQ $32, AX
  1202		SHRQ $32, DX
  1203	
  1204		ADDQ t1, acc1
  1205		ADCQ $0, acc2
  1206		SUBQ AX, acc1
  1207		SBBQ DX, acc2
  1208		// Last reduction step (eliminating acc3)
  1209		MOVQ acc3, AX
  1210		MULQ p256ordK0<>(SB)
  1211		MOVQ AX, t0
  1212	
  1213		MOVQ p256ord<>+0x00(SB), AX
  1214		MULQ t0
  1215		ADDQ AX, acc3
  1216		ADCQ $0, DX
  1217		MOVQ DX, t1
  1218	
  1219		MOVQ p256ord<>+0x08(SB), AX
  1220		MULQ t0
  1221		ADDQ t1, acc0
  1222		ADCQ $0, DX
  1223		ADDQ AX, acc0
  1224		ADCQ $0, DX // unlike the earlier steps the carry is folded into DX here, so the ADCQ DX below sees CF = 0; the sum is the same
  1225		MOVQ DX, t1 // NOTE(review): dead store - t1 is overwritten by the next instruction before being read
  1226	
  1227		MOVQ t0, t1
  1228		ADCQ DX, acc1
  1229		ADCQ $0, t1
  1230		SUBQ t0, acc1
  1231		SBBQ $0, t1
  1232	
  1233		MOVQ t0, AX
  1234		MOVQ t0, DX
  1235		MOVQ t0, acc3
  1236		SHLQ $32, AX
  1237		SHRQ $32, DX
  1238	
  1239		ADDQ t1, acc2
  1240		ADCQ $0, acc3
  1241		SUBQ AX, acc2
  1242		SBBQ DX, acc3
  1243		XORQ t0, t0 // t0 = 0 to collect the final carry; XORQ also clears CF, so the ADCQ chain below starts without carry-in
  1244		// Add bits [511:256] of the sqr result
  1245		ADCQ acc4, acc0
  1246		ADCQ acc5, acc1
  1247		ADCQ y_ptr, acc2
  1248		ADCQ x_ptr, acc3
  1249		ADCQ $0, t0
  1250	
  1251		MOVQ acc0, acc4 // keep a copy of the unreduced value for the conditional move below
  1252		MOVQ acc1, acc5
  1253		MOVQ acc2, y_ptr
  1254		MOVQ acc3, t1
  1255		// Subtract p256ord (the modulus of this routine, not the field prime)
  1256		SUBQ p256ord<>+0x00(SB), acc0
  1257		SBBQ p256ord<>+0x08(SB) ,acc1
  1258		SBBQ p256ord<>+0x10(SB), acc2
  1259		SBBQ p256ord<>+0x18(SB), acc3
  1260		SBBQ $0, t0 // final borrow set means the value (including its carry bit) was smaller than p256ord
  1261	
  1262		CMOVQCS acc4, acc0 // on borrow, restore the copy; otherwise keep value - p256ord
  1263		CMOVQCS acc5, acc1
  1264		CMOVQCS y_ptr, acc2
  1265		CMOVQCS t1, acc3
  1266	
  1267		MOVQ acc0, (8*0)(res_ptr)
  1268		MOVQ acc1, (8*1)(res_ptr)
  1269		MOVQ acc2, (8*2)(res_ptr)
  1270		MOVQ acc3, (8*3)(res_ptr)
  1271		MOVQ res_ptr, x_ptr // the next pass squares the value just stored
  1272		DECQ BX
  1273		JNE ordSqrLoop
  1274	
  1275		RET
  1276	/* ---------------------------------------*/
  1277	#undef res_ptr // drop the aliases of the exported routines above; the internal helpers below use a different register assignment
  1278	#undef x_ptr
  1279	#undef y_ptr
  1280	
  1281	#undef acc0
  1282	#undef acc1
  1283	#undef acc2
  1284	#undef acc3
  1285	#undef acc4
  1286	#undef acc5
  1287	#undef t0
  1288	#undef t1
  1289	/* ---------------------------------------*/
  1290	#define mul0 AX // mul0/mul1: the implicit low/high result registers of MULQ
  1291	#define mul1 DX
  1292	#define acc0 BX // acc0-acc3: low half of the product in p256MulInternal, saved difference in p256SubInternal
  1293	#define acc1 CX
  1294	#define acc2 R8
  1295	#define acc3 R9
  1296	#define acc4 R10 // acc4-acc7: first operand on entry to p256SubInternal/p256MulInternal
  1297	#define acc5 R11
  1298	#define acc6 R12
  1299	#define acc7 R13
  1300	#define t0 R14 // t0-t3: second operand of p256SubInternal/p256MulInternal
  1301	#define t1 R15
  1302	#define t2 DI
  1303	#define t3 SI
  1304	#define hlp BP // scratch word in p256MulInternal (column carries, shifted copies during reduction)
  1305	/* ---------------------------------------*/
  1306	TEXT p256SubInternal(SB),NOSPLIT,$0 // register-only helper: acc4..acc7 = (acc4..acc7 - t0..t3), with p added back if the subtraction borrowed; overwrites mul0 and acc0..acc3
  1307		XORQ mul0, mul0
  1308		SUBQ t0, acc4
  1309		SBBQ t1, acc5
  1310		SBBQ t2, acc6
  1311		SBBQ t3, acc7
  1312		SBBQ $0, mul0 // mul0 = all ones if the 256-bit subtraction borrowed, else 0
  1313	
  1314		MOVQ acc4, acc0 // save the raw difference
  1315		MOVQ acc5, acc1
  1316		MOVQ acc6, acc2
  1317		MOVQ acc7, acc3
  1318	
  1319		ADDQ $-1, acc4 // speculatively add p (limbs: -1, p256const0, 0, p256const1)
  1320		ADCQ p256const0<>(SB), acc5
  1321		ADCQ $0, acc6
  1322		ADCQ p256const1<>(SB), acc7
  1323		ANDQ $1, mul0 // ZF = 1 when there was no borrow
  1324	
  1325		CMOVQEQ acc0, acc4 // no borrow: restore the raw difference; borrow: keep difference + p
  1326		CMOVQEQ acc1, acc5
  1327		CMOVQEQ acc2, acc6
  1328		CMOVQEQ acc3, acc7
  1329	
  1330		RET
  1331	/* ---------------------------------------*/
      	// p256MulInternal multiplies [acc4..acc7] by [t0..t3] and reduces the
      	// 512-bit product back to four limbs.
      	//
      	// The product is accumulated in acc0..acc7 (acc0 least significant), one
      	// row per limb of the first operand. Each of the four reduction steps then
      	// adds (lowest limb) * p to the accumulator. Because
      	// p = p256const1*2^192 + 2^96 - 1, this clears the lowest limb, which is
      	// dropped; the four steps together therefore divide by 2^256 modulo p.
      	// The upper half of the product is added afterwards, followed by one
      	// conditional subtraction of p.
      	//
      	// In:     acc4..acc7, t0..t3 (t0..t3 are only read)
      	// Out:    acc4..acc7
      	// Writes: acc0..acc3, mul0, mul1, hlp, flags
  1332	TEXT p256MulInternal(SB),NOSPLIT,$0
      		// Row 0: acc4 * [t0..t3] -> acc0..acc3, high limb into acc4.
  1333		MOVQ acc4, mul0
  1334		MULQ t0
  1335		MOVQ mul0, acc0
  1336		MOVQ mul1, acc1
  1337	
  1338		MOVQ acc4, mul0
  1339		MULQ t1
  1340		ADDQ mul0, acc1
  1341		ADCQ $0, mul1
  1342		MOVQ mul1, acc2
  1343	
  1344		MOVQ acc4, mul0
  1345		MULQ t2
  1346		ADDQ mul0, acc2
  1347		ADCQ $0, mul1
  1348		MOVQ mul1, acc3
  1349	
  1350		MOVQ acc4, mul0
  1351		MULQ t3
  1352		ADDQ mul0, acc3
  1353		ADCQ $0, mul1
  1354		MOVQ mul1, acc4	// acc4 has been fully consumed; reuse it as limb 4
  1355	
      		// Row 1: acc5 * [t0..t3] added at limb 1; hlp carries the high word
      		// of each partial product into the next column.
  1356		MOVQ acc5, mul0
  1357		MULQ t0
  1358		ADDQ mul0, acc1
  1359		ADCQ $0, mul1
  1360		MOVQ mul1, hlp
  1361	
  1362		MOVQ acc5, mul0
  1363		MULQ t1
  1364		ADDQ hlp, acc2
  1365		ADCQ $0, mul1
  1366		ADDQ mul0, acc2
  1367		ADCQ $0, mul1
  1368		MOVQ mul1, hlp
  1369	
  1370		MOVQ acc5, mul0
  1371		MULQ t2
  1372		ADDQ hlp, acc3
  1373		ADCQ $0, mul1
  1374		ADDQ mul0, acc3
  1375		ADCQ $0, mul1
  1376		MOVQ mul1, hlp
  1377	
  1378		MOVQ acc5, mul0
  1379		MULQ t3
  1380		ADDQ hlp, acc4
  1381		ADCQ $0, mul1
  1382		ADDQ mul0, acc4
  1383		ADCQ $0, mul1
  1384		MOVQ mul1, acc5	// reuse acc5 as limb 5
  1385	
      		// Row 2: acc6 * [t0..t3] added at limb 2.
  1386		MOVQ acc6, mul0
  1387		MULQ t0
  1388		ADDQ mul0, acc2
  1389		ADCQ $0, mul1
  1390		MOVQ mul1, hlp
  1391	
  1392		MOVQ acc6, mul0
  1393		MULQ t1
  1394		ADDQ hlp, acc3
  1395		ADCQ $0, mul1
  1396		ADDQ mul0, acc3
  1397		ADCQ $0, mul1
  1398		MOVQ mul1, hlp
  1399	
  1400		MOVQ acc6, mul0
  1401		MULQ t2
  1402		ADDQ hlp, acc4
  1403		ADCQ $0, mul1
  1404		ADDQ mul0, acc4
  1405		ADCQ $0, mul1
  1406		MOVQ mul1, hlp
  1407	
  1408		MOVQ acc6, mul0
  1409		MULQ t3
  1410		ADDQ hlp, acc5
  1411		ADCQ $0, mul1
  1412		ADDQ mul0, acc5
  1413		ADCQ $0, mul1
  1414		MOVQ mul1, acc6	// reuse acc6 as limb 6
  1415	
      		// Row 3: acc7 * [t0..t3] added at limb 3.
  1416		MOVQ acc7, mul0
  1417		MULQ t0
  1418		ADDQ mul0, acc3
  1419		ADCQ $0, mul1
  1420		MOVQ mul1, hlp
  1421	
  1422		MOVQ acc7, mul0
  1423		MULQ t1
  1424		ADDQ hlp, acc4
  1425		ADCQ $0, mul1
  1426		ADDQ mul0, acc4
  1427		ADCQ $0, mul1
  1428		MOVQ mul1, hlp
  1429	
  1430		MOVQ acc7, mul0
  1431		MULQ t2
  1432		ADDQ hlp, acc5
  1433		ADCQ $0, mul1
  1434		ADDQ mul0, acc5
  1435		ADCQ $0, mul1
  1436		MOVQ mul1, hlp
  1437	
  1438		MOVQ acc7, mul0
  1439		MULQ t3
  1440		ADDQ hlp, acc6
  1441		ADCQ $0, mul1
  1442		ADDQ mul0, acc6
  1443		ADCQ $0, mul1
  1444		MOVQ mul1, acc7	// reuse acc7 as limb 7
  1445		// First reduction step
      		// acc0*p = acc0*2^96 + acc0*p256const1*2^192 - acc0. The "- acc0" cancels
      		// limb 0; (acc0<<32, acc0>>32) is the 2^96 term added at limbs 1 and 2;
      		// mul1:mul0 is the 2^192 term added at limbs 3 and 4.
  1446		MOVQ acc0, mul0
  1447		MOVQ acc0, hlp
  1448		SHLQ $32, acc0
  1449		MULQ p256const1<>(SB)
  1450		SHRQ $32, hlp
  1451		ADDQ acc0, acc1
  1452		ADCQ hlp, acc2
  1453		ADCQ mul0, acc3
  1454		ADCQ $0, mul1
  1455		MOVQ mul1, acc0	// acc0 is recycled as the limb above acc3
  1456		// Second reduction step
  1457		MOVQ acc1, mul0
  1458		MOVQ acc1, hlp
  1459		SHLQ $32, acc1
  1460		MULQ p256const1<>(SB)
  1461		SHRQ $32, hlp
  1462		ADDQ acc1, acc2
  1463		ADCQ hlp, acc3
  1464		ADCQ mul0, acc0
  1465		ADCQ $0, mul1
  1466		MOVQ mul1, acc1
  1467		// Third reduction step
  1468		MOVQ acc2, mul0
  1469		MOVQ acc2, hlp
  1470		SHLQ $32, acc2
  1471		MULQ p256const1<>(SB)
  1472		SHRQ $32, hlp
  1473		ADDQ acc2, acc3
  1474		ADCQ hlp, acc0
  1475		ADCQ mul0, acc1
  1476		ADCQ $0, mul1
  1477		MOVQ mul1, acc2
  1478		// Last reduction step
  1479		MOVQ acc3, mul0
  1480		MOVQ acc3, hlp
  1481		SHLQ $32, acc3
  1482		MULQ p256const1<>(SB)
  1483		SHRQ $32, hlp
  1484		ADDQ acc3, acc0
  1485		ADCQ hlp, acc1
  1486		ADCQ mul0, acc2
  1487		ADCQ $0, mul1
  1488		MOVQ mul1, acc3
      		// The MOV below is emitted as raw bytes. MOV does not modify flags, so
      		// the ADCQ chain that follows starts from CF as left by the ADCQ above.
      		// NOTE(review): that carry is presumably always clear here - confirm.
  1489		BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00   // MOVQ $0, BP
  1490		// Add bits [511:256] of the result
  1491		ADCQ acc0, acc4
  1492		ADCQ acc1, acc5
  1493		ADCQ acc2, acc6
  1494		ADCQ acc3, acc7
  1495		ADCQ $0, hlp	// hlp = carry out of the 256-bit addition
  1496		// Copy result
  1497		MOVQ acc4, acc0
  1498		MOVQ acc5, acc1
  1499		MOVQ acc6, acc2
  1500		MOVQ acc7, acc3
  1501		// Subtract p256
  1502		SUBQ $-1, acc4
  1503		SBBQ p256const0<>(SB) ,acc5
  1504		SBBQ $0, acc6
  1505		SBBQ p256const1<>(SB), acc7
  1506		SBBQ $0, hlp	// CF is set when the value (including the hlp carry) is below p
  1507		// If the result of the subtraction is negative, restore the previous result
  1508		CMOVQCS acc0, acc4
  1509		CMOVQCS acc1, acc5
  1510		CMOVQCS acc2, acc6
  1511		CMOVQCS acc3, acc7
  1512	
  1513		RET
  1514	/* ---------------------------------------*/
      	// p256SqrInternal squares [acc4..acc7] and reduces the 512-bit result back
      	// to four limbs.
      	//
      	// The cross products a_i*a_j (i < j) are computed once into
      	// acc1, acc2, acc3, t0, t1, t2, doubled (t3 catches the carry), and the
      	// diagonal squares a_i^2 are then added. The 512-bit square ends up in
      	// acc0, acc1, acc2, acc3, t0, t1, t2, t3 (least significant first). Four
      	// reduction steps each add (lowest limb) * p, which clears that limb since
      	// p = p256const1*2^192 + 2^96 - 1; the upper half is added and p is
      	// conditionally subtracted once.
      	//
      	// In:     acc4..acc7
      	// Out:    acc4..acc7
      	// Writes: acc0..acc3, t0..t3, mul0, mul1, hlp, flags
  1515	TEXT p256SqrInternal(SB),NOSPLIT,$0
  1516	
      		// a0*a1, a0*a2, a0*a3
  1517		MOVQ acc4, mul0
  1518		MULQ acc5
  1519		MOVQ mul0, acc1
  1520		MOVQ mul1, acc2
  1521	
  1522		MOVQ acc4, mul0
  1523		MULQ acc6
  1524		ADDQ mul0, acc2
  1525		ADCQ $0, mul1
  1526		MOVQ mul1, acc3
  1527	
  1528		MOVQ acc4, mul0
  1529		MULQ acc7
  1530		ADDQ mul0, acc3
  1531		ADCQ $0, mul1
  1532		MOVQ mul1, t0
  1533	
      		// a1*a2, a1*a3
  1534		MOVQ acc5, mul0
  1535		MULQ acc6
  1536		ADDQ mul0, acc3
  1537		ADCQ $0, mul1
  1538		MOVQ mul1, hlp
  1539	
  1540		MOVQ acc5, mul0
  1541		MULQ acc7
  1542		ADDQ hlp, t0
  1543		ADCQ $0, mul1
  1544		ADDQ mul0, t0
  1545		ADCQ $0, mul1
  1546		MOVQ mul1, t1
  1547	
      		// a2*a3
  1548		MOVQ acc6, mul0
  1549		MULQ acc7
  1550		ADDQ mul0, t1
  1551		ADCQ $0, mul1
  1552		MOVQ mul1, t2
  1553		XORQ t3, t3
  1554		// *2
  1555		ADDQ acc1, acc1
  1556		ADCQ acc2, acc2
  1557		ADCQ acc3, acc3
  1558		ADCQ t0, t0
  1559		ADCQ t1, t1
  1560		ADCQ t2, t2
  1561		ADCQ $0, t3
  1562		// Missing products
      		// These are the diagonal squares a_i^2. acc4 is reused as a temporary for
      		// the high half of each square once a0 has been consumed.
  1563		MOVQ acc4, mul0
  1564		MULQ mul0
  1565		MOVQ mul0, acc0
  1566		MOVQ DX, acc4
  1567	
  1568		MOVQ acc5, mul0
  1569		MULQ mul0
  1570		ADDQ acc4, acc1
  1571		ADCQ mul0, acc2
  1572		ADCQ $0, DX
  1573		MOVQ DX, acc4
  1574	
  1575		MOVQ acc6, mul0
  1576		MULQ mul0
  1577		ADDQ acc4, acc3
  1578		ADCQ mul0, t0
  1579		ADCQ $0, DX
  1580		MOVQ DX, acc4
  1581	
  1582		MOVQ acc7, mul0
  1583		MULQ mul0
  1584		ADDQ acc4, t1
  1585		ADCQ mul0, t2
  1586		ADCQ DX, t3
  1587		// First reduction step
      		// acc0*p = acc0*2^96 + acc0*p256const1*2^192 - acc0. The "- acc0" cancels
      		// limb 0; (acc0<<32, acc0>>32) is the 2^96 term; mul1:mul0 the 2^192 term.
  1588		MOVQ acc0, mul0
  1589		MOVQ acc0, hlp
  1590		SHLQ $32, acc0
  1591		MULQ p256const1<>(SB)
  1592		SHRQ $32, hlp
  1593		ADDQ acc0, acc1
  1594		ADCQ hlp, acc2
  1595		ADCQ mul0, acc3
  1596		ADCQ $0, mul1
  1597		MOVQ mul1, acc0	// acc0 is recycled as the limb above acc3
  1598		// Second reduction step
  1599		MOVQ acc1, mul0
  1600		MOVQ acc1, hlp
  1601		SHLQ $32, acc1
  1602		MULQ p256const1<>(SB)
  1603		SHRQ $32, hlp
  1604		ADDQ acc1, acc2
  1605		ADCQ hlp, acc3
  1606		ADCQ mul0, acc0
  1607		ADCQ $0, mul1
  1608		MOVQ mul1, acc1
  1609		// Third reduction step
  1610		MOVQ acc2, mul0
  1611		MOVQ acc2, hlp
  1612		SHLQ $32, acc2
  1613		MULQ p256const1<>(SB)
  1614		SHRQ $32, hlp
  1615		ADDQ acc2, acc3
  1616		ADCQ hlp, acc0
  1617		ADCQ mul0, acc1
  1618		ADCQ $0, mul1
  1619		MOVQ mul1, acc2
  1620		// Last reduction step
  1621		MOVQ acc3, mul0
  1622		MOVQ acc3, hlp
  1623		SHLQ $32, acc3
  1624		MULQ p256const1<>(SB)
  1625		SHRQ $32, hlp
  1626		ADDQ acc3, acc0
  1627		ADCQ hlp, acc1
  1628		ADCQ mul0, acc2
  1629		ADCQ $0, mul1
  1630		MOVQ mul1, acc3
      		// The MOV below is emitted as raw bytes. MOV does not modify flags, so
      		// the ADCQ chain that follows starts from CF as left by the ADCQ above.
      		// NOTE(review): that carry is presumably always clear here - confirm.
  1631		BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00   // MOVQ $0, BP
  1632		// Add bits [511:256] of the result
  1633		ADCQ acc0, t0
  1634		ADCQ acc1, t1
  1635		ADCQ acc2, t2
  1636		ADCQ acc3, t3
  1637		ADCQ $0, hlp	// hlp = carry out of the 256-bit addition
  1638		// Copy result
  1639		MOVQ t0, acc4
  1640		MOVQ t1, acc5
  1641		MOVQ t2, acc6
  1642		MOVQ t3, acc7
  1643		// Subtract p256
  1644		SUBQ $-1, acc4
  1645		SBBQ p256const0<>(SB) ,acc5
  1646		SBBQ $0, acc6
  1647		SBBQ p256const1<>(SB), acc7
  1648		SBBQ $0, hlp	// CF is set when the value (including the hlp carry) is below p
  1649		// If the result of the subtraction is negative, restore the previous result
  1650		CMOVQCS t0, acc4
  1651		CMOVQCS t1, acc5
  1652		CMOVQCS t2, acc6
  1653		CMOVQCS t3, acc7
  1654	
  1655		RET
  1656	/* ---------------------------------------*/
      	// p256MulBy2Inline doubles [acc4..acc7] and leaves the reduced result in
      	// [t0..t3] (not in acc). acc4..acc7 end up holding the doubled value before
      	// the subtraction of p, with the bit shifted out going into mul0. p is then
      	// subtracted in t0..t3, and the CMOVQCS group restores the unsubtracted
      	// value when that subtraction (including the mul0 carry) borrows.
      	// Writes: acc4..acc7, t0..t3, mul0, flags.
  1657	#define p256MulBy2Inline\
  1658		XORQ mul0, mul0;\
  1659		ADDQ acc4, acc4;\
  1660		ADCQ acc5, acc5;\
  1661		ADCQ acc6, acc6;\
  1662		ADCQ acc7, acc7;\
  1663		ADCQ $0, mul0;\
  1664		MOVQ acc4, t0;\
  1665		MOVQ acc5, t1;\
  1666		MOVQ acc6, t2;\
  1667		MOVQ acc7, t3;\
  1668		SUBQ $-1, t0;\
  1669		SBBQ p256const0<>(SB), t1;\
  1670		SBBQ $0, t2;\
  1671		SBBQ p256const1<>(SB), t3;\
  1672		SBBQ $0, mul0;\
  1673		CMOVQCS acc4, t0;\
  1674		CMOVQCS acc5, t1;\
  1675		CMOVQCS acc6, t2;\
  1676		CMOVQCS acc7, t3;
  1677	/* ---------------------------------------*/
      	// p256AddInline adds [t0..t3] to [acc4..acc7] and leaves the reduced result
      	// in [t0..t3] (not in acc). acc4..acc7 end up holding the raw 256-bit sum,
      	// with the carry out going into mul0. p is then subtracted in t0..t3, and
      	// the CMOVQCS group restores the raw sum when that subtraction (including
      	// the mul0 carry) borrows.
      	// Writes: acc4..acc7, t0..t3, mul0, flags.
  1678	#define p256AddInline \
  1679		XORQ mul0, mul0;\
  1680		ADDQ t0, acc4;\
  1681		ADCQ t1, acc5;\
  1682		ADCQ t2, acc6;\
  1683		ADCQ t3, acc7;\
  1684		ADCQ $0, mul0;\
  1685		MOVQ acc4, t0;\
  1686		MOVQ acc5, t1;\
  1687		MOVQ acc6, t2;\
  1688		MOVQ acc7, t3;\
  1689		SUBQ $-1, t0;\
  1690		SBBQ p256const0<>(SB), t1;\
  1691		SBBQ $0, t2;\
  1692		SBBQ p256const1<>(SB), t3;\
  1693		SBBQ $0, mul0;\
  1694		CMOVQCS acc4, t0;\
  1695		CMOVQCS acc5, t1;\
  1696		CMOVQCS acc6, t2;\
  1697		CMOVQCS acc7, t3;
  1698	/* ---------------------------------------*/
      	// Four-limb move helpers. The src/dst argument is itself a macro taking a
      	// byte offset (for example one of the stack-slot macros defined below).
      	//   LDacc(src): load src into acc4..acc7    LDt(src): load src into t0..t3
      	//   ST(dst):    store acc4..acc7 to dst     STt(dst): store t0..t3 to dst
      	//   acc2t:      copy acc4..acc7 to t0..t3   t2acc:    copy t0..t3 to acc4..acc7
  1699	#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1700	#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1701	#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1702	#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1703	#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1704	#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1705	/* ---------------------------------------*/
      	// Stack frame layout for p256PointAddAffineAsm: fifteen 32-byte slots
      	// (one 256-bit value each) addressed off SP, followed by an 8-byte slot for
      	// the result pointer and two 4-byte slots for the saved sel and zero
      	// arguments. The function below declares a 512-byte frame.
  1706	#define x1in(off) (32*0 + off)(SP)
  1707	#define y1in(off) (32*1 + off)(SP)
  1708	#define z1in(off) (32*2 + off)(SP)
  1709	#define x2in(off) (32*3 + off)(SP)
  1710	#define y2in(off) (32*4 + off)(SP)
  1711	#define xout(off) (32*5 + off)(SP)
  1712	#define yout(off) (32*6 + off)(SP)
  1713	#define zout(off) (32*7 + off)(SP)
  1714	#define s2(off)   (32*8 + off)(SP)
  1715	#define z1sqr(off) (32*9 + off)(SP)
  1716	#define h(off)	  (32*10 + off)(SP)
  1717	#define r(off)	  (32*11 + off)(SP)
  1718	#define hsqr(off) (32*12 + off)(SP)
  1719	#define rsqr(off) (32*13 + off)(SP)
  1720	#define hcub(off) (32*14 + off)(SP)
  1721	#define rptr	  (32*15)(SP)
  1722	#define sel_save  (32*15 + 8)(SP)
  1723	#define zero_save (32*15 + 8 + 4)(SP)
  1724	
  1725	// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
      	//
      	// Adds the point at in1 (x, y, z: 12 words are read) and the point at in2
      	// (x, y: 8 words are read) and writes 12 words (x, y, z) to res.
      	//   sign != 0: y2 is replaced by p - y2 before the addition.
      	//   sel == 0:  the computed sum is discarded and in1 is written instead.
      	//   zero == 0: (x2, y2 after the sign step, p256one) is written instead;
      	//              this selection is applied last and so takes precedence.
      	// Only the low 32 bits of sel and zero are saved and examined. The sign,
      	// sel and zero choices are made with CMOV and SSE masks, not branches.
  1726	TEXT ·p256PointAddAffineAsm(SB),0,$512-96
  1727		// Move input to stack in order to free registers
  1728		MOVQ res+0(FP), AX
  1729		MOVQ in1+24(FP), BX
  1730		MOVQ in2+48(FP), CX
  1731		MOVQ sign+72(FP), DX
  1732		MOVQ sel+80(FP), t1
  1733		MOVQ zero+88(FP), t2
  1734	
  1735		MOVOU (16*0)(BX), X0
  1736		MOVOU (16*1)(BX), X1
  1737		MOVOU (16*2)(BX), X2
  1738		MOVOU (16*3)(BX), X3
  1739		MOVOU (16*4)(BX), X4
  1740		MOVOU (16*5)(BX), X5
  1741	
  1742		MOVOU X0, x1in(16*0)
  1743		MOVOU X1, x1in(16*1)
  1744		MOVOU X2, y1in(16*0)
  1745		MOVOU X3, y1in(16*1)
  1746		MOVOU X4, z1in(16*0)
  1747		MOVOU X5, z1in(16*1)
  1748	
  1749		MOVOU (16*0)(CX), X0
  1750		MOVOU (16*1)(CX), X1
  1751	
  1752		MOVOU X0, x2in(16*0)
  1753		MOVOU X1, x2in(16*1)
  1754		// Store pointer to result
  1755		MOVQ mul0, rptr	// mul0 is AX, which still holds res
  1756		MOVL t1, sel_save
  1757		MOVL t2, zero_save
  1758		// Negate y2in based on sign
      		// y2 is loaded through CX before acc1 (an alias of CX) is overwritten.
  1759		MOVQ (16*2 + 8*0)(CX), acc4
  1760		MOVQ (16*2 + 8*1)(CX), acc5
  1761		MOVQ (16*2 + 8*2)(CX), acc6
  1762		MOVQ (16*2 + 8*3)(CX), acc7
      		// acc0..acc3 = p
  1763		MOVQ $-1, acc0
  1764		MOVQ p256const0<>(SB), acc1
  1765		MOVQ $0, acc2
  1766		MOVQ p256const1<>(SB), acc3
  1767		XORQ mul0, mul0
  1768		// Speculatively subtract
  1769		SUBQ acc4, acc0
  1770		SBBQ acc5, acc1
  1771		SBBQ acc6, acc2
  1772		SBBQ acc7, acc3
  1773		SBBQ $0, mul0
  1774		MOVQ acc0, t0
  1775		MOVQ acc1, t1
  1776		MOVQ acc2, t2
  1777		MOVQ acc3, t3
  1778		// Add in case the operand was > p256
  1779		ADDQ $-1, acc0
  1780		ADCQ p256const0<>(SB), acc1
  1781		ADCQ $0, acc2
  1782		ADCQ p256const1<>(SB), acc3
  1783		ADCQ $0, mul0	// mul0 = (carry of this addition) - (borrow of the subtraction)
      		// When mul0 is nonzero, keep the plain p - y2 saved in t0..t3.
  1784		CMOVQNE t0, acc0
  1785		CMOVQNE t1, acc1
  1786		CMOVQNE t2, acc2
  1787		CMOVQNE t3, acc3
  1788		// If condition is 0, keep original value
  1789		TESTQ DX, DX
  1790		CMOVQEQ acc4, acc0
  1791		CMOVQEQ acc5, acc1
  1792		CMOVQEQ acc6, acc2
  1793		CMOVQEQ acc7, acc3
  1794		// Store result
  1795		MOVQ acc0, y2in(8*0)
  1796		MOVQ acc1, y2in(8*1)
  1797		MOVQ acc2, y2in(8*2)
  1798		MOVQ acc3, y2in(8*3)
  1799		// Begin point add
      		// Throughout, the internal routines take acc4..acc7 and t0..t3 as
      		// operands and return in acc4..acc7; a CALL that is not preceded by an
      		// LDacc operates on the previous result.
  1800		LDacc (z1in)
  1801		CALL p256SqrInternal(SB)	// z1ˆ2
  1802		ST (z1sqr)
  1803	
  1804		LDt (x2in)
  1805		CALL p256MulInternal(SB)	// x2 * z1ˆ2
  1806	
  1807		LDt (x1in)
  1808		CALL p256SubInternal(SB)	// h = u2 - u1
  1809		ST (h)
  1810	
  1811		LDt (z1in)
  1812		CALL p256MulInternal(SB)	// z3 = h * z1
  1813		ST (zout)
  1814	
      		// t0..t3 still hold z1 from the LDt above.
  1815		LDacc (z1sqr)
  1816		CALL p256MulInternal(SB)	// z1ˆ3
  1817	
  1818		LDt (y2in)
  1819		CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
  1820		ST (s2)
  1821	
  1822		LDt (y1in)
  1823		CALL p256SubInternal(SB)	// r = s2 - s1
  1824		ST (r)
  1825	
  1826		CALL p256SqrInternal(SB)	// rsqr = rˆ2
  1827		ST (rsqr)
  1828	
  1829		LDacc (h)
  1830		CALL p256SqrInternal(SB)	// hsqr = hˆ2
  1831		ST (hsqr)
  1832	
  1833		LDt (h)
  1834		CALL p256MulInternal(SB)	// hcub = hˆ3
  1835		ST (hcub)
  1836	
  1837		LDt (y1in)
  1838		CALL p256MulInternal(SB)	// y1 * hˆ3
      		// The s2 slot is reused for y1 * hˆ3.
  1839		ST (s2)
  1840	
  1841		LDacc (x1in)
  1842		LDt (hsqr)
  1843		CALL p256MulInternal(SB)	// u1 * hˆ2
      		// The h slot is reused for u1 * hˆ2.
  1844		ST (h)
  1845	
  1846		p256MulBy2Inline			// u1 * hˆ2 * 2, inline
  1847		LDacc (rsqr)
  1848		CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  1849	
  1850		LDt (hcub)
  1851		CALL p256SubInternal(SB)	// x3 = rˆ2 - u1 * hˆ2 * 2 - hˆ3
  1852		ST (xout)
  1853	
      		// t = x3
  1854		MOVQ acc4, t0
  1855		MOVQ acc5, t1
  1856		MOVQ acc6, t2
  1857		MOVQ acc7, t3
  1858		LDacc (h)
  1859		CALL p256SubInternal(SB)	// u1 * hˆ2 - x3
  1860	
  1861		LDt (r)
  1862		CALL p256MulInternal(SB)	// r * (u1 * hˆ2 - x3)
  1863	
  1864		LDt (s2)
  1865		CALL p256SubInternal(SB)	// y3 = r * (u1 * hˆ2 - x3) - y1 * hˆ3
  1866		ST (yout)
  1867		// Load stored values from stack
  1868		MOVQ rptr, AX
  1869		MOVL sel_save, BX
  1870		MOVL zero_save, CX
  1871		// The result is not valid if (sel == 0), conditional choose
  1872		MOVOU xout(16*0), X0
  1873		MOVOU xout(16*1), X1
  1874		MOVOU yout(16*0), X2
  1875		MOVOU yout(16*1), X3
  1876		MOVOU zout(16*0), X4
  1877		MOVOU zout(16*1), X5
  1878	
  1879		MOVL BX, X6
  1880		MOVL CX, X7
  1881	
  1882		PXOR X8, X8	// X8 = all zeros
  1883		PCMPEQL X9, X9	// X9 = all ones
  1884	
      		// Broadcast the low dword of sel / zero to all four lanes.
  1885		PSHUFD $0, X6, X6
  1886		PSHUFD $0, X7, X7
  1887	
      		// X6 = all ones if sel == 0, else zeros; X7 likewise for zero.
  1888		PCMPEQL X8, X6
  1889		PCMPEQL X8, X7
  1890	
      		// X15 = NOT X6
  1891		MOVOU X6, X15
  1892		PANDN X9, X15
  1893	
  1894		MOVOU x1in(16*0), X9
  1895		MOVOU x1in(16*1), X10
  1896		MOVOU y1in(16*0), X11
  1897		MOVOU y1in(16*1), X12
  1898		MOVOU z1in(16*0), X13
  1899		MOVOU z1in(16*1), X14
  1900	
      		// result = (sum AND NOT mask) XOR (in1 AND mask)
  1901		PAND X15, X0
  1902		PAND X15, X1
  1903		PAND X15, X2
  1904		PAND X15, X3
  1905		PAND X15, X4
  1906		PAND X15, X5
  1907	
  1908		PAND X6, X9
  1909		PAND X6, X10
  1910		PAND X6, X11
  1911		PAND X6, X12
  1912		PAND X6, X13
  1913		PAND X6, X14
  1914	
  1915		PXOR X9, X0
  1916		PXOR X10, X1
  1917		PXOR X11, X2
  1918		PXOR X12, X3
  1919		PXOR X13, X4
  1920		PXOR X14, X5
  1921		// Similarly if zero == 0
  1922		PCMPEQL X9, X9
  1923		MOVOU X7, X15
  1924		PANDN X9, X15
  1925	
      		// The alternative here is (x2, y2 as stored after the sign step, p256one).
  1926		MOVOU x2in(16*0), X9
  1927		MOVOU x2in(16*1), X10
  1928		MOVOU y2in(16*0), X11
  1929		MOVOU y2in(16*1), X12
  1930		MOVOU p256one<>+0x00(SB), X13
  1931		MOVOU p256one<>+0x10(SB), X14
  1932	
  1933		PAND X15, X0
  1934		PAND X15, X1
  1935		PAND X15, X2
  1936		PAND X15, X3
  1937		PAND X15, X4
  1938		PAND X15, X5
  1939	
  1940		PAND X7, X9
  1941		PAND X7, X10
  1942		PAND X7, X11
  1943		PAND X7, X12
  1944		PAND X7, X13
  1945		PAND X7, X14
  1946	
  1947		PXOR X9, X0
  1948		PXOR X10, X1
  1949		PXOR X11, X2
  1950		PXOR X12, X3
  1951		PXOR X13, X4
  1952		PXOR X14, X5
  1953		// Finally output the result
  1954		MOVOU X0, (16*0)(AX)
  1955		MOVOU X1, (16*1)(AX)
  1956		MOVOU X2, (16*2)(AX)
  1957		MOVOU X3, (16*3)(AX)
  1958		MOVOU X4, (16*4)(AX)
  1959		MOVOU X5, (16*5)(AX)
      		// Clear the saved result pointer in the frame.
      		// NOTE(review): presumably so no stale pointer is left on the stack - confirm.
  1960		MOVQ $0, rptr
  1961	
  1962		RET
      	// Release the frame-slot names so they can be redefined below.
  1963	#undef x1in
  1964	#undef y1in
  1965	#undef z1in
  1966	#undef x2in
  1967	#undef y2in
  1968	#undef xout
  1969	#undef yout
  1970	#undef zout
  1971	#undef s2
  1972	#undef z1sqr
  1973	#undef h
  1974	#undef r
  1975	#undef hsqr
  1976	#undef rsqr
  1977	#undef hcub
  1978	#undef rptr
  1979	#undef sel_save
  1980	#undef zero_save
  1981	
  1982	// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  1983	// otherwise. It writes to [acc4..acc7], t0 and t1.
      	// Two encodings count as zero: all limbs zero, and the value equal to P
      	// (limbs -1, p256const0, 0, p256const1). The result is built with CMOV;
      	// there are no branches. Flags are modified as well.
  1984	TEXT p256IsZero(SB),NOSPLIT,$0
  1985		// AX contains a flag that is set if the input is zero.
  1986		XORQ AX, AX
  1987		MOVQ $1, t1
  1988	
  1989		// Check whether [acc4..acc7] are all zero.
  1990		MOVQ acc4, t0
  1991		ORQ acc5, t0
  1992		ORQ acc6, t0
  1993		ORQ acc7, t0
  1994	
  1995		// Set the zero flag if so. (CMOV of a constant to a register doesn't
  1996		// appear to be supported in Go. Thus t1 = 1.)
  1997		CMOVQEQ t1, AX
  1998	
  1999		// XOR [acc4..acc7] with P and compare with zero again.
      		// acc6 needs no XOR because the corresponding limb of P is zero.
  2000		XORQ $-1, acc4
  2001		XORQ p256const0<>(SB), acc5
  2002		XORQ p256const1<>(SB), acc7
  2003		ORQ acc5, acc4
  2004		ORQ acc6, acc4
  2005		ORQ acc7, acc4
  2006	
  2007		// Set the zero flag if so.
  2008		CMOVQEQ t1, AX
  2009		RET
  2010	
  2011	/* ---------------------------------------*/
      	// Stack frame layout for p256PointAddAsm: twenty 32-byte slots (one 256-bit
      	// value each) addressed off SP, followed by 8-byte slots for the result
      	// pointer and the points_eq flag. The function below declares a 680-byte
      	// frame.
  2012	#define x1in(off) (32*0 + off)(SP)
  2013	#define y1in(off) (32*1 + off)(SP)
  2014	#define z1in(off) (32*2 + off)(SP)
  2015	#define x2in(off) (32*3 + off)(SP)
  2016	#define y2in(off) (32*4 + off)(SP)
  2017	#define z2in(off) (32*5 + off)(SP)
  2018	
  2019	#define xout(off) (32*6 + off)(SP)
  2020	#define yout(off) (32*7 + off)(SP)
  2021	#define zout(off) (32*8 + off)(SP)
  2022	
  2023	#define u1(off)    (32*9 + off)(SP)
  2024	#define u2(off)    (32*10 + off)(SP)
  2025	#define s1(off)    (32*11 + off)(SP)
  2026	#define s2(off)    (32*12 + off)(SP)
  2027	#define z1sqr(off) (32*13 + off)(SP)
  2028	#define z2sqr(off) (32*14 + off)(SP)
  2029	#define h(off)     (32*15 + off)(SP)
  2030	#define r(off)     (32*16 + off)(SP)
  2031	#define hsqr(off)  (32*17 + off)(SP)
  2032	#define rsqr(off)  (32*18 + off)(SP)
  2033	#define hcub(off)  (32*19 + off)(SP)
  2034	#define rptr       (32*20)(SP)
  2035	#define points_eq  (32*20+8)(SP)
  2036	
  2037	//func p256PointAddAsm(res, in1, in2 []uint64) int
      	//
      	// Adds the points at in1 and in2 (x, y, z: 12 words are read from each) and
      	// writes 12 words (x, y, z) to res. The return value is 1 when both
      	// r = s2 - s1 and h = u2 - u1 are zero according to p256IsZero, and 0
      	// otherwise; the computed sum is written to res in either case.
      	// NOTE(review): a return of 1 presumably tells the caller that the inputs
      	// were the same point and the result must be obtained by doubling - confirm
      	// against the Go caller.
  2038	TEXT ·p256PointAddAsm(SB),0,$680-80
  2039		// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  2040		// Move input to stack in order to free registers
  2041		MOVQ res+0(FP), AX
  2042		MOVQ in1+24(FP), BX
  2043		MOVQ in2+48(FP), CX
  2044	
  2045		MOVOU (16*0)(BX), X0
  2046		MOVOU (16*1)(BX), X1
  2047		MOVOU (16*2)(BX), X2
  2048		MOVOU (16*3)(BX), X3
  2049		MOVOU (16*4)(BX), X4
  2050		MOVOU (16*5)(BX), X5
  2051	
  2052		MOVOU X0, x1in(16*0)
  2053		MOVOU X1, x1in(16*1)
  2054		MOVOU X2, y1in(16*0)
  2055		MOVOU X3, y1in(16*1)
  2056		MOVOU X4, z1in(16*0)
  2057		MOVOU X5, z1in(16*1)
  2058	
  2059		MOVOU (16*0)(CX), X0
  2060		MOVOU (16*1)(CX), X1
  2061		MOVOU (16*2)(CX), X2
  2062		MOVOU (16*3)(CX), X3
  2063		MOVOU (16*4)(CX), X4
  2064		MOVOU (16*5)(CX), X5
  2065	
  2066		MOVOU X0, x2in(16*0)
  2067		MOVOU X1, x2in(16*1)
  2068		MOVOU X2, y2in(16*0)
  2069		MOVOU X3, y2in(16*1)
  2070		MOVOU X4, z2in(16*0)
  2071		MOVOU X5, z2in(16*1)
  2072		// Store pointer to result
  2073		MOVQ AX, rptr
  2074		// Begin point add
      		// Throughout, the internal routines take acc4..acc7 and t0..t3 as
      		// operands and return in acc4..acc7; a CALL that is not preceded by an
      		// LDacc operates on the previous result.
  2075		LDacc (z2in)
  2076		CALL p256SqrInternal(SB)	// z2ˆ2
  2077		ST (z2sqr)
  2078		LDt (z2in)
  2079		CALL p256MulInternal(SB)	// z2ˆ3
  2080		LDt (y1in)
  2081		CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
  2082		ST (s1)
  2083	
  2084		LDacc (z1in)
  2085		CALL p256SqrInternal(SB)	// z1ˆ2
  2086		ST (z1sqr)
  2087		LDt (z1in)
  2088		CALL p256MulInternal(SB)	// z1ˆ3
  2089		LDt (y2in)
  2090		CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
  2091		ST (s2)
  2092	
  2093		LDt (s1)
  2094		CALL p256SubInternal(SB)	// r = s2 - s1
  2095		ST (r)
      		// points_eq = (r is zero). p256IsZero overwrites acc4..acc7, which is
      		// fine because r has already been stored.
  2096		CALL p256IsZero(SB)
  2097		MOVQ AX, points_eq
  2098	
  2099		LDacc (z2sqr)
  2100		LDt (x1in)
  2101		CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
  2102		ST (u1)
  2103		LDacc (z1sqr)
  2104		LDt (x2in)
  2105		CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
  2106		ST (u2)
  2107	
  2108		LDt (u1)
  2109		CALL p256SubInternal(SB)	// h = u2 - u1
  2110		ST (h)
      		// points_eq = (r is zero) AND (h is zero)
  2111		CALL p256IsZero(SB)
  2112		ANDQ points_eq, AX
  2113		MOVQ AX, points_eq
  2114	
  2115		LDacc (r)
  2116		CALL p256SqrInternal(SB)	// rsqr = rˆ2
  2117		ST (rsqr)
  2118	
  2119		LDacc (h)
  2120		CALL p256SqrInternal(SB)	// hsqr = hˆ2
  2121		ST (hsqr)
  2122	
  2123		LDt (h)
  2124		CALL p256MulInternal(SB)	// hcub = hˆ3
  2125		ST (hcub)
  2126	
  2127		LDt (s1)
  2128		CALL p256MulInternal(SB)	// s1 * hˆ3
      		// The s2 slot is reused for s1 * hˆ3.
  2129		ST (s2)
  2130	
  2131		LDacc (z1in)
  2132		LDt (z2in)
  2133		CALL p256MulInternal(SB)	// z1 * z2
  2134		LDt (h)
  2135		CALL p256MulInternal(SB)	// z1 * z2 * h
  2136		ST (zout)
  2137	
  2138		LDacc (hsqr)
  2139		LDt (u1)
  2140		CALL p256MulInternal(SB)	// hˆ2 * u1
      		// The u2 slot is reused for u1 * hˆ2.
  2141		ST (u2)
  2142	
  2143		p256MulBy2Inline	// u1 * hˆ2 * 2, inline
  2144		LDacc (rsqr)
  2145		CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  2146	
  2147		LDt (hcub)
  2148		CALL p256SubInternal(SB)	// x3 = rˆ2 - u1 * hˆ2 * 2 - hˆ3
  2149		ST (xout)
  2150	
      		// t = x3
  2151		MOVQ acc4, t0
  2152		MOVQ acc5, t1
  2153		MOVQ acc6, t2
  2154		MOVQ acc7, t3
  2155		LDacc (u2)
  2156		CALL p256SubInternal(SB)	// u1 * hˆ2 - x3
  2157	
  2158		LDt (r)
  2159		CALL p256MulInternal(SB)	// r * (u1 * hˆ2 - x3)
  2160	
  2161		LDt (s2)
  2162		CALL p256SubInternal(SB)	// y3 = r * (u1 * hˆ2 - x3) - s1 * hˆ3
  2163		ST (yout)
  2164	
  2165		MOVOU xout(16*0), X0
  2166		MOVOU xout(16*1), X1
  2167		MOVOU yout(16*0), X2
  2168		MOVOU yout(16*1), X3
  2169		MOVOU zout(16*0), X4
  2170		MOVOU zout(16*1), X5
  2171		// Finally output the result
  2172		MOVQ rptr, AX
      		// Clear the saved result pointer in the frame.
      		// NOTE(review): presumably so no stale pointer is left on the stack - confirm.
  2173		MOVQ $0, rptr
  2174		MOVOU X0, (16*0)(AX)
  2175		MOVOU X1, (16*1)(AX)
  2176		MOVOU X2, (16*2)(AX)
  2177		MOVOU X3, (16*3)(AX)
  2178		MOVOU X4, (16*4)(AX)
  2179		MOVOU X5, (16*5)(AX)
  2180	
  2181		MOVQ points_eq, AX
  2182		MOVQ AX, ret+72(FP)
  2183	
  2184		RET
      	// Release the frame-slot names so they can be redefined below.
      	// (points_eq is not in this list and stays defined.)
  2185	#undef x1in
  2186	#undef y1in
  2187	#undef z1in
  2188	#undef x2in
  2189	#undef y2in
  2190	#undef z2in
  2191	#undef xout
  2192	#undef yout
  2193	#undef zout
  2194	#undef s1
  2195	#undef s2
  2196	#undef u1
  2197	#undef u2
  2198	#undef z1sqr
  2199	#undef z2sqr
  2200	#undef h
  2201	#undef r
  2202	#undef hsqr
  2203	#undef rsqr
  2204	#undef hcub
  2205	#undef rptr
  2206	/* ---------------------------------------*/
      	// Stack frame layout for p256PointDoubleAsm: seven 32-byte slots (one
      	// 256-bit value each) addressed off SP, followed by an 8-byte slot for the
      	// result pointer. The function below declares a 256-byte frame.
  2207	#define x(off) (32*0 + off)(SP)
  2208	#define y(off) (32*1 + off)(SP)
  2209	#define z(off) (32*2 + off)(SP)
  2210	
  2211	#define s(off)	(32*3 + off)(SP)
  2212	#define m(off)	(32*4 + off)(SP)
  2213	#define zsqr(off) (32*5 + off)(SP)
  2214	#define tmp(off)  (32*6 + off)(SP)
  2215	#define rptr	  (32*7)(SP)
  2216	
  2217	//func p256PointDoubleAsm(res, in []uint64)
      	//
      	// Doubles the point at in (x, y, z: 12 words are read) and writes 12 words
      	// (x, y, z) to res. The input is copied to the stack first; the z, x and y
      	// outputs are stored to res as soon as each becomes available, in that
      	// order. With the stack slots named as in the macros above, the values
      	// computed are:
      	//   z3 = 2*y*z
      	//   m  = 3*(x - zˆ2)*(x + zˆ2)
      	//   s  = 4*x*yˆ2
      	//   x3 = mˆ2 - 2*s
      	//   y3 = m*(s - x3) - 8*yˆ4
  2218	TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
  2219		// Move input to stack in order to free registers
  2220		MOVQ res+0(FP), AX
  2221		MOVQ in+24(FP), BX
  2222	
  2223		MOVOU (16*0)(BX), X0
  2224		MOVOU (16*1)(BX), X1
  2225		MOVOU (16*2)(BX), X2
  2226		MOVOU (16*3)(BX), X3
  2227		MOVOU (16*4)(BX), X4
  2228		MOVOU (16*5)(BX), X5
  2229	
  2230		MOVOU X0, x(16*0)
  2231		MOVOU X1, x(16*1)
  2232		MOVOU X2, y(16*0)
  2233		MOVOU X3, y(16*1)
  2234		MOVOU X4, z(16*0)
  2235		MOVOU X5, z(16*1)
  2236		// Store pointer to result
  2237		MOVQ AX, rptr
  2238		// Begin point double
  2239		LDacc (z)
  2240		CALL p256SqrInternal(SB)	// zˆ2
  2241		ST (zsqr)
  2242	
      		// m = x + zˆ2 for now (the inline add leaves its result in t0..t3)
  2243		LDt (x)
  2244		p256AddInline
  2245		STt (m)
  2246	
  2247		LDacc (z)
  2248		LDt (y)
  2249		CALL p256MulInternal(SB)	// y * z
  2250		p256MulBy2Inline
      		// AX was overwritten by the arithmetic above, so reload the result pointer.
  2251		MOVQ rptr, AX
  2252		// Store z
  2253		MOVQ t0, (16*4 + 8*0)(AX)
  2254		MOVQ t1, (16*4 + 8*1)(AX)
  2255		MOVQ t2, (16*4 + 8*2)(AX)
  2256		MOVQ t3, (16*4 + 8*3)(AX)
  2257	
  2258		LDacc (x)
  2259		LDt (zsqr)
  2260		CALL p256SubInternal(SB)	// x - zˆ2
  2261		LDt (m)
  2262		CALL p256MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
  2263		ST (m)
  2264		// Multiply by 3
      		// t = 2*m from the doubling, then t = 2*m + m from the add.
  2265		p256MulBy2Inline
  2266		LDacc (m)
  2267		p256AddInline
  2268		STt (m)
  2269		////////////////////////
  2270		LDacc (y)
  2271		p256MulBy2Inline
  2272		t2acc
  2273		CALL p256SqrInternal(SB)	// (2*y)ˆ2 = 4*yˆ2
  2274		ST (s)
  2275		CALL p256SqrInternal(SB)	// 16*yˆ4
  2276		// Divide by 2
      		// If the value is odd, p is added first so that the sum is even; mul0
      		// holds the carry out of that addition. The value is then shifted right by
      		// one bit across the limbs.
  2277		XORQ mul0, mul0
  2278		MOVQ acc4, t0
  2279		MOVQ acc5, t1
  2280		MOVQ acc6, t2
  2281		MOVQ acc7, t3
  2282	
  2283		ADDQ $-1, acc4
  2284		ADCQ p256const0<>(SB), acc5
  2285		ADCQ $0, acc6
  2286		ADCQ p256const1<>(SB), acc7
  2287		ADCQ $0, mul0
  2288		TESTQ $1, t0	// ZF is set when the original value is even
  2289	
      		// Even: restore the original value, discarding the "+ p" version.
  2290		CMOVQEQ t0, acc4
  2291		CMOVQEQ t1, acc5
  2292		CMOVQEQ t2, acc6
  2293		CMOVQEQ t3, acc7
  2294		ANDQ t0, mul0	// mul0 is 0 or 1, so this keeps the carry only when t0 is odd
  2295	
      		// Two-register form of SHRQ: each limb is shifted right by one.
      		// NOTE(review): presumably the vacated top bit comes from the second
      		// register (the next limb up, mul0 for the top limb) - confirm against the
      		// assembler's operand convention for this syntax.
  2296		SHRQ $1, acc4:acc5
  2297		SHRQ $1, acc5:acc6
  2298		SHRQ $1, acc6:acc7
  2299		SHRQ $1, acc7:mul0
      		// The y slot is reused for 8*yˆ4.
  2300		ST (y)
  2301		/////////////////////////
  2302		LDacc (x)
  2303		LDt (s)
  2304		CALL p256MulInternal(SB)	// s = x * 4*yˆ2
  2305		ST (s)
  2306		p256MulBy2Inline
      		// tmp = 2*s
  2307		STt (tmp)
  2308	
  2309		LDacc (m)
  2310		CALL p256SqrInternal(SB)	// mˆ2
  2311		LDt (tmp)
  2312		CALL p256SubInternal(SB)	// x3 = mˆ2 - 2*s
  2313	
  2314		MOVQ rptr, AX
  2315		// Store x
  2316		MOVQ acc4, (16*0 + 8*0)(AX)
  2317		MOVQ acc5, (16*0 + 8*1)(AX)
  2318		MOVQ acc6, (16*0 + 8*2)(AX)
  2319		MOVQ acc7, (16*0 + 8*3)(AX)
  2320	
  2321		acc2t
  2322		LDacc (s)
  2323		CALL p256SubInternal(SB)	// s - x3
  2324	
  2325		LDt (m)
  2326		CALL p256MulInternal(SB)	// m * (s - x3)
  2327	
  2328		LDt (y)
  2329		CALL p256SubInternal(SB)	// y3 = m * (s - x3) - 8*yˆ4
  2330		MOVQ rptr, AX
  2331		// Store y
  2332		MOVQ acc4, (16*2 + 8*0)(AX)
  2333		MOVQ acc5, (16*2 + 8*1)(AX)
  2334		MOVQ acc6, (16*2 + 8*2)(AX)
  2335		MOVQ acc7, (16*2 + 8*3)(AX)
  2336		///////////////////////
      		// Clear the saved result pointer in the frame.
      		// NOTE(review): presumably so no stale pointer is left on the stack - confirm.
  2337		MOVQ $0, rptr
  2338	
  2339		RET
  2340	/* ---------------------------------------*/
  2341	

View as plain text