...
Run Format

Text file src/crypto/elliptic/p256_asm_amd64.s

Documentation: crypto/elliptic

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file contains a constant-time, 64-bit assembly implementation of
     6	// P256. The optimizations performed here are described in detail in:
     7	// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8	//                          256-bit primes"
     9	// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10	// https://eprint.iacr.org/2013/816.pdf
    11	
    12	#include "textflag.h"
    13	
    14	#define res_ptr DI // destination pointer for the exported functions below
    15	#define x_ptr SI // first input pointer; some functions later reuse it as a scratch limb
    16	#define y_ptr CX // second input pointer; some functions later reuse it as a scratch limb
    17	
    18	#define acc0 R8 // acc0..acc5: 64-bit accumulator limbs
    19	#define acc1 R9
    20	#define acc2 R10
    21	#define acc3 R11
    22	#define acc4 R12
    23	#define acc5 R13
    24	#define t0 R14 // t0, t1: temporaries (current multiplier word, carries, saved copies)
    25	#define t1 R15
    26	
    27	DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff // limb 1 of the modulus loaded by "acc = poly" and subtracted by "Subtract p256"
    28	DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 // limb 3 of that modulus (limbs 0 and 2 are the immediates -1 and 0)
    29	DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f // per-step factor in the p256Ord* reductions; presumably -1/p256ord mod 2^64 (TODO confirm)
    30	DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 // p256ord: modulus of p256OrdMul/p256OrdSqr, least-significant limb first
    31	DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    32	DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff // limb 2 = 2^64 - 1 (p256OrdSqr exploits this form)
    33	DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 // limb 3 = 2^64 - 2^32 (p256OrdSqr exploits this form)
    34	DATA p256one<>+0x00(SB)/8, $0x0000000000000001 // p256one: equals 2^256 - p256, least-significant limb first; presumably 1 in Montgomery form
    35	DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    36	DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    37	DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    38	GLOBL p256const0<>(SB), 8, $8 // flag 8 is RODATA in textflag.h; the last operand is the symbol size in bytes
    39	GLOBL p256const1<>(SB), 8, $8
    40	GLOBL p256ordK0<>(SB), 8, $8
    41	GLOBL p256ord<>(SB), 8, $32
    42	GLOBL p256one<>(SB), 8, $32
    43	
    44	/* ---------------------------------------*/
    45	// func p256LittleToBig(res []byte, in []uint64) -- byte-reverses 32 bytes from in into res
    46	TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    47		JMP ·p256BigToLittle(SB) // shares the code: both take two slices at the same FP offsets, and the byte reversal is its own inverse
    48	/* ---------------------------------------*/
    49	// func p256BigToLittle(res []uint64, in []byte) -- writes the 32 bytes at in to res in reversed byte order
    50	TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    51		MOVQ res+0(FP), res_ptr // data pointer of res (slice lengths are not checked here)
    52		MOVQ in+24(FP), x_ptr // data pointer of in
    53		// Load all four 64-bit words before any store, so res and in may alias.
    54		MOVQ (8*0)(x_ptr), acc0
    55		MOVQ (8*1)(x_ptr), acc1
    56		MOVQ (8*2)(x_ptr), acc2
    57		MOVQ (8*3)(x_ptr), acc3
    58		// Reverse the bytes inside each word.
    59		BSWAPQ acc0
    60		BSWAPQ acc1
    61		BSWAPQ acc2
    62		BSWAPQ acc3
    63		// Store the words in reverse order; together with BSWAPQ this reverses all 32 bytes.
    64		MOVQ acc3, (8*0)(res_ptr)
    65		MOVQ acc2, (8*1)(res_ptr)
    66		MOVQ acc1, (8*2)(res_ptr)
    67		MOVQ acc0, (8*3)(res_ptr)
    68	
    69		RET
    70	/* ---------------------------------------*/
    71	// func p256MovCond(res, a, b []uint64, cond int)
    72	// If cond == 0 res=b, else res=a (branch-free; copies 96 bytes, i.e. 12 uint64 words)
    73	TEXT ·p256MovCond(SB),NOSPLIT,$0
    74		MOVQ res+0(FP), res_ptr
    75		MOVQ a+24(FP), x_ptr
    76		MOVQ b+48(FP), y_ptr
    77		MOVQ cond+72(FP), X12
    78		// Build the mask: X12 = all ones if the low 32 bits of cond are zero, else all zeros.
    79		PXOR X13, X13 // X13 = 0
    80		PSHUFD $0, X12, X12 // broadcast dword 0 of cond to all four lanes (the upper 32 bits of cond are ignored)
    81		PCMPEQL X13, X12 // per-lane compare with zero
    82		// X0..X5 = a AND NOT mask (PANDN inverts the destination, which holds the mask).
    83		MOVOU X12, X0
    84		PANDN (16*0)(x_ptr), X0
    85		MOVOU X12, X1
    86		PANDN (16*1)(x_ptr), X1
    87		MOVOU X12, X2
    88		PANDN (16*2)(x_ptr), X2
    89		MOVOU X12, X3
    90		PANDN (16*3)(x_ptr), X3
    91		MOVOU X12, X4
    92		PANDN (16*4)(x_ptr), X4
    93		MOVOU X12, X5
    94		PANDN (16*5)(x_ptr), X5
    95		// X6..X11 = b
    96		MOVOU (16*0)(y_ptr), X6
    97		MOVOU (16*1)(y_ptr), X7
    98		MOVOU (16*2)(y_ptr), X8
    99		MOVOU (16*3)(y_ptr), X9
   100		MOVOU (16*4)(y_ptr), X10
   101		MOVOU (16*5)(y_ptr), X11
   102		// X6..X11 = b AND mask
   103		PAND X12, X6
   104		PAND X12, X7
   105		PAND X12, X8
   106		PAND X12, X9
   107		PAND X12, X10
   108		PAND X12, X11
   109		// Merge: in every lane at most one of the two masked values is non-zero, so XOR selects it.
   110		PXOR X6, X0
   111		PXOR X7, X1
   112		PXOR X8, X2
   113		PXOR X9, X3
   114		PXOR X10, X4
   115		PXOR X11, X5
   116		// Store the selected 96 bytes (all loads happened above, so res may alias a or b).
   117		MOVOU X0, (16*0)(res_ptr)
   118		MOVOU X1, (16*1)(res_ptr)
   119		MOVOU X2, (16*2)(res_ptr)
   120		MOVOU X3, (16*3)(res_ptr)
   121		MOVOU X4, (16*4)(res_ptr)
   122		MOVOU X5, (16*5)(res_ptr)
   123	
   124		RET
   125	/* ---------------------------------------*/
   126	// func p256NegCond(val []uint64, cond int) -- in place and branch-free: if cond != 0, val = p256 - val; otherwise val is unchanged
   127	TEXT ·p256NegCond(SB),NOSPLIT,$0
   128		MOVQ val+0(FP), res_ptr
   129		MOVQ cond+24(FP), t0
   130		// acc = poly (the four limbs of the modulus, least-significant first)
   131		MOVQ $-1, acc0
   132		MOVQ p256const0<>(SB), acc1
   133		MOVQ $0, acc2
   134		MOVQ p256const1<>(SB), acc3
   135		// Load the original value (x_ptr and y_ptr are used as plain scratch registers here)
   136		MOVQ (8*0)(res_ptr), acc5
   137		MOVQ (8*1)(res_ptr), x_ptr
   138		MOVQ (8*2)(res_ptr), y_ptr
   139		MOVQ (8*3)(res_ptr), t1
   140		// Speculatively subtract: acc = poly - val (a final borrow is ignored; val == 0 gives poly itself, not 0)
   141		SUBQ acc5, acc0
   142		SBBQ x_ptr, acc1
   143		SBBQ y_ptr, acc2
   144		SBBQ t1, acc3
   145		// If condition is 0, keep original value
   146		TESTQ t0, t0 // tests all 64 bits of cond
   147		CMOVQEQ acc5, acc0
   148		CMOVQEQ x_ptr, acc1
   149		CMOVQEQ y_ptr, acc2
   150		CMOVQEQ t1, acc3
   151		// Store result
   152		MOVQ acc0, (8*0)(res_ptr)
   153		MOVQ acc1, (8*1)(res_ptr)
   154		MOVQ acc2, (8*2)(res_ptr)
   155		MOVQ acc3, (8*3)(res_ptr)
   156	
   157		RET
   158	/* ---------------------------------------*/
   159	// func p256Sqr(res, in []uint64) -- 4-limb squaring followed by four reduction steps and one conditional subtraction of p256
   160	TEXT ·p256Sqr(SB),NOSPLIT,$0
   161		MOVQ res+0(FP), res_ptr
   162		MOVQ in+24(FP), x_ptr
   163		// y[1:] * y[0] (y is the input in): cross products with limb 0, collected in acc1..acc4
   164		MOVQ (8*0)(x_ptr), t0
   165	
   166		MOVQ (8*1)(x_ptr), AX
   167		MULQ t0
   168		MOVQ AX, acc1
   169		MOVQ DX, acc2
   170	
   171		MOVQ (8*2)(x_ptr), AX
   172		MULQ t0
   173		ADDQ AX, acc2
   174		ADCQ $0, DX
   175		MOVQ DX, acc3
   176	
   177		MOVQ (8*3)(x_ptr), AX
   178		MULQ t0
   179		ADDQ AX, acc3
   180		ADCQ $0, DX
   181		MOVQ DX, acc4
   182		// y[2:] * y[1]: added at acc3..acc5
   183		MOVQ (8*1)(x_ptr), t0
   184	
   185		MOVQ (8*2)(x_ptr), AX
   186		MULQ t0
   187		ADDQ AX, acc3
   188		ADCQ $0, DX
   189		MOVQ DX, t1 // carry word into the next limb
   190	
   191		MOVQ (8*3)(x_ptr), AX
   192		MULQ t0
   193		ADDQ t1, acc4
   194		ADCQ $0, DX
   195		ADDQ AX, acc4
   196		ADCQ $0, DX
   197		MOVQ DX, acc5
   198		// y[3] * y[2]: added at acc5, high word goes to y_ptr (used as a limb from here on)
   199		MOVQ (8*2)(x_ptr), t0
   200	
   201		MOVQ (8*3)(x_ptr), AX
   202		MULQ t0
   203		ADDQ AX, acc5
   204		ADCQ $0, DX
   205		MOVQ DX, y_ptr
   206		XORQ t1, t1 // t1 = 0; receives the bit shifted out by the doubling
   207		// *2: double the sum of cross products (acc1..acc5, y_ptr), carry-out into t1
   208		ADDQ acc1, acc1
   209		ADCQ acc2, acc2
   210		ADCQ acc3, acc3
   211		ADCQ acc4, acc4
   212		ADCQ acc5, acc5
   213		ADCQ y_ptr, y_ptr
   214		ADCQ $0, t1
   215		// Missing products: add the squares y[i]*y[i]
   216		MOVQ (8*0)(x_ptr), AX
   217		MULQ AX
   218		MOVQ AX, acc0
   219		MOVQ DX, t0
   220	
   221		MOVQ (8*1)(x_ptr), AX
   222		MULQ AX
   223		ADDQ t0, acc1
   224		ADCQ AX, acc2
   225		ADCQ $0, DX
   226		MOVQ DX, t0
   227	
   228		MOVQ (8*2)(x_ptr), AX
   229		MULQ AX
   230		ADDQ t0, acc3
   231		ADCQ AX, acc4
   232		ADCQ $0, DX
   233		MOVQ DX, t0
   234	
   235		MOVQ (8*3)(x_ptr), AX
   236		MULQ AX
   237		ADDQ t0, acc5
   238		ADCQ AX, y_ptr
   239		ADCQ DX, t1
   240		MOVQ t1, x_ptr // input pointer no longer needed; the 8-limb square is now acc0..acc5, y_ptr, x_ptr
   241		// First reduction step: adds acc0 * p256 to the value and drops the low limb, which cancels to zero
   242		MOVQ acc0, AX
   243		MOVQ acc0, t1
   244		SHLQ $32, acc0 // low half of acc0 * 2^32
   245		MULQ p256const1<>(SB) // DX:AX = acc0 * top limb of p256
   246		SHRQ $32, t1 // high half of acc0 * 2^32
   247		ADDQ acc0, acc1
   248		ADCQ t1, acc2
   249		ADCQ AX, acc3
   250		ADCQ $0, DX
   251		MOVQ DX, acc0 // acc0 becomes the new top limb; the window is now acc1, acc2, acc3, acc0
   252		// Second reduction step (same pattern on acc1; window becomes acc2, acc3, acc0, acc1)
   253		MOVQ acc1, AX
   254		MOVQ acc1, t1
   255		SHLQ $32, acc1
   256		MULQ p256const1<>(SB)
   257		SHRQ $32, t1
   258		ADDQ acc1, acc2
   259		ADCQ t1, acc3
   260		ADCQ AX, acc0
   261		ADCQ $0, DX
   262		MOVQ DX, acc1
   263		// Third reduction step (window becomes acc3, acc0, acc1, acc2)
   264		MOVQ acc2, AX
   265		MOVQ acc2, t1
   266		SHLQ $32, acc2
   267		MULQ p256const1<>(SB)
   268		SHRQ $32, t1
   269		ADDQ acc2, acc3
   270		ADCQ t1, acc0
   271		ADCQ AX, acc1
   272		ADCQ $0, DX
   273		MOVQ DX, acc2
   274		// Last reduction step (window becomes acc0, acc1, acc2, acc3)
   275		XORQ t0, t0 // t0 = 0; will hold the carry out of the final addition
   276		MOVQ acc3, AX
   277		MOVQ acc3, t1
   278		SHLQ $32, acc3
   279		MULQ p256const1<>(SB)
   280		SHRQ $32, t1
   281		ADDQ acc3, acc0
   282		ADCQ t1, acc1
   283		ADCQ AX, acc2
   284		ADCQ $0, DX
   285		MOVQ DX, acc3
   286		// Add bits [511:256] of the sqr result
   287		ADCQ acc4, acc0 // NOTE(review): ADCQ, not ADDQ; relies on CF being clear after the ADCQ $0, DX above (MOVQ leaves flags alone)
   288		ADCQ acc5, acc1
   289		ADCQ y_ptr, acc2
   290		ADCQ x_ptr, acc3
   291		ADCQ $0, t0
   292		// Keep a copy of the unreduced value for the conditional restore below.
   293		MOVQ acc0, acc4
   294		MOVQ acc1, acc5
   295		MOVQ acc2, y_ptr
   296		MOVQ acc3, t1
   297		// Subtract p256 (borrow propagates through the carry limb t0)
   298		SUBQ $-1, acc0
   299		SBBQ p256const0<>(SB) ,acc1
   300		SBBQ $0, acc2
   301		SBBQ p256const1<>(SB), acc3
   302		SBBQ $0, t0
   303		// A final borrow means the value was already below p256: restore the saved copy.
   304		CMOVQCS acc4, acc0
   305		CMOVQCS acc5, acc1
   306		CMOVQCS y_ptr, acc2
   307		CMOVQCS t1, acc3
   308	
   309		MOVQ acc0, (8*0)(res_ptr)
   310		MOVQ acc1, (8*1)(res_ptr)
   311		MOVQ acc2, (8*2)(res_ptr)
   312		MOVQ acc3, (8*3)(res_ptr)
   313	
   314		RET
   315	/* ---------------------------------------*/
   316	// func p256Mul(res, in1, in2 []uint64) -- 4-limb multiplication with a reduction step after each row, then one conditional subtraction of p256
   317	TEXT ·p256Mul(SB),NOSPLIT,$0
   318		MOVQ res+0(FP), res_ptr
   319		MOVQ in1+24(FP), x_ptr
   320		MOVQ in2+48(FP), y_ptr
   321		// x * y[0] (x = in1, y = in2): first row into acc0..acc4
   322		MOVQ (8*0)(y_ptr), t0
   323	
   324		MOVQ (8*0)(x_ptr), AX
   325		MULQ t0
   326		MOVQ AX, acc0
   327		MOVQ DX, acc1
   328	
   329		MOVQ (8*1)(x_ptr), AX
   330		MULQ t0
   331		ADDQ AX, acc1
   332		ADCQ $0, DX
   333		MOVQ DX, acc2
   334	
   335		MOVQ (8*2)(x_ptr), AX
   336		MULQ t0
   337		ADDQ AX, acc2
   338		ADCQ $0, DX
   339		MOVQ DX, acc3
   340	
   341		MOVQ (8*3)(x_ptr), AX
   342		MULQ t0
   343		ADDQ AX, acc3
   344		ADCQ $0, DX
   345		MOVQ DX, acc4
   346		XORQ acc5, acc5 // acc5 = 0: top carry limb for the reduction below
   347		// First reduction step: adds acc0 * p256 and drops the low limb, which cancels to zero
   348		MOVQ acc0, AX
   349		MOVQ acc0, t1
   350		SHLQ $32, acc0 // low half of acc0 * 2^32
   351		MULQ p256const1<>(SB) // DX:AX = acc0 * top limb of p256
   352		SHRQ $32, t1 // high half of acc0 * 2^32
   353		ADDQ acc0, acc1
   354		ADCQ t1, acc2
   355		ADCQ AX, acc3
   356		ADCQ DX, acc4
   357		ADCQ $0, acc5
   358		XORQ acc0, acc0 // acc0 is free again: next top carry limb
   359		// x * y[1]: second row added at acc1..acc5, carry into acc0
   360		MOVQ (8*1)(y_ptr), t0
   361	
   362		MOVQ (8*0)(x_ptr), AX
   363		MULQ t0
   364		ADDQ AX, acc1
   365		ADCQ $0, DX
   366		MOVQ DX, t1 // carry word into the next limb
   367	
   368		MOVQ (8*1)(x_ptr), AX
   369		MULQ t0
   370		ADDQ t1, acc2
   371		ADCQ $0, DX
   372		ADDQ AX, acc2
   373		ADCQ $0, DX
   374		MOVQ DX, t1
   375	
   376		MOVQ (8*2)(x_ptr), AX
   377		MULQ t0
   378		ADDQ t1, acc3
   379		ADCQ $0, DX
   380		ADDQ AX, acc3
   381		ADCQ $0, DX
   382		MOVQ DX, t1
   383	
   384		MOVQ (8*3)(x_ptr), AX
   385		MULQ t0
   386		ADDQ t1, acc4
   387		ADCQ $0, DX
   388		ADDQ AX, acc4
   389		ADCQ DX, acc5
   390		ADCQ $0, acc0
   391		// Second reduction step (same pattern on acc1)
   392		MOVQ acc1, AX
   393		MOVQ acc1, t1
   394		SHLQ $32, acc1
   395		MULQ p256const1<>(SB)
   396		SHRQ $32, t1
   397		ADDQ acc1, acc2
   398		ADCQ t1, acc3
   399		ADCQ AX, acc4
   400		ADCQ DX, acc5
   401		ADCQ $0, acc0
   402		XORQ acc1, acc1 // next top carry limb
   403		// x * y[2]: third row added at acc2..acc0, carry into acc1
   404		MOVQ (8*2)(y_ptr), t0
   405	
   406		MOVQ (8*0)(x_ptr), AX
   407		MULQ t0
   408		ADDQ AX, acc2
   409		ADCQ $0, DX
   410		MOVQ DX, t1
   411	
   412		MOVQ (8*1)(x_ptr), AX
   413		MULQ t0
   414		ADDQ t1, acc3
   415		ADCQ $0, DX
   416		ADDQ AX, acc3
   417		ADCQ $0, DX
   418		MOVQ DX, t1
   419	
   420		MOVQ (8*2)(x_ptr), AX
   421		MULQ t0
   422		ADDQ t1, acc4
   423		ADCQ $0, DX
   424		ADDQ AX, acc4
   425		ADCQ $0, DX
   426		MOVQ DX, t1
   427	
   428		MOVQ (8*3)(x_ptr), AX
   429		MULQ t0
   430		ADDQ t1, acc5
   431		ADCQ $0, DX
   432		ADDQ AX, acc5
   433		ADCQ DX, acc0
   434		ADCQ $0, acc1
   435		// Third reduction step (same pattern on acc2)
   436		MOVQ acc2, AX
   437		MOVQ acc2, t1
   438		SHLQ $32, acc2
   439		MULQ p256const1<>(SB)
   440		SHRQ $32, t1
   441		ADDQ acc2, acc3
   442		ADCQ t1, acc4
   443		ADCQ AX, acc5
   444		ADCQ DX, acc0
   445		ADCQ $0, acc1
   446		XORQ acc2, acc2 // next top carry limb
   447		// x * y[3]: fourth row added at acc3..acc1, carry into acc2
   448		MOVQ (8*3)(y_ptr), t0
   449	
   450		MOVQ (8*0)(x_ptr), AX
   451		MULQ t0
   452		ADDQ AX, acc3
   453		ADCQ $0, DX
   454		MOVQ DX, t1
   455	
   456		MOVQ (8*1)(x_ptr), AX
   457		MULQ t0
   458		ADDQ t1, acc4
   459		ADCQ $0, DX
   460		ADDQ AX, acc4
   461		ADCQ $0, DX
   462		MOVQ DX, t1
   463	
   464		MOVQ (8*2)(x_ptr), AX
   465		MULQ t0
   466		ADDQ t1, acc5
   467		ADCQ $0, DX
   468		ADDQ AX, acc5
   469		ADCQ $0, DX
   470		MOVQ DX, t1
   471	
   472		MOVQ (8*3)(x_ptr), AX
   473		MULQ t0
   474		ADDQ t1, acc0
   475		ADCQ $0, DX
   476		ADDQ AX, acc0
   477		ADCQ DX, acc1
   478		ADCQ $0, acc2
   479		// Last reduction step (same pattern on acc3); the result is acc4, acc5, acc0, acc1 with carry limb acc2
   480		MOVQ acc3, AX
   481		MOVQ acc3, t1
   482		SHLQ $32, acc3
   483		MULQ p256const1<>(SB)
   484		SHRQ $32, t1
   485		ADDQ acc3, acc4
   486		ADCQ t1, acc5
   487		ADCQ AX, acc0
   488		ADCQ DX, acc1
   489		ADCQ $0, acc2
   490		// Copy result [255:0] (x_ptr and acc3 are free now and serve as scratch)
   491		MOVQ acc4, x_ptr
   492		MOVQ acc5, acc3
   493		MOVQ acc0, t0
   494		MOVQ acc1, t1
   495		// Subtract p256 (borrow propagates through the carry limb acc2)
   496		SUBQ $-1, acc4
   497		SBBQ p256const0<>(SB) ,acc5
   498		SBBQ $0, acc0
   499		SBBQ p256const1<>(SB), acc1
   500		SBBQ $0, acc2
   501		// A final borrow means the value was already below p256: restore the saved copy.
   502		CMOVQCS x_ptr, acc4
   503		CMOVQCS acc3, acc5
   504		CMOVQCS t0, acc0
   505		CMOVQCS t1, acc1
   506	
   507		MOVQ acc4, (8*0)(res_ptr)
   508		MOVQ acc5, (8*1)(res_ptr)
   509		MOVQ acc0, (8*2)(res_ptr)
   510		MOVQ acc1, (8*3)(res_ptr)
   511	
   512		RET
   513	/* ---------------------------------------*/
   514	// func p256FromMont(res, in []uint64) -- applies the four reduction steps of p256Mul to in without any multiplication by a second operand
   515	TEXT ·p256FromMont(SB),NOSPLIT,$0
   516		MOVQ res+0(FP), res_ptr
   517		MOVQ in+24(FP), x_ptr
   518		// Load the four input limbs; acc4 starts as the zero top limb.
   519		MOVQ (8*0)(x_ptr), acc0
   520		MOVQ (8*1)(x_ptr), acc1
   521		MOVQ (8*2)(x_ptr), acc2
   522		MOVQ (8*3)(x_ptr), acc3
   523		XORQ acc4, acc4
   524	
   525		// Only reduce, no multiplications are needed
   526		// First stage: adds acc0 * p256 and drops the low limb, which cancels to zero
   527		MOVQ acc0, AX
   528		MOVQ acc0, t1
   529		SHLQ $32, acc0 // low half of acc0 * 2^32
   530		MULQ p256const1<>(SB) // DX:AX = acc0 * top limb of p256
   531		SHRQ $32, t1 // high half of acc0 * 2^32
   532		ADDQ acc0, acc1
   533		ADCQ t1, acc2
   534		ADCQ AX, acc3
   535		ADCQ DX, acc4
   536		XORQ acc5, acc5 // next top limb
   537		// Second stage
   538		MOVQ acc1, AX
   539		MOVQ acc1, t1
   540		SHLQ $32, acc1
   541		MULQ p256const1<>(SB)
   542		SHRQ $32, t1
   543		ADDQ acc1, acc2
   544		ADCQ t1, acc3
   545		ADCQ AX, acc4
   546		ADCQ DX, acc5
   547		XORQ acc0, acc0 // next top limb
   548		// Third stage
   549		MOVQ acc2, AX
   550		MOVQ acc2, t1
   551		SHLQ $32, acc2
   552		MULQ p256const1<>(SB)
   553		SHRQ $32, t1
   554		ADDQ acc2, acc3
   555		ADCQ t1, acc4
   556		ADCQ AX, acc5
   557		ADCQ DX, acc0
   558		XORQ acc1, acc1 // next top limb
   559		// Last stage: the result is acc4, acc5, acc0, acc1
   560		MOVQ acc3, AX
   561		MOVQ acc3, t1
   562		SHLQ $32, acc3
   563		MULQ p256const1<>(SB)
   564		SHRQ $32, t1
   565		ADDQ acc3, acc4
   566		ADCQ t1, acc5
   567		ADCQ AX, acc0
   568		ADCQ DX, acc1
   569		// Save a copy of the result for the conditional restore below.
   570		MOVQ acc4, x_ptr
   571		MOVQ acc5, acc3
   572		MOVQ acc0, t0
   573		MOVQ acc1, t1
   574		// Subtract p256 (no extra carry limb is tracked in this function).
   575		SUBQ $-1, acc4
   576		SBBQ p256const0<>(SB), acc5
   577		SBBQ $0, acc0
   578		SBBQ p256const1<>(SB), acc1
   579		// A borrow means the value was already below p256: restore the saved copy.
   580		CMOVQCS x_ptr, acc4
   581		CMOVQCS acc3, acc5
   582		CMOVQCS t0, acc0
   583		CMOVQCS t1, acc1
   584	
   585		MOVQ acc4, (8*0)(res_ptr)
   586		MOVQ acc5, (8*1)(res_ptr)
   587		MOVQ acc0, (8*2)(res_ptr)
   588		MOVQ acc1, (8*3)(res_ptr)
   589	
   590		RET
   591	/* ---------------------------------------*/
   592	// Constant time point access to arbitrary point table.
   593	// Indexed from 1 with -1 offset: idx k selects the k-th 96-byte entry; the loop below scans 16 entries (k = 1..16)
   594	// (index 0 is implicitly point at infinity: no entry matches and the result is all zero)
   595	// func p256Select(point, table []uint64, idx int)
   596	TEXT ·p256Select(SB),NOSPLIT,$0
   597		MOVQ idx+48(FP),AX
   598		MOVQ table+24(FP),DI
   599		MOVQ point+0(FP),DX
   600	
   601		PXOR X15, X15	// X15 = 0
   602		PCMPEQL X14, X14 // X14 = -1
   603		PSUBL X14, X15   // X15 = 1 in every 32-bit lane (0 - (-1))
   604		MOVL AX, X14 // only the low 32 bits of idx are used
   605		PSHUFD $0, X14, X14 // broadcast idx to all four lanes
   606		// X0..X5 accumulate the selected entry; start at zero.
   607		PXOR X0, X0
   608		PXOR X1, X1
   609		PXOR X2, X2
   610		PXOR X3, X3
   611		PXOR X4, X4
   612		PXOR X5, X5
   613		MOVQ $16, AX // number of table entries to scan
   614	
   615		MOVOU X15, X13 // X13 = running entry number, starting at 1
   616	
   617	loop_select:
   618			// Every entry is read on every iteration; only the mask depends on idx.
   619			MOVOU X13, X12
   620			PADDL X15, X13 // advance the entry number
   621			PCMPEQL X14, X12 // X12 = all ones if this entry number equals idx, else zero
   622	
   623			MOVOU (16*0)(DI), X6
   624			MOVOU (16*1)(DI), X7
   625			MOVOU (16*2)(DI), X8
   626			MOVOU (16*3)(DI), X9
   627			MOVOU (16*4)(DI), X10
   628			MOVOU (16*5)(DI), X11
   629			ADDQ $(16*6), DI // next 96-byte entry
   630	
   631			PAND X12, X6
   632			PAND X12, X7
   633			PAND X12, X8
   634			PAND X12, X9
   635			PAND X12, X10
   636			PAND X12, X11
   637	
   638			PXOR X6, X0
   639			PXOR X7, X1
   640			PXOR X8, X2
   641			PXOR X9, X3
   642			PXOR X10, X4
   643			PXOR X11, X5
   644	
   645			DECQ AX
   646			JNE loop_select
   647		// Write the 96 selected bytes to point.
   648		MOVOU X0, (16*0)(DX)
   649		MOVOU X1, (16*1)(DX)
   650		MOVOU X2, (16*2)(DX)
   651		MOVOU X3, (16*3)(DX)
   652		MOVOU X4, (16*4)(DX)
   653		MOVOU X5, (16*5)(DX)
   654	
   655		RET
   656	/* ---------------------------------------*/
   657	// Constant time point access to base point table: scans 64 entries of 64 bytes; idx k (1..64) selects the k-th, no match leaves zero.
   658	// func p256SelectBase(point, table []uint64, idx int)
   659	TEXT ·p256SelectBase(SB),NOSPLIT,$0
   660		MOVQ idx+48(FP),AX
   661		MOVQ table+24(FP),DI
   662		MOVQ point+0(FP),DX
   663	
   664		PXOR X15, X15	// X15 = 0
   665		PCMPEQL X14, X14 // X14 = -1
   666		PSUBL X14, X15   // X15 = 1 in every 32-bit lane (0 - (-1))
   667		MOVL AX, X14 // only the low 32 bits of idx are used
   668		PSHUFD $0, X14, X14 // broadcast idx to all four lanes
   669		// X0..X3 accumulate the selected entry; start at zero.
   670		PXOR X0, X0
   671		PXOR X1, X1
   672		PXOR X2, X2
   673		PXOR X3, X3
   674		MOVQ $32, AX // 32 iterations, two entries per iteration
   675	
   676		MOVOU X15, X13 // X13 = running entry number, starting at 1
   677	
   678	loop_select_base:
   679			// Mask for the first entry of this pair.
   680			MOVOU X13, X12
   681			PADDL X15, X13
   682			PCMPEQL X14, X12 // X12 = all ones if this entry number equals idx, else zero
   683	
   684			MOVOU (16*0)(DI), X4
   685			MOVOU (16*1)(DI), X5
   686			MOVOU (16*2)(DI), X6
   687			MOVOU (16*3)(DI), X7
   688	
   689			MOVOU (16*4)(DI), X8
   690			MOVOU (16*5)(DI), X9
   691			MOVOU (16*6)(DI), X10
   692			MOVOU (16*7)(DI), X11
   693	
   694			ADDQ $(16*8), DI // advance past both 64-byte entries
   695	
   696			PAND X12, X4
   697			PAND X12, X5
   698			PAND X12, X6
   699			PAND X12, X7
   700			// Mask for the second entry of this pair.
   701			MOVOU X13, X12
   702			PADDL X15, X13
   703			PCMPEQL X14, X12
   704	
   705			PAND X12, X8
   706			PAND X12, X9
   707			PAND X12, X10
   708			PAND X12, X11
   709	
   710			PXOR X4, X0
   711			PXOR X5, X1
   712			PXOR X6, X2
   713			PXOR X7, X3
   714	
   715			PXOR X8, X0
   716			PXOR X9, X1
   717			PXOR X10, X2
   718			PXOR X11, X3
   719	
   720			DECQ AX
   721			JNE loop_select_base
   722		// Write the 64 selected bytes to point.
   723		MOVOU X0, (16*0)(DX)
   724		MOVOU X1, (16*1)(DX)
   725		MOVOU X2, (16*2)(DX)
   726		MOVOU X3, (16*3)(DX)
   727	
   728		RET
   729	/* ---------------------------------------*/
   730	// func p256OrdMul(res, in1, in2 []uint64) -- same row/reduce structure as p256Mul, but the modulus is p256ord and each reduction uses p256ordK0
   731	TEXT ·p256OrdMul(SB),NOSPLIT,$0
   732		MOVQ res+0(FP), res_ptr
   733		MOVQ in1+24(FP), x_ptr
   734		MOVQ in2+48(FP), y_ptr
   735		// x * y[0] (x = in1, y = in2): first row into acc0..acc4
   736		MOVQ (8*0)(y_ptr), t0
   737	
   738		MOVQ (8*0)(x_ptr), AX
   739		MULQ t0
   740		MOVQ AX, acc0
   741		MOVQ DX, acc1
   742	
   743		MOVQ (8*1)(x_ptr), AX
   744		MULQ t0
   745		ADDQ AX, acc1
   746		ADCQ $0, DX
   747		MOVQ DX, acc2
   748	
   749		MOVQ (8*2)(x_ptr), AX
   750		MULQ t0
   751		ADDQ AX, acc2
   752		ADCQ $0, DX
   753		MOVQ DX, acc3
   754	
   755		MOVQ (8*3)(x_ptr), AX
   756		MULQ t0
   757		ADDQ AX, acc3
   758		ADCQ $0, DX
   759		MOVQ DX, acc4
   760		XORQ acc5, acc5 // acc5 = 0: top carry limb
   761		// First reduction step: t0 = acc0 * K0 (low 64 bits), then add t0 * p256ord to the accumulator
   762		MOVQ acc0, AX
   763		MULQ p256ordK0<>(SB)
   764		MOVQ AX, t0 // only the low word of the product is kept
   765	
   766		MOVQ p256ord<>+0x00(SB), AX
   767		MULQ t0
   768		ADDQ AX, acc0 // acc0 is expected to become 0 here (needs p256ordK0 == -1/p256ord mod 2^64); it is reused below as a carry limb without being cleared
   769		ADCQ $0, DX
   770		MOVQ DX, t1
   771	
   772		MOVQ p256ord<>+0x08(SB), AX
   773		MULQ t0
   774		ADDQ t1, acc1
   775		ADCQ $0, DX
   776		ADDQ AX, acc1
   777		ADCQ $0, DX
   778		MOVQ DX, t1
   779	
   780		MOVQ p256ord<>+0x10(SB), AX
   781		MULQ t0
   782		ADDQ t1, acc2
   783		ADCQ $0, DX
   784		ADDQ AX, acc2
   785		ADCQ $0, DX
   786		MOVQ DX, t1
   787	
   788		MOVQ p256ord<>+0x18(SB), AX
   789		MULQ t0
   790		ADDQ t1, acc3
   791		ADCQ $0, DX
   792		ADDQ AX, acc3
   793		ADCQ DX, acc4
   794		ADCQ $0, acc5
   795		// x * y[1]: second row added at acc1..acc5, carry into acc0
   796		MOVQ (8*1)(y_ptr), t0
   797	
   798		MOVQ (8*0)(x_ptr), AX
   799		MULQ t0
   800		ADDQ AX, acc1
   801		ADCQ $0, DX
   802		MOVQ DX, t1
   803	
   804		MOVQ (8*1)(x_ptr), AX
   805		MULQ t0
   806		ADDQ t1, acc2
   807		ADCQ $0, DX
   808		ADDQ AX, acc2
   809		ADCQ $0, DX
   810		MOVQ DX, t1
   811	
   812		MOVQ (8*2)(x_ptr), AX
   813		MULQ t0
   814		ADDQ t1, acc3
   815		ADCQ $0, DX
   816		ADDQ AX, acc3
   817		ADCQ $0, DX
   818		MOVQ DX, t1
   819	
   820		MOVQ (8*3)(x_ptr), AX
   821		MULQ t0
   822		ADDQ t1, acc4
   823		ADCQ $0, DX
   824		ADDQ AX, acc4
   825		ADCQ DX, acc5
   826		ADCQ $0, acc0
   827		// Second reduction step: same pattern on acc1
   828		MOVQ acc1, AX
   829		MULQ p256ordK0<>(SB)
   830		MOVQ AX, t0
   831	
   832		MOVQ p256ord<>+0x00(SB), AX
   833		MULQ t0
   834		ADDQ AX, acc1 // acc1 is expected to become 0 here; reused below as a carry limb
   835		ADCQ $0, DX
   836		MOVQ DX, t1
   837	
   838		MOVQ p256ord<>+0x08(SB), AX
   839		MULQ t0
   840		ADDQ t1, acc2
   841		ADCQ $0, DX
   842		ADDQ AX, acc2
   843		ADCQ $0, DX
   844		MOVQ DX, t1
   845	
   846		MOVQ p256ord<>+0x10(SB), AX
   847		MULQ t0
   848		ADDQ t1, acc3
   849		ADCQ $0, DX
   850		ADDQ AX, acc3
   851		ADCQ $0, DX
   852		MOVQ DX, t1
   853	
   854		MOVQ p256ord<>+0x18(SB), AX
   855		MULQ t0
   856		ADDQ t1, acc4
   857		ADCQ $0, DX
   858		ADDQ AX, acc4
   859		ADCQ DX, acc5
   860		ADCQ $0, acc0
   861		// x * y[2]: third row added at acc2..acc0, carry into acc1
   862		MOVQ (8*2)(y_ptr), t0
   863	
   864		MOVQ (8*0)(x_ptr), AX
   865		MULQ t0
   866		ADDQ AX, acc2
   867		ADCQ $0, DX
   868		MOVQ DX, t1
   869	
   870		MOVQ (8*1)(x_ptr), AX
   871		MULQ t0
   872		ADDQ t1, acc3
   873		ADCQ $0, DX
   874		ADDQ AX, acc3
   875		ADCQ $0, DX
   876		MOVQ DX, t1
   877	
   878		MOVQ (8*2)(x_ptr), AX
   879		MULQ t0
   880		ADDQ t1, acc4
   881		ADCQ $0, DX
   882		ADDQ AX, acc4
   883		ADCQ $0, DX
   884		MOVQ DX, t1
   885	
   886		MOVQ (8*3)(x_ptr), AX
   887		MULQ t0
   888		ADDQ t1, acc5
   889		ADCQ $0, DX
   890		ADDQ AX, acc5
   891		ADCQ DX, acc0
   892		ADCQ $0, acc1
   893		// Third reduction step: same pattern on acc2
   894		MOVQ acc2, AX
   895		MULQ p256ordK0<>(SB)
   896		MOVQ AX, t0
   897	
   898		MOVQ p256ord<>+0x00(SB), AX
   899		MULQ t0
   900		ADDQ AX, acc2 // acc2 is expected to become 0 here; reused below as a carry limb
   901		ADCQ $0, DX
   902		MOVQ DX, t1
   903	
   904		MOVQ p256ord<>+0x08(SB), AX
   905		MULQ t0
   906		ADDQ t1, acc3
   907		ADCQ $0, DX
   908		ADDQ AX, acc3
   909		ADCQ $0, DX
   910		MOVQ DX, t1
   911	
   912		MOVQ p256ord<>+0x10(SB), AX
   913		MULQ t0
   914		ADDQ t1, acc4
   915		ADCQ $0, DX
   916		ADDQ AX, acc4
   917		ADCQ $0, DX
   918		MOVQ DX, t1
   919	
   920		MOVQ p256ord<>+0x18(SB), AX
   921		MULQ t0
   922		ADDQ t1, acc5
   923		ADCQ $0, DX
   924		ADDQ AX, acc5
   925		ADCQ DX, acc0
   926		ADCQ $0, acc1
   927		// x * y[3]: fourth row added at acc3..acc1, carry into acc2
   928		MOVQ (8*3)(y_ptr), t0
   929	
   930		MOVQ (8*0)(x_ptr), AX
   931		MULQ t0
   932		ADDQ AX, acc3
   933		ADCQ $0, DX
   934		MOVQ DX, t1
   935	
   936		MOVQ (8*1)(x_ptr), AX
   937		MULQ t0
   938		ADDQ t1, acc4
   939		ADCQ $0, DX
   940		ADDQ AX, acc4
   941		ADCQ $0, DX
   942		MOVQ DX, t1
   943	
   944		MOVQ (8*2)(x_ptr), AX
   945		MULQ t0
   946		ADDQ t1, acc5
   947		ADCQ $0, DX
   948		ADDQ AX, acc5
   949		ADCQ $0, DX
   950		MOVQ DX, t1
   951	
   952		MOVQ (8*3)(x_ptr), AX
   953		MULQ t0
   954		ADDQ t1, acc0
   955		ADCQ $0, DX
   956		ADDQ AX, acc0
   957		ADCQ DX, acc1
   958		ADCQ $0, acc2
   959		// Last reduction step: same pattern on acc3; the result is acc4, acc5, acc0, acc1 with carry limb acc2
   960		MOVQ acc3, AX
   961		MULQ p256ordK0<>(SB)
   962		MOVQ AX, t0
   963	
   964		MOVQ p256ord<>+0x00(SB), AX
   965		MULQ t0
   966		ADDQ AX, acc3 // acc3 is not read again as a limb; it is overwritten with a copy below
   967		ADCQ $0, DX
   968		MOVQ DX, t1
   969	
   970		MOVQ p256ord<>+0x08(SB), AX
   971		MULQ t0
   972		ADDQ t1, acc4
   973		ADCQ $0, DX
   974		ADDQ AX, acc4
   975		ADCQ $0, DX
   976		MOVQ DX, t1
   977	
   978		MOVQ p256ord<>+0x10(SB), AX
   979		MULQ t0
   980		ADDQ t1, acc5
   981		ADCQ $0, DX
   982		ADDQ AX, acc5
   983		ADCQ $0, DX
   984		MOVQ DX, t1
   985	
   986		MOVQ p256ord<>+0x18(SB), AX
   987		MULQ t0
   988		ADDQ t1, acc0
   989		ADCQ $0, DX
   990		ADDQ AX, acc0
   991		ADCQ DX, acc1
   992		ADCQ $0, acc2
   993		// Copy result [255:0] (x_ptr and acc3 serve as scratch)
   994		MOVQ acc4, x_ptr
   995		MOVQ acc5, acc3
   996		MOVQ acc0, t0
   997		MOVQ acc1, t1
   998		// Subtract p256ord, the modulus of this function (borrow propagates through the carry limb acc2)
   999		SUBQ p256ord<>+0x00(SB), acc4
  1000		SBBQ p256ord<>+0x08(SB) ,acc5
  1001		SBBQ p256ord<>+0x10(SB), acc0
  1002		SBBQ p256ord<>+0x18(SB), acc1
  1003		SBBQ $0, acc2
  1004		// A final borrow means the value was already below p256ord: restore the saved copy.
  1005		CMOVQCS x_ptr, acc4
  1006		CMOVQCS acc3, acc5
  1007		CMOVQCS t0, acc0
  1008		CMOVQCS t1, acc1
  1009	
  1010		MOVQ acc4, (8*0)(res_ptr)
  1011		MOVQ acc5, (8*1)(res_ptr)
  1012		MOVQ acc0, (8*2)(res_ptr)
  1013		MOVQ acc1, (8*3)(res_ptr)
  1014	
  1015		RET
  1016	/* ---------------------------------------*/
  1017	// func p256OrdSqr(res, in []uint64, n int) -- squares in modulo p256ord (with the p256ordK0 reduction) n times in a row, storing each result in res
  1018	TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1019		MOVQ res+0(FP), res_ptr
  1020		MOVQ in+24(FP), x_ptr
  1021		MOVQ n+48(FP), BX // loop counter
  1022	
  1023	ordSqrLoop:
  1024		// NOTE(review): the body runs once before n is tested, so n == 0 is not handled (the counter would wrap) -- confirm callers pass n >= 1
  1025		// y[1:] * y[0] (y is the current input at x_ptr): cross products with limb 0, collected in acc1..acc4
  1026		MOVQ (8*0)(x_ptr), t0
  1027	
  1028		MOVQ (8*1)(x_ptr), AX
  1029		MULQ t0
  1030		MOVQ AX, acc1
  1031		MOVQ DX, acc2
  1032	
  1033		MOVQ (8*2)(x_ptr), AX
  1034		MULQ t0
  1035		ADDQ AX, acc2
  1036		ADCQ $0, DX
  1037		MOVQ DX, acc3
  1038	
  1039		MOVQ (8*3)(x_ptr), AX
  1040		MULQ t0
  1041		ADDQ AX, acc3
  1042		ADCQ $0, DX
  1043		MOVQ DX, acc4
  1044		// y[2:] * y[1]: added at acc3..acc5
  1045		MOVQ (8*1)(x_ptr), t0
  1046	
  1047		MOVQ (8*2)(x_ptr), AX
  1048		MULQ t0
  1049		ADDQ AX, acc3
  1050		ADCQ $0, DX
  1051		MOVQ DX, t1
  1052	
  1053		MOVQ (8*3)(x_ptr), AX
  1054		MULQ t0
  1055		ADDQ t1, acc4
  1056		ADCQ $0, DX
  1057		ADDQ AX, acc4
  1058		ADCQ $0, DX
  1059		MOVQ DX, acc5
  1060		// y[3] * y[2]: added at acc5, high word goes to y_ptr (used as a limb from here on)
  1061		MOVQ (8*2)(x_ptr), t0
  1062	
  1063		MOVQ (8*3)(x_ptr), AX
  1064		MULQ t0
  1065		ADDQ AX, acc5
  1066		ADCQ $0, DX
  1067		MOVQ DX, y_ptr
  1068		XORQ t1, t1 // t1 = 0; receives the bit shifted out by the doubling
  1069		// *2: double the sum of cross products (acc1..acc5, y_ptr), carry-out into t1
  1070		ADDQ acc1, acc1
  1071		ADCQ acc2, acc2
  1072		ADCQ acc3, acc3
  1073		ADCQ acc4, acc4
  1074		ADCQ acc5, acc5
  1075		ADCQ y_ptr, y_ptr
  1076		ADCQ $0, t1
  1077		// Missing products: add the squares y[i]*y[i]
  1078		MOVQ (8*0)(x_ptr), AX
  1079		MULQ AX
  1080		MOVQ AX, acc0
  1081		MOVQ DX, t0
  1082	
  1083		MOVQ (8*1)(x_ptr), AX
  1084		MULQ AX
  1085		ADDQ t0, acc1
  1086		ADCQ AX, acc2
  1087		ADCQ $0, DX
  1088		MOVQ DX, t0
  1089	
  1090		MOVQ (8*2)(x_ptr), AX
  1091		MULQ AX
  1092		ADDQ t0, acc3
  1093		ADCQ AX, acc4
  1094		ADCQ $0, DX
  1095		MOVQ DX, t0
  1096	
  1097		MOVQ (8*3)(x_ptr), AX
  1098		MULQ AX
  1099		ADDQ t0, acc5
  1100		ADCQ AX, y_ptr
  1101		ADCQ DX, t1
  1102		MOVQ t1, x_ptr // input pointer no longer needed; the 8-limb square is now acc0..acc5, y_ptr, x_ptr
  1103		// First reduction step: t0 = acc0 * K0 (low 64 bits), then add t0 * p256ord; only the two low limbs of p256ord need MULQ
  1104		MOVQ acc0, AX
  1105		MULQ p256ordK0<>(SB)
  1106		MOVQ AX, t0
  1107	
  1108		MOVQ p256ord<>+0x00(SB), AX
  1109		MULQ t0
  1110		ADDQ AX, acc0 // result is discarded: acc0 is overwritten with t0 below
  1111		ADCQ $0, DX
  1112		MOVQ DX, t1
  1113	
  1114		MOVQ p256ord<>+0x08(SB), AX
  1115		MULQ t0
  1116		ADDQ t1, acc1
  1117		ADCQ $0, DX
  1118		ADDQ AX, acc1
  1119		// Limb 2 of p256ord is 2^64 - 1: add t0 one limb higher (via t1) and subtract t0 at this limb.
  1120		MOVQ t0, t1
  1121		ADCQ DX, acc2
  1122		ADCQ $0, t1
  1123		SUBQ t0, acc2
  1124		SBBQ $0, t1
  1125		// Limb 3 of p256ord is 2^64 - 2^32: add t0 one limb higher (acc0 becomes the new top limb) and subtract t0 * 2^32 split across AX/DX.
  1126		MOVQ t0, AX
  1127		MOVQ t0, DX
  1128		MOVQ t0, acc0
  1129		SHLQ $32, AX
  1130		SHRQ $32, DX
  1131	
  1132		ADDQ t1, acc3
  1133		ADCQ $0, acc0
  1134		SUBQ AX, acc3
  1135		SBBQ DX, acc0
  1136		// Second reduction step: same pattern on acc1 (acc1 becomes the new top limb)
  1137		MOVQ acc1, AX
  1138		MULQ p256ordK0<>(SB)
  1139		MOVQ AX, t0
  1140	
  1141		MOVQ p256ord<>+0x00(SB), AX
  1142		MULQ t0
  1143		ADDQ AX, acc1
  1144		ADCQ $0, DX
  1145		MOVQ DX, t1
  1146	
  1147		MOVQ p256ord<>+0x08(SB), AX
  1148		MULQ t0
  1149		ADDQ t1, acc2
  1150		ADCQ $0, DX
  1151		ADDQ AX, acc2
  1152	
  1153		MOVQ t0, t1
  1154		ADCQ DX, acc3
  1155		ADCQ $0, t1
  1156		SUBQ t0, acc3
  1157		SBBQ $0, t1
  1158	
  1159		MOVQ t0, AX
  1160		MOVQ t0, DX
  1161		MOVQ t0, acc1
  1162		SHLQ $32, AX
  1163		SHRQ $32, DX
  1164	
  1165		ADDQ t1, acc0
  1166		ADCQ $0, acc1
  1167		SUBQ AX, acc0
  1168		SBBQ DX, acc1
  1169		// Third reduction step: same pattern on acc2 (acc2 becomes the new top limb)
  1170		MOVQ acc2, AX
  1171		MULQ p256ordK0<>(SB)
  1172		MOVQ AX, t0
  1173	
  1174		MOVQ p256ord<>+0x00(SB), AX
  1175		MULQ t0
  1176		ADDQ AX, acc2
  1177		ADCQ $0, DX
  1178		MOVQ DX, t1
  1179	
  1180		MOVQ p256ord<>+0x08(SB), AX
  1181		MULQ t0
  1182		ADDQ t1, acc3
  1183		ADCQ $0, DX
  1184		ADDQ AX, acc3
  1185	
  1186		MOVQ t0, t1
  1187		ADCQ DX, acc0
  1188		ADCQ $0, t1
  1189		SUBQ t0, acc0
  1190		SBBQ $0, t1
  1191	
  1192		MOVQ t0, AX
  1193		MOVQ t0, DX
  1194		MOVQ t0, acc2
  1195		SHLQ $32, AX
  1196		SHRQ $32, DX
  1197	
  1198		ADDQ t1, acc1
  1199		ADCQ $0, acc2
  1200		SUBQ AX, acc1
  1201		SBBQ DX, acc2
  1202		// Last reduction step: same pattern on acc3 (acc3 becomes the new top limb); the window ends as acc0..acc3
  1203		MOVQ acc3, AX
  1204		MULQ p256ordK0<>(SB)
  1205		MOVQ AX, t0
  1206	
  1207		MOVQ p256ord<>+0x00(SB), AX
  1208		MULQ t0
  1209		ADDQ AX, acc3
  1210		ADCQ $0, DX
  1211		MOVQ DX, t1
  1212	
  1213		MOVQ p256ord<>+0x08(SB), AX
  1214		MULQ t0
  1215		ADDQ t1, acc0
  1216		ADCQ $0, DX
  1217		ADDQ AX, acc0
  1218		ADCQ $0, DX // unlike the earlier steps, the carry is folded into DX here
  1219		MOVQ DX, t1 // NOTE(review): dead store, t1 is overwritten by the next instruction
  1220	
  1221		MOVQ t0, t1
  1222		ADCQ DX, acc1 // CF is expected to be clear here (left by the ADCQ $0, DX above), so this adds just DX
  1223		ADCQ $0, t1
  1224		SUBQ t0, acc1
  1225		SBBQ $0, t1
  1226	
  1227		MOVQ t0, AX
  1228		MOVQ t0, DX
  1229		MOVQ t0, acc3
  1230		SHLQ $32, AX
  1231		SHRQ $32, DX
  1232	
  1233		ADDQ t1, acc2
  1234		ADCQ $0, acc3
  1235		SUBQ AX, acc2
  1236		SBBQ DX, acc3
  1237		XORQ t0, t0 // t0 = 0 for the final carry; XORQ also clears CF, so the first ADCQ below acts as a plain add
  1238		// Add bits [511:256] of the sqr result
  1239		ADCQ acc4, acc0
  1240		ADCQ acc5, acc1
  1241		ADCQ y_ptr, acc2
  1242		ADCQ x_ptr, acc3
  1243		ADCQ $0, t0
  1244		// Keep a copy of the unreduced value for the conditional restore below.
  1245		MOVQ acc0, acc4
  1246		MOVQ acc1, acc5
  1247		MOVQ acc2, y_ptr
  1248		MOVQ acc3, t1
  1249		// Subtract p256ord, the modulus of this function (borrow propagates through the carry limb t0)
  1250		SUBQ p256ord<>+0x00(SB), acc0
  1251		SBBQ p256ord<>+0x08(SB) ,acc1
  1252		SBBQ p256ord<>+0x10(SB), acc2
  1253		SBBQ p256ord<>+0x18(SB), acc3
  1254		SBBQ $0, t0
  1255		// A final borrow means the value was already below p256ord: restore the saved copy.
  1256		CMOVQCS acc4, acc0
  1257		CMOVQCS acc5, acc1
  1258		CMOVQCS y_ptr, acc2
  1259		CMOVQCS t1, acc3
  1260	
  1261		MOVQ acc0, (8*0)(res_ptr)
  1262		MOVQ acc1, (8*1)(res_ptr)
  1263		MOVQ acc2, (8*2)(res_ptr)
  1264		MOVQ acc3, (8*3)(res_ptr)
  1265		MOVQ res_ptr, x_ptr // the next iteration squares the value just stored in res
  1266		DECQ BX
  1267		JNE ordSqrLoop
  1268	
  1269		RET
  1270	/* ---------------------------------------*/
  1271	#undef res_ptr // drop the register names used by the functions above; a different mapping follows
  1272	#undef x_ptr
  1273	#undef y_ptr
  1274	
  1275	#undef acc0
  1276	#undef acc1
  1277	#undef acc2
  1278	#undef acc3
  1279	#undef acc4
  1280	#undef acc5
  1281	#undef t0
  1282	#undef t1
  1283	/* ---------------------------------------*/
  1284	#define mul0 AX // mul0/mul1: the implicit low/high result registers of MULQ
  1285	#define mul1 DX
  1286	#define acc0 BX // acc0..acc7: eight accumulator limbs (note acc0..acc5 now map to different registers than above)
  1287	#define acc1 CX
  1288	#define acc2 R8
  1289	#define acc3 R9
  1290	#define acc4 R10
  1291	#define acc5 R11
  1292	#define acc6 R12
  1293	#define acc7 R13
  1294	#define t0 R14 // t0..t3: four limbs of the second operand of the internal helpers
  1295	#define t1 R15
  1296	#define t2 DI
  1297	#define t3 SI
  1298	#define hlp BP // scratch register used by p256MulInternal
  1299	/* ---------------------------------------*/
  1300	TEXT p256SubInternal(SB),NOSPLIT,$0 // register-only helper: acc4..acc7 = (acc4..acc7) - (t0..t3), with p256 added back if the subtraction borrowed
  1301		XORQ mul0, mul0 // mul0 = 0; will capture the borrow
  1302		SUBQ t0, acc4
  1303		SBBQ t1, acc5
  1304		SBBQ t2, acc6
  1305		SBBQ t3, acc7
  1306		SBBQ $0, mul0 // mul0 = -1 if the subtraction borrowed, else 0
  1307		// Save the raw difference (clobbers acc0..acc3).
  1308		MOVQ acc4, acc0
  1309		MOVQ acc5, acc1
  1310		MOVQ acc6, acc2
  1311		MOVQ acc7, acc3
  1312		// Speculatively add p256 (limbs -1, p256const0, 0, p256const1); the carry-out is ignored.
  1313		ADDQ $-1, acc4
  1314		ADCQ p256const0<>(SB), acc5
  1315		ADCQ $0, acc6
  1316		ADCQ p256const1<>(SB), acc7
  1317		ANDQ $1, mul0 // sets ZF exactly when there was no borrow
  1318		// No borrow: keep the raw difference instead of the value with p256 added.
  1319		CMOVQEQ acc0, acc4
  1320		CMOVQEQ acc1, acc5
  1321		CMOVQEQ acc2, acc6
  1322		CMOVQEQ acc3, acc7
  1323	
  1324		RET
  1325	/* ---------------------------------------*/
      	// p256MulInternal multiplies (acc4..acc7) by (t0..t3), then runs four
      	// reduction steps that each add a multiple of p to clear the lowest limb,
      	// followed by one conditional subtraction of p. The net effect is
      	// a*b/2^256 mod p (a Montgomery multiplication), assuming reduced inputs.
      	//   In:       acc4..acc7 = a, t0..t3 = b (little-endian limbs)
      	//   Out:      acc4..acc7 = result
      	//   Clobbers: mul0 (AX), mul1 (DX), acc0..acc3, hlp (BP), flags.
      	//             t0..t3 are only read, so callers may reuse them afterwards.
  1326	TEXT p256MulInternal(SB),NOSPLIT,$0
      		// Row 0: a0 * b -> acc0..acc3, top limb replaces a0 in acc4.
  1327		MOVQ acc4, mul0
  1328		MULQ t0
  1329		MOVQ mul0, acc0
  1330		MOVQ mul1, acc1
  1331	
  1332		MOVQ acc4, mul0
  1333		MULQ t1
  1334		ADDQ mul0, acc1
  1335		ADCQ $0, mul1
  1336		MOVQ mul1, acc2
  1337	
  1338		MOVQ acc4, mul0
  1339		MULQ t2
  1340		ADDQ mul0, acc2
  1341		ADCQ $0, mul1
  1342		MOVQ mul1, acc3
  1343	
  1344		MOVQ acc4, mul0
  1345		MULQ t3
  1346		ADDQ mul0, acc3
  1347		ADCQ $0, mul1
  1348		MOVQ mul1, acc4
  1349	
      		// Row 1: a1 * b accumulated into acc1..acc4, top limb replaces a1 in acc5.
      		// hlp carries the high half of each partial product to the next column.
  1350		MOVQ acc5, mul0
  1351		MULQ t0
  1352		ADDQ mul0, acc1
  1353		ADCQ $0, mul1
  1354		MOVQ mul1, hlp
  1355	
  1356		MOVQ acc5, mul0
  1357		MULQ t1
  1358		ADDQ hlp, acc2
  1359		ADCQ $0, mul1
  1360		ADDQ mul0, acc2
  1361		ADCQ $0, mul1
  1362		MOVQ mul1, hlp
  1363	
  1364		MOVQ acc5, mul0
  1365		MULQ t2
  1366		ADDQ hlp, acc3
  1367		ADCQ $0, mul1
  1368		ADDQ mul0, acc3
  1369		ADCQ $0, mul1
  1370		MOVQ mul1, hlp
  1371	
  1372		MOVQ acc5, mul0
  1373		MULQ t3
  1374		ADDQ hlp, acc4
  1375		ADCQ $0, mul1
  1376		ADDQ mul0, acc4
  1377		ADCQ $0, mul1
  1378		MOVQ mul1, acc5
  1379	
      		// Row 2: a2 * b accumulated into acc2..acc5, top limb replaces a2 in acc6.
  1380		MOVQ acc6, mul0
  1381		MULQ t0
  1382		ADDQ mul0, acc2
  1383		ADCQ $0, mul1
  1384		MOVQ mul1, hlp
  1385	
  1386		MOVQ acc6, mul0
  1387		MULQ t1
  1388		ADDQ hlp, acc3
  1389		ADCQ $0, mul1
  1390		ADDQ mul0, acc3
  1391		ADCQ $0, mul1
  1392		MOVQ mul1, hlp
  1393	
  1394		MOVQ acc6, mul0
  1395		MULQ t2
  1396		ADDQ hlp, acc4
  1397		ADCQ $0, mul1
  1398		ADDQ mul0, acc4
  1399		ADCQ $0, mul1
  1400		MOVQ mul1, hlp
  1401	
  1402		MOVQ acc6, mul0
  1403		MULQ t3
  1404		ADDQ hlp, acc5
  1405		ADCQ $0, mul1
  1406		ADDQ mul0, acc5
  1407		ADCQ $0, mul1
  1408		MOVQ mul1, acc6
  1409	
      		// Row 3: a3 * b accumulated into acc3..acc6, top limb replaces a3 in acc7.
      		// Afterwards acc0..acc7 hold the full 512-bit product, low limb in acc0.
  1410		MOVQ acc7, mul0
  1411		MULQ t0
  1412		ADDQ mul0, acc3
  1413		ADCQ $0, mul1
  1414		MOVQ mul1, hlp
  1415	
  1416		MOVQ acc7, mul0
  1417		MULQ t1
  1418		ADDQ hlp, acc4
  1419		ADCQ $0, mul1
  1420		ADDQ mul0, acc4
  1421		ADCQ $0, mul1
  1422		MOVQ mul1, hlp
  1423	
  1424		MOVQ acc7, mul0
  1425		MULQ t2
  1426		ADDQ hlp, acc5
  1427		ADCQ $0, mul1
  1428		ADDQ mul0, acc5
  1429		ADCQ $0, mul1
  1430		MOVQ mul1, hlp
  1431	
  1432		MOVQ acc7, mul0
  1433		MULQ t3
  1434		ADDQ hlp, acc6
  1435		ADCQ $0, mul1
  1436		ADDQ mul0, acc6
  1437		ADCQ $0, mul1
  1438		MOVQ mul1, acc7
      		// Each reduction step adds (lowest limb) * p to the low half. Because
      		// p = -1 mod 2^64 this cancels the lowest limb, so its register is reused
      		// for the new top limb. The shifts supply the 2^96 term of p and the
      		// multiplication by p256const1 supplies the terms at 2^192 and above.
  1439		// First reduction step
  1440		MOVQ acc0, mul0
  1441		MOVQ acc0, hlp
  1442		SHLQ $32, acc0
  1443		MULQ p256const1<>(SB)
  1444		SHRQ $32, hlp
  1445		ADDQ acc0, acc1
  1446		ADCQ hlp, acc2
  1447		ADCQ mul0, acc3
  1448		ADCQ $0, mul1
  1449		MOVQ mul1, acc0
  1450		// Second reduction step
  1451		MOVQ acc1, mul0
  1452		MOVQ acc1, hlp
  1453		SHLQ $32, acc1
  1454		MULQ p256const1<>(SB)
  1455		SHRQ $32, hlp
  1456		ADDQ acc1, acc2
  1457		ADCQ hlp, acc3
  1458		ADCQ mul0, acc0
  1459		ADCQ $0, mul1
  1460		MOVQ mul1, acc1
  1461		// Third reduction step
  1462		MOVQ acc2, mul0
  1463		MOVQ acc2, hlp
  1464		SHLQ $32, acc2
  1465		MULQ p256const1<>(SB)
  1466		SHRQ $32, hlp
  1467		ADDQ acc2, acc3
  1468		ADCQ hlp, acc0
  1469		ADCQ mul0, acc1
  1470		ADCQ $0, mul1
  1471		MOVQ mul1, acc2
  1472		// Last reduction step
  1473		MOVQ acc3, mul0
  1474		MOVQ acc3, hlp
  1475		SHLQ $32, acc3
  1476		MULQ p256const1<>(SB)
  1477		SHRQ $32, hlp
  1478		ADDQ acc3, acc0
  1479		ADCQ hlp, acc1
  1480		ADCQ mul0, acc2
  1481		ADCQ $0, mul1
  1482		MOVQ mul1, acc3
      		// hlp is cleared with a MOV, which leaves the flags untouched. The
      		// instruction is emitted as raw bytes, presumably so the assembler cannot
      		// substitute a flag-clobbering XOR - TODO confirm.
  1483		BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00   // MOVQ $0, BP
      		// The chain below starts with ADCQ, so it consumes the carry left by the
      		// ADCQ $0, mul1 above. That carry is always clear, because the high half
      		// of a 64x64-bit product plus one cannot overflow 64 bits.
  1484		// Add bits [511:256] of the result
  1485		ADCQ acc0, acc4
  1486		ADCQ acc1, acc5
  1487		ADCQ acc2, acc6
  1488		ADCQ acc3, acc7
      		// hlp now holds the 257th bit of the sum.
  1489		ADCQ $0, hlp
  1490		// Copy result
  1491		MOVQ acc4, acc0
  1492		MOVQ acc5, acc1
  1493		MOVQ acc6, acc2
  1494		MOVQ acc7, acc3
  1495		// Subtract p256
  1496		SUBQ $-1, acc4
  1497		SBBQ p256const0<>(SB) ,acc5
  1498		SBBQ $0, acc6
  1499		SBBQ p256const1<>(SB), acc7
  1500		SBBQ $0, hlp
  1501		// If the result of the subtraction is negative, restore the previous result
  1502		CMOVQCS acc0, acc4
  1503		CMOVQCS acc1, acc5
  1504		CMOVQCS acc2, acc6
  1505		CMOVQCS acc3, acc7
  1506	
  1507		RET
  1508	/* ---------------------------------------*/
      	// p256SqrInternal squares (acc4..acc7) and reduces the 512-bit result with
      	// the same four reduction steps and final conditional subtraction as
      	// p256MulInternal, i.e. it yields a*a/2^256 mod p for a reduced input.
      	//   In:       acc4..acc7 = a (little-endian limbs)
      	//   Out:      acc4..acc7 = result
      	//   Clobbers: mul0 (AX), mul1 (DX), acc0..acc3, t0..t3, hlp (BP), flags.
      	// Unlike p256MulInternal, t0..t3 are overwritten: they hold the high half
      	// of the square.
  1509	TEXT p256SqrInternal(SB),NOSPLIT,$0
  1510	
      		// Off-diagonal products a_i*a_j (i < j), each computed once, are summed
      		// into limbs 1..6 held in acc1, acc2, acc3, t0, t1, t2.
  1511		MOVQ acc4, mul0
  1512		MULQ acc5
  1513		MOVQ mul0, acc1
  1514		MOVQ mul1, acc2
  1515	
  1516		MOVQ acc4, mul0
  1517		MULQ acc6
  1518		ADDQ mul0, acc2
  1519		ADCQ $0, mul1
  1520		MOVQ mul1, acc3
  1521	
  1522		MOVQ acc4, mul0
  1523		MULQ acc7
  1524		ADDQ mul0, acc3
  1525		ADCQ $0, mul1
  1526		MOVQ mul1, t0
  1527	
  1528		MOVQ acc5, mul0
  1529		MULQ acc6
  1530		ADDQ mul0, acc3
  1531		ADCQ $0, mul1
  1532		MOVQ mul1, hlp
  1533	
  1534		MOVQ acc5, mul0
  1535		MULQ acc7
  1536		ADDQ hlp, t0
  1537		ADCQ $0, mul1
  1538		ADDQ mul0, t0
  1539		ADCQ $0, mul1
  1540		MOVQ mul1, t1
  1541	
  1542		MOVQ acc6, mul0
  1543		MULQ acc7
  1544		ADDQ mul0, t1
  1545		ADCQ $0, mul1
  1546		MOVQ mul1, t2
  1547		XORQ t3, t3
      		// Double the off-diagonal sum. t3 receives the bit shifted out of t2.
  1548		// *2
  1549		ADDQ acc1, acc1
  1550		ADCQ acc2, acc2
  1551		ADCQ acc3, acc3
  1552		ADCQ t0, t0
  1553		ADCQ t1, t1
  1554		ADCQ t2, t2
  1555		ADCQ $0, t3
      		// Add the diagonal squares a_i*a_i. acc4 (a0) is free once a0*a0 has been
      		// formed, so it is reused to pass each high half to the next pair of limbs.
  1556		// Missing products
  1557		MOVQ acc4, mul0
  1558		MULQ mul0
  1559		MOVQ mul0, acc0
  1560		MOVQ DX, acc4
  1561	
  1562		MOVQ acc5, mul0
  1563		MULQ mul0
  1564		ADDQ acc4, acc1
  1565		ADCQ mul0, acc2
  1566		ADCQ $0, DX
  1567		MOVQ DX, acc4
  1568	
  1569		MOVQ acc6, mul0
  1570		MULQ mul0
  1571		ADDQ acc4, acc3
  1572		ADCQ mul0, t0
  1573		ADCQ $0, DX
  1574		MOVQ DX, acc4
  1575	
  1576		MOVQ acc7, mul0
  1577		MULQ mul0
  1578		ADDQ acc4, t1
  1579		ADCQ mul0, t2
  1580		ADCQ DX, t3
      		// The 512-bit square is now in acc0..acc3 (low half) and t0..t3 (high
      		// half). Each reduction step adds (lowest limb) * p to clear that limb
      		// and reuses its register for the new top limb.
  1581		// First reduction step
  1582		MOVQ acc0, mul0
  1583		MOVQ acc0, hlp
  1584		SHLQ $32, acc0
  1585		MULQ p256const1<>(SB)
  1586		SHRQ $32, hlp
  1587		ADDQ acc0, acc1
  1588		ADCQ hlp, acc2
  1589		ADCQ mul0, acc3
  1590		ADCQ $0, mul1
  1591		MOVQ mul1, acc0
  1592		// Second reduction step
  1593		MOVQ acc1, mul0
  1594		MOVQ acc1, hlp
  1595		SHLQ $32, acc1
  1596		MULQ p256const1<>(SB)
  1597		SHRQ $32, hlp
  1598		ADDQ acc1, acc2
  1599		ADCQ hlp, acc3
  1600		ADCQ mul0, acc0
  1601		ADCQ $0, mul1
  1602		MOVQ mul1, acc1
  1603		// Third reduction step
  1604		MOVQ acc2, mul0
  1605		MOVQ acc2, hlp
  1606		SHLQ $32, acc2
  1607		MULQ p256const1<>(SB)
  1608		SHRQ $32, hlp
  1609		ADDQ acc2, acc3
  1610		ADCQ hlp, acc0
  1611		ADCQ mul0, acc1
  1612		ADCQ $0, mul1
  1613		MOVQ mul1, acc2
  1614		// Last reduction step
  1615		MOVQ acc3, mul0
  1616		MOVQ acc3, hlp
  1617		SHLQ $32, acc3
  1618		MULQ p256const1<>(SB)
  1619		SHRQ $32, hlp
  1620		ADDQ acc3, acc0
  1621		ADCQ hlp, acc1
  1622		ADCQ mul0, acc2
  1623		ADCQ $0, mul1
  1624		MOVQ mul1, acc3
      		// hlp is cleared with a MOV (raw bytes), which does not modify the flags.
      		// The ADCQ chain below therefore sees the carry from ADCQ $0, mul1, and
      		// that carry is always clear.
  1625		BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00   // MOVQ $0, BP
  1626		// Add bits [511:256] of the result
  1627		ADCQ acc0, t0
  1628		ADCQ acc1, t1
  1629		ADCQ acc2, t2
  1630		ADCQ acc3, t3
      		// hlp now holds the 257th bit of the sum.
  1631		ADCQ $0, hlp
  1632		// Copy result
  1633		MOVQ t0, acc4
  1634		MOVQ t1, acc5
  1635		MOVQ t2, acc6
  1636		MOVQ t3, acc7
  1637		// Subtract p256
  1638		SUBQ $-1, acc4
  1639		SBBQ p256const0<>(SB) ,acc5
  1640		SBBQ $0, acc6
  1641		SBBQ p256const1<>(SB), acc7
  1642		SBBQ $0, hlp
  1643		// If the result of the subtraction is negative, restore the previous result
  1644		CMOVQCS t0, acc4
  1645		CMOVQCS t1, acc5
  1646		CMOVQCS t2, acc6
  1647		CMOVQCS t3, acc7
  1648	
  1649		RET
  1650	/* ---------------------------------------*/
      	// p256MulBy2Inline doubles (acc4..acc7) and conditionally subtracts p once.
      	//   In:       acc4..acc7 = a
      	//   Out:      t0..t3 = 2*a reduced by at most one subtraction of p
      	//   Clobbers: mul0 (AX), flags. acc4..acc7 are left holding the low 256 bits
      	//             of the unreduced 2*a, not the reduced result.
      	// mul0 catches the bit shifted out of acc7. If subtracting p from the
      	// 257-bit value borrows, the unreduced value is copied back into t0..t3.
  1651	#define p256MulBy2Inline\
  1652		XORQ mul0, mul0;\
  1653		ADDQ acc4, acc4;\
  1654		ADCQ acc5, acc5;\
  1655		ADCQ acc6, acc6;\
  1656		ADCQ acc7, acc7;\
  1657		ADCQ $0, mul0;\
  1658		MOVQ acc4, t0;\
  1659		MOVQ acc5, t1;\
  1660		MOVQ acc6, t2;\
  1661		MOVQ acc7, t3;\
  1662		SUBQ $-1, t0;\
  1663		SBBQ p256const0<>(SB), t1;\
  1664		SBBQ $0, t2;\
  1665		SBBQ p256const1<>(SB), t3;\
  1666		SBBQ $0, mul0;\
  1667		CMOVQCS acc4, t0;\
  1668		CMOVQCS acc5, t1;\
  1669		CMOVQCS acc6, t2;\
  1670		CMOVQCS acc7, t3;
  1671	/* ---------------------------------------*/
      	// p256AddInline adds (t0..t3) to (acc4..acc7) and conditionally subtracts p
      	// once.
      	//   In:       acc4..acc7 = a, t0..t3 = b
      	//   Out:      t0..t3 = a + b reduced by at most one subtraction of p
      	//   Clobbers: mul0 (AX), flags. acc4..acc7 are left holding the low 256 bits
      	//             of the unreduced sum, not the reduced result.
      	// mul0 catches the carry out of the 256-bit addition. If subtracting p from
      	// the 257-bit value borrows, the unreduced sum is copied back into t0..t3.
  1672	#define p256AddInline \
  1673		XORQ mul0, mul0;\
  1674		ADDQ t0, acc4;\
  1675		ADCQ t1, acc5;\
  1676		ADCQ t2, acc6;\
  1677		ADCQ t3, acc7;\
  1678		ADCQ $0, mul0;\
  1679		MOVQ acc4, t0;\
  1680		MOVQ acc5, t1;\
  1681		MOVQ acc6, t2;\
  1682		MOVQ acc7, t3;\
  1683		SUBQ $-1, t0;\
  1684		SBBQ p256const0<>(SB), t1;\
  1685		SBBQ $0, t2;\
  1686		SBBQ p256const1<>(SB), t3;\
  1687		SBBQ $0, mul0;\
  1688		CMOVQCS acc4, t0;\
  1689		CMOVQCS acc5, t1;\
  1690		CMOVQCS acc6, t2;\
  1691		CMOVQCS acc7, t3;
  1692	/* ---------------------------------------*/
      	// Helpers that move a four-limb value between a 32-byte slot and registers.
      	// src/dst is the name of a one-argument macro mapping a byte offset to an
      	// operand, such as the stack-slot macros defined before each point routine.
      	// LDacc: slot -> acc4..acc7
  1693	#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
      	// LDt: slot -> t0..t3
  1694	#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
      	// ST: acc4..acc7 -> slot
  1695	#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
      	// STt: t0..t3 -> slot
  1696	#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
      	// acc2t: copy acc4..acc7 into t0..t3
  1697	#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
      	// t2acc: copy t0..t3 into acc4..acc7
  1698	#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1699	/* ---------------------------------------*/
      	// Stack frame layout for p256PointAddAffineAsm: fifteen 32-byte slots
      	// followed by the saved result pointer and the saved sel/zero words.
      	// Inputs copied from in1 (x1, y1, z1) and in2 (x2, y2):
  1700	#define x1in(off) (32*0 + off)(SP)
  1701	#define y1in(off) (32*1 + off)(SP)
  1702	#define z1in(off) (32*2 + off)(SP)
  1703	#define x2in(off) (32*3 + off)(SP)
  1704	#define y2in(off) (32*4 + off)(SP)
      	// Computed output coordinates:
  1705	#define xout(off) (32*5 + off)(SP)
  1706	#define yout(off) (32*6 + off)(SP)
  1707	#define zout(off) (32*7 + off)(SP)
      	// Intermediate values. The s2 and h slots are reused later in the routine.
  1708	#define s2(off)   (32*8 + off)(SP)
  1709	#define z1sqr(off) (32*9 + off)(SP)
  1710	#define h(off)	  (32*10 + off)(SP)
  1711	#define r(off)	  (32*11 + off)(SP)
  1712	#define hsqr(off) (32*12 + off)(SP)
  1713	#define rsqr(off) (32*13 + off)(SP)
  1714	#define hcub(off) (32*14 + off)(SP)
      	// Saved res pointer (8 bytes), then sel and zero (4 bytes each).
  1715	#define rptr	  (32*15)(SP)
  1716	#define sel_save  (32*15 + 8)(SP)
  1717	#define zero_save (32*15 + 8 + 4)(SP)
  1718	
      	// p256PointAddAffineAsm adds the point in1 = (x1, y1, z1) and the point
      	// in2 = (x2, y2), which has no z coordinate, and writes three coordinates
      	// (96 bytes) to res.
      	//   sign: if non-zero, y2 is replaced by p - y2 before the addition.
      	//   sel:  if its low 32 bits are zero, the output is in1 unchanged.
      	//   zero: if its low 32 bits are zero, the output is (x2, y2, p256one),
      	//         where y2 is the possibly negated value. This selection is applied
      	//         after the sel selection.
      	// Both selections are done with SSE masks, without branches. All reads of
      	// in1 and in2 happen before res is written.
  1719	// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
  1720	TEXT ·p256PointAddAffineAsm(SB),0,$512-96
  1721		// Move input to stack in order to free registers
  1722		MOVQ res+0(FP), AX
  1723		MOVQ in1+24(FP), BX
  1724		MOVQ in2+48(FP), CX
  1725		MOVQ sign+72(FP), DX
  1726		MOVQ sel+80(FP), t1
  1727		MOVQ zero+88(FP), t2
  1728	
  1729		MOVOU (16*0)(BX), X0
  1730		MOVOU (16*1)(BX), X1
  1731		MOVOU (16*2)(BX), X2
  1732		MOVOU (16*3)(BX), X3
  1733		MOVOU (16*4)(BX), X4
  1734		MOVOU (16*5)(BX), X5
  1735	
  1736		MOVOU X0, x1in(16*0)
  1737		MOVOU X1, x1in(16*1)
  1738		MOVOU X2, y1in(16*0)
  1739		MOVOU X3, y1in(16*1)
  1740		MOVOU X4, z1in(16*0)
  1741		MOVOU X5, z1in(16*1)
  1742	
  1743		MOVOU (16*0)(CX), X0
  1744		MOVOU (16*1)(CX), X1
  1745	
  1746		MOVOU X0, x2in(16*0)
  1747		MOVOU X1, x2in(16*1)
  1748		// Store pointer to result
      		// mul0 is AX, which still holds res here. Only the low 32 bits of sel
      		// and zero are saved.
  1749		MOVQ mul0, rptr
  1750		MOVL t1, sel_save
  1751		MOVL t2, zero_save
  1752		// Negate y2in based on sign
      		// y2 is loaded through CX before acc1 is written: acc1 aliases CX.
  1753		MOVQ (16*2 + 8*0)(CX), acc4
  1754		MOVQ (16*2 + 8*1)(CX), acc5
  1755		MOVQ (16*2 + 8*2)(CX), acc6
  1756		MOVQ (16*2 + 8*3)(CX), acc7
      		// acc0..acc3 = p
  1757		MOVQ $-1, acc0
  1758		MOVQ p256const0<>(SB), acc1
  1759		MOVQ $0, acc2
  1760		MOVQ p256const1<>(SB), acc3
  1761		XORQ mul0, mul0
  1762		// Speculatively subtract
  1763		SUBQ acc4, acc0
  1764		SBBQ acc5, acc1
  1765		SBBQ acc6, acc2
  1766		SBBQ acc7, acc3
  1767		SBBQ $0, mul0
  1768		MOVQ acc0, t0
  1769		MOVQ acc1, t1
  1770		MOVQ acc2, t2
  1771		MOVQ acc3, t3
  1772		// Add in case the operand was > p256
  1773		ADDQ $-1, acc0
  1774		ADCQ p256const0<>(SB), acc1
  1775		ADCQ $0, acc2
  1776		ADCQ p256const1<>(SB), acc3
  1777		ADCQ $0, mul0
      		// mul0 is zero when the borrow and the carry cancel, or when neither
      		// occurred. Otherwise p - y2 (saved in t0..t3) is selected.
  1778		CMOVQNE t0, acc0
  1779		CMOVQNE t1, acc1
  1780		CMOVQNE t2, acc2
  1781		CMOVQNE t3, acc3
  1782		// If condition is 0, keep original value
      		// DX still holds sign: no MULQ has run since it was loaded.
  1783		TESTQ DX, DX
  1784		CMOVQEQ acc4, acc0
  1785		CMOVQEQ acc5, acc1
  1786		CMOVQEQ acc6, acc2
  1787		CMOVQEQ acc7, acc3
  1788		// Store result
  1789		MOVQ acc0, y2in(8*0)
  1790		MOVQ acc1, y2in(8*1)
  1791		MOVQ acc2, y2in(8*2)
  1792		MOVQ acc3, y2in(8*3)
  1793		// Begin point add
  1794		LDacc (z1in)
  1795		CALL p256SqrInternal(SB)	// z1ˆ2
  1796		ST (z1sqr)
  1797	
  1798		LDt (x2in)
  1799		CALL p256MulInternal(SB)	// x2 * z1ˆ2
  1800	
  1801		LDt (x1in)
  1802		CALL p256SubInternal(SB)	// h = u2 - u1
  1803		ST (h)
  1804	
  1805		LDt (z1in)
  1806		CALL p256MulInternal(SB)	// z3 = h * z1
  1807		ST (zout)
  1808	
      		// t0..t3 still hold z1: p256MulInternal does not write them.
  1809		LDacc (z1sqr)
  1810		CALL p256MulInternal(SB)	// z1ˆ3
  1811	
  1812		LDt (y2in)
  1813		CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
  1814		ST (s2)
  1815	
  1816		LDt (y1in)
  1817		CALL p256SubInternal(SB)	// r = s2 - s1
  1818		ST (r)
  1819	
      		// acc4..acc7 still hold r.
  1820		CALL p256SqrInternal(SB)	// rsqr = rˆ2
  1821		ST (rsqr)
  1822	
  1823		LDacc (h)
  1824		CALL p256SqrInternal(SB)	// hsqr = hˆ2
  1825		ST (hsqr)
  1826	
  1827		LDt (h)
  1828		CALL p256MulInternal(SB)	// hcub = hˆ3
  1829		ST (hcub)
  1830	
      		// The s2 slot is reused for y1 * h^3.
  1831		LDt (y1in)
  1832		CALL p256MulInternal(SB)	// y1 * hˆ3
  1833		ST (s2)
  1834	
      		// The h slot is reused for u1 * h^2.
  1835		LDacc (x1in)
  1836		LDt (hsqr)
  1837		CALL p256MulInternal(SB)	// u1 * hˆ2
  1838		ST (h)
  1839	
      		// The doubled value is produced in t0..t3.
  1840		p256MulBy2Inline			// u1 * hˆ2 * 2, inline
  1841		LDacc (rsqr)
  1842		CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  1843	
      		// x3 = r^2 - 2*u1*h^2 - h^3
  1844		LDt (hcub)
  1845		CALL p256SubInternal(SB)
  1846		ST (xout)
  1847	
      		// y3 = r * (u1*h^2 - x3) - y1*h^3
  1848		MOVQ acc4, t0
  1849		MOVQ acc5, t1
  1850		MOVQ acc6, t2
  1851		MOVQ acc7, t3
  1852		LDacc (h)
  1853		CALL p256SubInternal(SB)
  1854	
  1855		LDt (r)
  1856		CALL p256MulInternal(SB)
  1857	
  1858		LDt (s2)
  1859		CALL p256SubInternal(SB)
  1860		ST (yout)
  1861		// Load stored values from stack
  1862		MOVQ rptr, AX
  1863		MOVL sel_save, BX
  1864		MOVL zero_save, CX
  1865		// The result is not valid if (sel == 0), conditional choose
  1866		MOVOU xout(16*0), X0
  1867		MOVOU xout(16*1), X1
  1868		MOVOU yout(16*0), X2
  1869		MOVOU yout(16*1), X3
  1870		MOVOU zout(16*0), X4
  1871		MOVOU zout(16*1), X5
  1872	
  1873		MOVL BX, X6
  1874		MOVL CX, X7
  1875	
      		// X8 = all zeros, X9 = all ones.
  1876		PXOR X8, X8
  1877		PCMPEQL X9, X9
  1878	
      		// Broadcast sel and zero to every 32-bit lane.
  1879		PSHUFD $0, X6, X6
  1880		PSHUFD $0, X7, X7
  1881	
      		// X6 = all ones if sel == 0, X7 = all ones if zero == 0, else all zeros.
  1882		PCMPEQL X8, X6
  1883		PCMPEQL X8, X7
  1884	
      		// X15 = NOT X6
  1885		MOVOU X6, X15
  1886		PANDN X9, X15
  1887	
  1888		MOVOU x1in(16*0), X9
  1889		MOVOU x1in(16*1), X10
  1890		MOVOU y1in(16*0), X11
  1891		MOVOU y1in(16*1), X12
  1892		MOVOU z1in(16*0), X13
  1893		MOVOU z1in(16*1), X14
  1894	
      		// out = (out AND NOT X6) XOR (in1 AND X6)
  1895		PAND X15, X0
  1896		PAND X15, X1
  1897		PAND X15, X2
  1898		PAND X15, X3
  1899		PAND X15, X4
  1900		PAND X15, X5
  1901	
  1902		PAND X6, X9
  1903		PAND X6, X10
  1904		PAND X6, X11
  1905		PAND X6, X12
  1906		PAND X6, X13
  1907		PAND X6, X14
  1908	
  1909		PXOR X9, X0
  1910		PXOR X10, X1
  1911		PXOR X11, X2
  1912		PXOR X12, X3
  1913		PXOR X13, X4
  1914		PXOR X14, X5
  1915		// Similarly if zero == 0
      		// X9 was overwritten above, so it is reset to all ones. X15 = NOT X7.
  1916		PCMPEQL X9, X9
  1917		MOVOU X7, X15
  1918		PANDN X9, X15
  1919	
      		// The replacement point is (x2, y2, p256one).
  1920		MOVOU x2in(16*0), X9
  1921		MOVOU x2in(16*1), X10
  1922		MOVOU y2in(16*0), X11
  1923		MOVOU y2in(16*1), X12
  1924		MOVOU p256one<>+0x00(SB), X13
  1925		MOVOU p256one<>+0x10(SB), X14
  1926	
  1927		PAND X15, X0
  1928		PAND X15, X1
  1929		PAND X15, X2
  1930		PAND X15, X3
  1931		PAND X15, X4
  1932		PAND X15, X5
  1933	
  1934		PAND X7, X9
  1935		PAND X7, X10
  1936		PAND X7, X11
  1937		PAND X7, X12
  1938		PAND X7, X13
  1939		PAND X7, X14
  1940	
  1941		PXOR X9, X0
  1942		PXOR X10, X1
  1943		PXOR X11, X2
  1944		PXOR X12, X3
  1945		PXOR X13, X4
  1946		PXOR X14, X5
  1947		// Finally output the result
  1948		MOVOU X0, (16*0)(AX)
  1949		MOVOU X1, (16*1)(AX)
  1950		MOVOU X2, (16*2)(AX)
  1951		MOVOU X3, (16*3)(AX)
  1952		MOVOU X4, (16*4)(AX)
  1953		MOVOU X5, (16*5)(AX)
      		// Clear the saved result pointer in the frame. The reason is not visible
      		// here - presumably so no stale pointer is left on the stack. TODO confirm.
  1954		MOVQ $0, rptr
  1955	
  1956		RET
  1957	#undef x1in
  1958	#undef y1in
  1959	#undef z1in
  1960	#undef x2in
  1961	#undef y2in
  1962	#undef xout
  1963	#undef yout
  1964	#undef zout
  1965	#undef s2
  1966	#undef z1sqr
  1967	#undef h
  1968	#undef r
  1969	#undef hsqr
  1970	#undef rsqr
  1971	#undef hcub
  1972	#undef rptr
  1973	#undef sel_save
  1974	#undef zero_save
  1975	/* ---------------------------------------*/
      	// Stack frame layout for p256PointAddAsm: twenty 32-byte slots followed by
      	// the saved result pointer.
      	// Inputs copied from in1 (x1, y1, z1) and in2 (x2, y2, z2):
  1976	#define x1in(off) (32*0 + off)(SP)
  1977	#define y1in(off) (32*1 + off)(SP)
  1978	#define z1in(off) (32*2 + off)(SP)
  1979	#define x2in(off) (32*3 + off)(SP)
  1980	#define y2in(off) (32*4 + off)(SP)
  1981	#define z2in(off) (32*5 + off)(SP)
  1982	
      	// Computed output coordinates:
  1983	#define xout(off) (32*6 + off)(SP)
  1984	#define yout(off) (32*7 + off)(SP)
  1985	#define zout(off) (32*8 + off)(SP)
  1986	
      	// Intermediate values. The s2 and u2 slots are reused later in the routine.
  1987	#define u1(off)    (32*9 + off)(SP)
  1988	#define u2(off)    (32*10 + off)(SP)
  1989	#define s1(off)    (32*11 + off)(SP)
  1990	#define s2(off)    (32*12 + off)(SP)
  1991	#define z1sqr(off) (32*13 + off)(SP)
  1992	#define z2sqr(off) (32*14 + off)(SP)
  1993	#define h(off)     (32*15 + off)(SP)
  1994	#define r(off)     (32*16 + off)(SP)
  1995	#define hsqr(off)  (32*17 + off)(SP)
  1996	#define rsqr(off)  (32*18 + off)(SP)
  1997	#define hcub(off)  (32*19 + off)(SP)
  1998	#define rptr       (32*20)(SP)
  1999	
      	// p256PointAddAsm adds the points in1 = (x1, y1, z1) and in2 = (x2, y2, z2)
      	// and writes three coordinates (96 bytes) to res. Both inputs are copied to
      	// the stack first, and res is written only at the end.
      	// NOTE(review): no special handling of h == 0 or r == 0 (equal or opposite
      	// inputs) or of a zero z coordinate is visible in this routine. Presumably
      	// the caller deals with those cases - confirm.
  2000	//func p256PointAddAsm(res, in1, in2 []uint64)
  2001	TEXT ·p256PointAddAsm(SB),0,$672-72
  2002		// Move input to stack in order to free registers
  2003		MOVQ res+0(FP), AX
  2004		MOVQ in1+24(FP), BX
  2005		MOVQ in2+48(FP), CX
  2006	
  2007		MOVOU (16*0)(BX), X0
  2008		MOVOU (16*1)(BX), X1
  2009		MOVOU (16*2)(BX), X2
  2010		MOVOU (16*3)(BX), X3
  2011		MOVOU (16*4)(BX), X4
  2012		MOVOU (16*5)(BX), X5
  2013	
  2014		MOVOU X0, x1in(16*0)
  2015		MOVOU X1, x1in(16*1)
  2016		MOVOU X2, y1in(16*0)
  2017		MOVOU X3, y1in(16*1)
  2018		MOVOU X4, z1in(16*0)
  2019		MOVOU X5, z1in(16*1)
  2020	
  2021		MOVOU (16*0)(CX), X0
  2022		MOVOU (16*1)(CX), X1
  2023		MOVOU (16*2)(CX), X2
  2024		MOVOU (16*3)(CX), X3
  2025		MOVOU (16*4)(CX), X4
  2026		MOVOU (16*5)(CX), X5
  2027	
  2028		MOVOU X0, x2in(16*0)
  2029		MOVOU X1, x2in(16*1)
  2030		MOVOU X2, y2in(16*0)
  2031		MOVOU X3, y2in(16*1)
  2032		MOVOU X4, z2in(16*0)
  2033		MOVOU X5, z2in(16*1)
  2034		// Store pointer to result
  2035		MOVQ AX, rptr
  2036		// Begin point add
  2037		LDacc (z2in)
  2038		CALL p256SqrInternal(SB)	// z2ˆ2
  2039		ST (z2sqr)
  2040		LDt (z2in)
  2041		CALL p256MulInternal(SB)	// z2ˆ3
  2042		LDt (y1in)
  2043		CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
  2044		ST (s1)
  2045	
  2046		LDacc (z1in)
  2047		CALL p256SqrInternal(SB)	// z1ˆ2
  2048		ST (z1sqr)
  2049		LDt (z1in)
  2050		CALL p256MulInternal(SB)	// z1ˆ3
  2051		LDt (y2in)
  2052		CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
  2053		ST (s2)
  2054	
      		// acc4..acc7 still hold s2.
  2055		LDt (s1)
  2056		CALL p256SubInternal(SB)	// r = s2 - s1
  2057		ST (r)
  2058	
  2059		LDacc (z2sqr)
  2060		LDt (x1in)
  2061		CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
  2062		ST (u1)
  2063		LDacc (z1sqr)
  2064		LDt (x2in)
  2065		CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
  2066		ST (u2)
  2067	
  2068		LDt (u1)
  2069		CALL p256SubInternal(SB)	// h = u2 - u1
  2070		ST (h)
  2071	
  2072		LDacc (r)
  2073		CALL p256SqrInternal(SB)	// rsqr = rˆ2
  2074		ST (rsqr)
  2075	
  2076		LDacc (h)
  2077		CALL p256SqrInternal(SB)	// hsqr = hˆ2
  2078		ST (hsqr)
  2079	
  2080		LDt (h)
  2081		CALL p256MulInternal(SB)	// hcub = hˆ3
  2082		ST (hcub)
  2083	
      		// The s2 slot is reused for s1 * h^3.
  2084		LDt (s1)
  2085		CALL p256MulInternal(SB)
  2086		ST (s2)
  2087	
  2088		LDacc (z1in)
  2089		LDt (z2in)
  2090		CALL p256MulInternal(SB)	// z1 * z2
  2091		LDt (h)
  2092		CALL p256MulInternal(SB)	// z1 * z2 * h
  2093		ST (zout)
  2094	
      		// The u2 slot is reused for u1 * h^2.
  2095		LDacc (hsqr)
  2096		LDt (u1)
  2097		CALL p256MulInternal(SB)	// hˆ2 * u1
  2098		ST (u2)
  2099	
      		// The doubled value is produced in t0..t3.
  2100		p256MulBy2Inline	// u1 * hˆ2 * 2, inline
  2101		LDacc (rsqr)
  2102		CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  2103	
      		// x3 = r^2 - 2*u1*h^2 - h^3
  2104		LDt (hcub)
  2105		CALL p256SubInternal(SB)
  2106		ST (xout)
  2107	
      		// y3 = r * (u1*h^2 - x3) - s1*h^3
  2108		MOVQ acc4, t0
  2109		MOVQ acc5, t1
  2110		MOVQ acc6, t2
  2111		MOVQ acc7, t3
  2112		LDacc (u2)
  2113		CALL p256SubInternal(SB)
  2114	
  2115		LDt (r)
  2116		CALL p256MulInternal(SB)
  2117	
  2118		LDt (s2)
  2119		CALL p256SubInternal(SB)
  2120		ST (yout)
  2121	
  2122		MOVOU xout(16*0), X0
  2123		MOVOU xout(16*1), X1
  2124		MOVOU yout(16*0), X2
  2125		MOVOU yout(16*1), X3
  2126		MOVOU zout(16*0), X4
  2127		MOVOU zout(16*1), X5
  2128		// Finally output the result
  2129		MOVQ rptr, AX
      		// Clear the saved result pointer in the frame. The reason is not visible
      		// here - presumably so no stale pointer is left on the stack. TODO confirm.
  2130		MOVQ $0, rptr
  2131		MOVOU X0, (16*0)(AX)
  2132		MOVOU X1, (16*1)(AX)
  2133		MOVOU X2, (16*2)(AX)
  2134		MOVOU X3, (16*3)(AX)
  2135		MOVOU X4, (16*4)(AX)
  2136		MOVOU X5, (16*5)(AX)
  2137	
  2138		RET
  2139	#undef x1in
  2140	#undef y1in
  2141	#undef z1in
  2142	#undef x2in
  2143	#undef y2in
  2144	#undef z2in
  2145	#undef xout
  2146	#undef yout
  2147	#undef zout
  2148	#undef s1
  2149	#undef s2
  2150	#undef u1
  2151	#undef u2
  2152	#undef z1sqr
  2153	#undef z2sqr
  2154	#undef h
  2155	#undef r
  2156	#undef hsqr
  2157	#undef rsqr
  2158	#undef hcub
  2159	#undef rptr
  2160	/* ---------------------------------------*/
      	// Stack frame layout for p256PointDoubleAsm: seven 32-byte slots followed by
      	// the saved result pointer.
      	// Input coordinates copied from in. The y slot is later reused.
  2161	#define x(off) (32*0 + off)(SP)
  2162	#define y(off) (32*1 + off)(SP)
  2163	#define z(off) (32*2 + off)(SP)
  2164	
      	// Intermediate values.
  2165	#define s(off)	(32*3 + off)(SP)
  2166	#define m(off)	(32*4 + off)(SP)
  2167	#define zsqr(off) (32*5 + off)(SP)
  2168	#define tmp(off)  (32*6 + off)(SP)
  2169	#define rptr	  (32*7)(SP)
  2170	
      	// p256PointDoubleAsm doubles the point in = (x, y, z) and writes three
      	// coordinates (96 bytes) to res. The input is copied to the stack before
      	// res is written, and every later read uses the stack copy. The output is
      	// written in the order z, x, y.
      	// Annotations below use the formula value and ignore the 2^-256 factor that
      	// p256MulInternal and p256SqrInternal introduce.
  2171	//func p256PointDoubleAsm(res, in []uint64)
  2172	TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
  2173		// Move input to stack in order to free registers
  2174		MOVQ res+0(FP), AX
  2175		MOVQ in+24(FP), BX
  2176	
  2177		MOVOU (16*0)(BX), X0
  2178		MOVOU (16*1)(BX), X1
  2179		MOVOU (16*2)(BX), X2
  2180		MOVOU (16*3)(BX), X3
  2181		MOVOU (16*4)(BX), X4
  2182		MOVOU (16*5)(BX), X5
  2183	
  2184		MOVOU X0, x(16*0)
  2185		MOVOU X1, x(16*1)
  2186		MOVOU X2, y(16*0)
  2187		MOVOU X3, y(16*1)
  2188		MOVOU X4, z(16*0)
  2189		MOVOU X5, z(16*1)
  2190		// Store pointer to result
  2191		MOVQ AX, rptr
  2192		// Begin point double
      		// zsqr = z^2
  2193		LDacc (z)
  2194		CALL p256SqrInternal(SB)
  2195		ST (zsqr)
  2196	
      		// m = x + z^2 (p256AddInline leaves its result in t0..t3)
  2197		LDt (x)
  2198		p256AddInline
  2199		STt (m)
  2200	
      		// z3 = 2 * y * z, taken from t0..t3 and written straight to res
  2201		LDacc (z)
  2202		LDt (y)
  2203		CALL p256MulInternal(SB)
  2204		p256MulBy2Inline
  2205		MOVQ rptr, AX
  2206		// Store z
  2207		MOVQ t0, (16*4 + 8*0)(AX)
  2208		MOVQ t1, (16*4 + 8*1)(AX)
  2209		MOVQ t2, (16*4 + 8*2)(AX)
  2210		MOVQ t3, (16*4 + 8*3)(AX)
  2211	
      		// m = (x - z^2) * (x + z^2)
  2212		LDacc (x)
  2213		LDt (zsqr)
  2214		CALL p256SubInternal(SB)
  2215		LDt (m)
  2216		CALL p256MulInternal(SB)
  2217		ST (m)
  2218		// Multiply by 3
      		// t = 2*m, then t = m + 2*m, stored back into m.
  2219		p256MulBy2Inline
  2220		LDacc (m)
  2221		p256AddInline
  2222		STt (m)
  2223		////////////////////////
      		// s = (2*y)^2, then acc = s^2 = 16 * y^4
  2224		LDacc (y)
  2225		p256MulBy2Inline
  2226		t2acc
  2227		CALL p256SqrInternal(SB)
  2228		ST (s)
  2229		CALL p256SqrInternal(SB)
  2230		// Divide by 2
      		// Halve modulo p: if the value is odd, add p first so the sum is even,
      		// then shift the (up to 257-bit) value right by one bit.
  2231		XORQ mul0, mul0
  2232		MOVQ acc4, t0
  2233		MOVQ acc5, t1
  2234		MOVQ acc6, t2
  2235		MOVQ acc7, t3
  2236	
  2237		ADDQ $-1, acc4
  2238		ADCQ p256const0<>(SB), acc5
  2239		ADCQ $0, acc6
  2240		ADCQ p256const1<>(SB), acc7
  2241		ADCQ $0, mul0
      		// ZF is set when the original value is even.
  2242		TESTQ $1, t0
  2243	
      		// Even: discard the speculative addition.
  2244		CMOVQEQ t0, acc4
  2245		CMOVQEQ t1, acc5
  2246		CMOVQEQ t2, acc6
  2247		CMOVQEQ t3, acc7
      		// Keep the carry bit only if the value was odd (bit 0 of t0 set).
  2248		ANDQ t0, mul0
  2249	
      		// Double-register shifts: each limb takes its new top bit from the next
      		// higher limb, and acc7 takes it from the carry in mul0.
  2250		SHRQ $1, acc4:acc5
  2251		SHRQ $1, acc5:acc6
  2252		SHRQ $1, acc6:acc7
  2253		SHRQ $1, acc7:mul0
      		// The y slot is reused for 8 * y^4.
  2254		ST (y)
  2255		/////////////////////////
      		// s = x * (2*y)^2 = 4 * x * y^2, tmp = 2 * s
  2256		LDacc (x)
  2257		LDt (s)
  2258		CALL p256MulInternal(SB)
  2259		ST (s)
  2260		p256MulBy2Inline
  2261		STt (tmp)
  2262	
      		// x3 = m^2 - 2*s
  2263		LDacc (m)
  2264		CALL p256SqrInternal(SB)
  2265		LDt (tmp)
  2266		CALL p256SubInternal(SB)
  2267	
  2268		MOVQ rptr, AX
  2269		// Store x
  2270		MOVQ acc4, (16*0 + 8*0)(AX)
  2271		MOVQ acc5, (16*0 + 8*1)(AX)
  2272		MOVQ acc6, (16*0 + 8*2)(AX)
  2273		MOVQ acc7, (16*0 + 8*3)(AX)
  2274	
      		// y3 = m * (s - x3) - 8 * y^4
  2275		acc2t
  2276		LDacc (s)
  2277		CALL p256SubInternal(SB)
  2278	
  2279		LDt (m)
  2280		CALL p256MulInternal(SB)
  2281	
  2282		LDt (y)
  2283		CALL p256SubInternal(SB)
  2284		MOVQ rptr, AX
  2285		// Store y
  2286		MOVQ acc4, (16*2 + 8*0)(AX)
  2287		MOVQ acc5, (16*2 + 8*1)(AX)
  2288		MOVQ acc6, (16*2 + 8*2)(AX)
  2289		MOVQ acc7, (16*2 + 8*3)(AX)
  2290		///////////////////////
      		// Clear the saved result pointer in the frame. The reason is not visible
      		// here - presumably so no stale pointer is left on the stack. TODO confirm.
  2291		MOVQ $0, rptr
  2292	
  2293		RET
  2294	/* ---------------------------------------*/
  2295	

View as plain text