...
Run Format

Text file src/crypto/elliptic/p256_asm_s390x.s

Documentation: crypto/elliptic

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f // 32-bit word loaded into K0 by p256OrdMul; presumably -ord^-1 mod 2^32 (TODO confirm)
     8	DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 // P-256 group order, most significant doubleword first
     9	DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff // P-256 group order
    10	DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 // P-256 group order
    11	DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 // P-256 group order
      	// p256<>: the prime P256 followed by three 16-byte byte-selection masks.
      	// p256FromMont loads P256 from offsets 0x00/0x10 and uses the masks at
      	// 0x30 and 0x40 as VPERM selectors.
    12	DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    13	DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    14	DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    15	DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    16	DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    17	DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    18	DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    19	DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    20	DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    21	DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
      	// p256mul<>: the prime P256, six 16-byte byte-selection masks, and the
      	// value (1*2^256)%P256. p256NegCond reads only the P256 value at offsets
      	// 0x00-0x1f; the remaining entries are used by code outside this excerpt.
    22	DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
    23	DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    24	DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
    25	DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    26	DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    27	DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    28	DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    29	DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    30	DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    31	DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    32	DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    33	DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    34	DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    35	DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    36	DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    37	DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    38	DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    39	DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    40	DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    41	DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
      	// All four symbols are declared with flag 8 (RODATA in textflag.h); the
      	// last operand is the symbol size in bytes.
    42	GLOBL p256ordK0<>(SB), 8, $4
    43	GLOBL p256ord<>(SB), 8, $32
    44	GLOBL p256<>(SB), 8, $80
    45	GLOBL p256mul<>(SB), 8, $160
    46	
    47	// func hasVectorFacility() bool
      	//
      	// Reports whether the vector instructions may be used. The facility list is
      	// stored with STFLE into a zeroed 24-byte buffer in the local frame and the
      	// 0x40 bit of the byte at offset 16 (facility bit 129, the vector facility)
      	// is tested. If it is set, a byte is additionally written to and read back
      	// from V16 before true is returned.
    48	TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
    49		MOVD  $x-24(SP), R1     // R1 = address of the 24-byte frame buffer
    50		XC    $24, 0(R1), 0(R1) // clear the storage
    51		MOVD  $2, R0            // R0 is the number of double words stored -1
    52		WORD  $0xB2B01000       // STFLE 0(R1)
    53		XOR   R0, R0            // reset the value of R0
    54		MOVBZ z-8(SP), R1       // R1 = byte at offset 16 of the buffer (facility bits 128-135)
    55		AND   $0x40, R1         // isolate facility bit 129
    56		BEQ   novector          // bit clear: report false
    57	
    58	vectorinstalled:
    59		// check if the vector instruction has been enabled
    60		VLEIB  $0, $0xF, V16
    61		VLGVB  $0, V16, R1
    62		CMPBNE R1, $0xF, novector // value read back from V16 must match what was written
    63		MOVB   $1, ret+0(FP) // have vx
    64		RET
    65	
    66	novector:
    67		MOVB $0, ret+0(FP)   // no vx
    68		RET
    69	
    70	// ---------------------------------------
    71	// iff cond == 1  val <- -val
      	// (as implemented, any non-zero cond selects the negated value, because the
      	// select mask is built by comparing cond with zero)
    72	// func p256NegCond(val *p256Point, cond int)
      	//
      	// Only the 32 bytes at offsets 32-63 of val are read and written (named Y1
      	// below). The negation is computed as P256 - Y1, with P256 taken from
      	// p256mul<>, and the original or negated value is chosen with VSEL rather
      	// than with a branch on cond.
    73	#define P1ptr   R1
    74	#define CPOOL   R4
    75	
    76	#define Y1L   V0
    77	#define Y1H   V1
    78	#define T1L   V2
    79	#define T1H   V3
    80	
    81	#define PL    V30
    82	#define PH    V31
    83	
    84	#define ZER   V4
    85	#define SEL1  V5
    86	#define CAR1  V6
    87	TEXT ·p256NegCond(SB), NOSPLIT, $0
    88		MOVD val+0(FP), P1ptr
    89	
    90		MOVD $p256mul<>+0x00(SB), CPOOL
    91		VL   16(CPOOL), PL // low 16 bytes of P256
    92		VL   0(CPOOL), PH  // high 16 bytes of P256
    93	
    94		VL 32(P1ptr), Y1H
    95		VL 48(P1ptr), Y1L
    96	
    97		VLREPG cond+8(FP), SEL1 // both doublewords of SEL1 = cond
    98		VZERO  ZER
    99		VCEQG  SEL1, ZER, SEL1  // SEL1 = all ones if cond == 0, otherwise all zeros
   100	
      		// T1H||T1L = P256 - Y1, computed as a 256-bit subtraction with borrow.
   101		VSCBIQ Y1L, PL, CAR1
   102		VSQ    Y1L, PL, T1L
   103		VSBIQ  PH, Y1H, CAR1, T1H
   104	
      		// Keep Y1 where the mask is set (cond == 0), otherwise take P256 - Y1.
   105		VSEL Y1L, T1L, SEL1, Y1L
   106		VSEL Y1H, T1H, SEL1, Y1H
   107	
   108		VST Y1H, 32(P1ptr)
   109		VST Y1L, 48(P1ptr)
   110		RET
   111	
   112	#undef P1ptr
   113	#undef CPOOL
   114	#undef Y1L
   115	#undef Y1H
   116	#undef T1L
   117	#undef T1H
   118	#undef PL
   119	#undef PH
   120	#undef ZER
   121	#undef SEL1
   122	#undef CAR1
   123	
   124	// ---------------------------------------
   125	// if cond == 0 res <- b; else res <- a
   126	// func p256MovCond(res, a, b *p256Point, cond int)
      	//
      	// Writes 96 bytes (six 16-byte vectors) to res. Both a and b are always
      	// loaded in full; the choice between them is made with VSEL under a mask
      	// derived from cond, so there is no branch on cond.
   127	#define P3ptr   R1
   128	#define P1ptr   R2
   129	#define P2ptr   R3
   130	
   131	#define X1L    V0
   132	#define X1H    V1
   133	#define Y1L    V2
   134	#define Y1H    V3
   135	#define Z1L    V4
   136	#define Z1H    V5
   137	#define X2L    V6
   138	#define X2H    V7
   139	#define Y2L    V8
   140	#define Y2H    V9
   141	#define Z2L    V10
   142	#define Z2H    V11
   143	
   144	#define ZER   V18
   145	#define SEL1  V19
   146	TEXT ·p256MovCond(SB), NOSPLIT, $0
   147		MOVD   res+0(FP), P3ptr
   148		MOVD   a+8(FP), P1ptr
   149		MOVD   b+16(FP), P2ptr
   150		VLREPG cond+24(FP), SEL1 // both doublewords of SEL1 = cond
   151		VZERO  ZER
   152		VCEQG  SEL1, ZER, SEL1   // SEL1 = all ones if cond == 0, otherwise all zeros
   153	
      		// X1/Y1/Z1 = the 96 bytes at a
   154		VL 0(P1ptr), X1H
   155		VL 16(P1ptr), X1L
   156		VL 32(P1ptr), Y1H
   157		VL 48(P1ptr), Y1L
   158		VL 64(P1ptr), Z1H
   159		VL 80(P1ptr), Z1L
   160	
      		// X2/Y2/Z2 = the 96 bytes at b
   161		VL 0(P2ptr), X2H
   162		VL 16(P2ptr), X2L
   163		VL 32(P2ptr), Y2H
   164		VL 48(P2ptr), Y2L
   165		VL 64(P2ptr), Z2H
   166		VL 80(P2ptr), Z2L
   167	
      		// Take b's vectors where the mask is set (cond == 0), otherwise keep a's.
   168		VSEL X2L, X1L, SEL1, X1L
   169		VSEL X2H, X1H, SEL1, X1H
   170		VSEL Y2L, Y1L, SEL1, Y1L
   171		VSEL Y2H, Y1H, SEL1, Y1H
   172		VSEL Z2L, Z1L, SEL1, Z1L
   173		VSEL Z2H, Z1H, SEL1, Z1H
   174	
   175		VST X1H, 0(P3ptr)
   176		VST X1L, 16(P3ptr)
   177		VST Y1H, 32(P3ptr)
   178		VST Y1L, 48(P3ptr)
   179		VST Z1H, 64(P3ptr)
   180		VST Z1L, 80(P3ptr)
   181	
   182		RET
   183	
   184	#undef P3ptr
   185	#undef P1ptr
   186	#undef P2ptr
   187	#undef X1L
   188	#undef X1H
   189	#undef Y1L
   190	#undef Y1H
   191	#undef Z1L
   192	#undef Z1H
   193	#undef X2L
   194	#undef X2H
   195	#undef Y2L
   196	#undef Y2H
   197	#undef Z2L
   198	#undef Z2H
   199	#undef ZER
   200	#undef SEL1
   201	
   202	// ---------------------------------------
   203	// Constant time table access
   204	// Indexed from 1 to 16, with -1 offset
   205	// (index 0 is implicitly point at infinity)
   206	// func p256Select(point *p256Point, table []p256Point, idx int)
      	//
      	// The loop always reads 16 consecutive 96-byte entries starting at the
      	// table's data pointer (the slice length is not consulted). Entry i,
      	// counting from 0, is copied into the result when the last byte of idx
      	// equals i+1. If nothing matches, all 96 bytes written to point are zero.
      	// There is no branch or address computation that depends on idx.
   207	#define P3ptr   R1
   208	#define P1ptr   R2
   209	#define COUNT   R4
   210	
   211	#define X1L    V0
   212	#define X1H    V1
   213	#define Y1L    V2
   214	#define Y1H    V3
   215	#define Z1L    V4
   216	#define Z1H    V5
   217	#define X2L    V6
   218	#define X2H    V7
   219	#define Y2L    V8
   220	#define Y2H    V9
   221	#define Z2L    V10
   222	#define Z2H    V11
   223	
   224	#define ONE   V18
   225	#define IDX   V19
   226	#define SEL1  V20
   227	#define SEL2  V21
   228	TEXT ·p256Select(SB), NOSPLIT, $0
   229		MOVD   point+0(FP), P3ptr
   230		MOVD   table+8(FP), P1ptr
   231		VLREPB idx+(32+7)(FP), IDX // every byte of IDX = last byte of the 8-byte idx argument
   232		VREPIB $1, ONE             // every byte of ONE = 1
   233		VREPIB $1, SEL2            // SEL2 = 1-based number of the current entry, in every byte
   234		MOVD   $1, COUNT
   235	
      		// The accumulated result starts as all zeros.
   236		VZERO X1H
   237		VZERO X1L
   238		VZERO Y1H
   239		VZERO Y1L
   240		VZERO Z1H
   241		VZERO Z1L
   242	
   243	loop_select:
   244		VL 0(P1ptr), X2H
   245		VL 16(P1ptr), X2L
   246		VL 32(P1ptr), Y2H
   247		VL 48(P1ptr), Y2L
   248		VL 64(P1ptr), Z2H
   249		VL 80(P1ptr), Z2L
   250	
      		// Doubleword compare; both operands hold one byte value replicated, so
      		// SEL1 is all ones exactly when the entry number equals that byte of idx.
   251		VCEQG SEL2, IDX, SEL1
   252	
   253		VSEL X2L, X1L, SEL1, X1L
   254		VSEL X2H, X1H, SEL1, X1H
   255		VSEL Y2L, Y1L, SEL1, Y1L
   256		VSEL Y2H, Y1H, SEL1, Y1H
   257		VSEL Z2L, Z1L, SEL1, Z1L
   258		VSEL Z2H, Z1H, SEL1, Z1H
   259	
   260		VAB  SEL2, ONE, SEL2 // next entry number
   261		ADDW $1, COUNT
   262		ADD  $96, P1ptr      // advance to the next 96-byte entry
   263		CMPW COUNT, $17
   264		BLT  loop_select     // COUNT runs 1..16: 16 iterations
   265	
   266		VST X1H, 0(P3ptr)
   267		VST X1L, 16(P3ptr)
   268		VST Y1H, 32(P3ptr)
   269		VST Y1L, 48(P3ptr)
   270		VST Z1H, 64(P3ptr)
   271		VST Z1L, 80(P3ptr)
   272		RET
   273	
   274	#undef P3ptr
   275	#undef P1ptr
   276	#undef COUNT
   277	#undef X1L
   278	#undef X1H
   279	#undef Y1L
   280	#undef Y1H
   281	#undef Z1L
   282	#undef Z1H
   283	#undef X2L
   284	#undef X2H
   285	#undef Y2L
   286	#undef Y2H
   287	#undef Z2L
   288	#undef Z2H
   289	#undef ONE
   290	#undef IDX
   291	#undef SEL1
   292	#undef SEL2
   293	
   294	// ---------------------------------------
   295	// Constant time table access
   296	// Indexed from 1 to 64, with -1 offset
   297	// (index 0 is implicitly point at infinity)
   298	// func p256SelectBase(point *p256Point, table []p256Point, idx int)
      	//
      	// Same structure as p256Select, but the loop always reads 64 consecutive
      	// 96-byte entries starting at the table's data pointer (the slice length is
      	// not consulted). Entry i, counting from 0, is copied into the result when
      	// the last byte of idx equals i+1. If nothing matches, all 96 bytes written
      	// to point are zero.
   299	#define P3ptr   R1
   300	#define P1ptr   R2
   301	#define COUNT   R4
   302	
   303	#define X1L    V0
   304	#define X1H    V1
   305	#define Y1L    V2
   306	#define Y1H    V3
   307	#define Z1L    V4
   308	#define Z1H    V5
   309	#define X2L    V6
   310	#define X2H    V7
   311	#define Y2L    V8
   312	#define Y2H    V9
   313	#define Z2L    V10
   314	#define Z2H    V11
   315	
   316	#define ONE   V18
   317	#define IDX   V19
   318	#define SEL1  V20
   319	#define SEL2  V21
   320	TEXT ·p256SelectBase(SB), NOSPLIT, $0
   321		MOVD   point+0(FP), P3ptr
   322		MOVD   table+8(FP), P1ptr
   323		VLREPB idx+(32+7)(FP), IDX // every byte of IDX = last byte of the 8-byte idx argument
   324		VREPIB $1, ONE             // every byte of ONE = 1
   325		VREPIB $1, SEL2            // SEL2 = 1-based number of the current entry, in every byte
   326		MOVD   $1, COUNT
   327	
      		// The accumulated result starts as all zeros.
   328		VZERO X1H
   329		VZERO X1L
   330		VZERO Y1H
   331		VZERO Y1L
   332		VZERO Z1H
   333		VZERO Z1L
   334	
   335	loop_select:
   336		VL 0(P1ptr), X2H
   337		VL 16(P1ptr), X2L
   338		VL 32(P1ptr), Y2H
   339		VL 48(P1ptr), Y2L
   340		VL 64(P1ptr), Z2H
   341		VL 80(P1ptr), Z2L
   342	
      		// Doubleword compare; both operands hold one byte value replicated, so
      		// SEL1 is all ones exactly when the entry number equals that byte of idx.
   343		VCEQG SEL2, IDX, SEL1
   344	
   345		VSEL X2L, X1L, SEL1, X1L
   346		VSEL X2H, X1H, SEL1, X1H
   347		VSEL Y2L, Y1L, SEL1, Y1L
   348		VSEL Y2H, Y1H, SEL1, Y1H
   349		VSEL Z2L, Z1L, SEL1, Z1L
   350		VSEL Z2H, Z1H, SEL1, Z1H
   351	
   352		VAB  SEL2, ONE, SEL2 // next entry number
   353		ADDW $1, COUNT
   354		ADD  $96, P1ptr      // advance to the next 96-byte entry
   355		CMPW COUNT, $65
   356		BLT  loop_select     // COUNT runs 1..64: 64 iterations
   357	
   358		VST X1H, 0(P3ptr)
   359		VST X1L, 16(P3ptr)
   360		VST Y1H, 32(P3ptr)
   361		VST Y1L, 48(P3ptr)
   362		VST Z1H, 64(P3ptr)
   363		VST Z1L, 80(P3ptr)
   364		RET
   365	
   366	#undef P3ptr
   367	#undef P1ptr
   368	#undef COUNT
   369	#undef X1L
   370	#undef X1H
   371	#undef Y1L
   372	#undef Y1H
   373	#undef Z1L
   374	#undef Z1H
   375	#undef X2L
   376	#undef X2H
   377	#undef Y2L
   378	#undef Y2H
   379	#undef Z2L
   380	#undef Z2H
   381	#undef ONE
   382	#undef IDX
   383	#undef SEL1
   384	#undef SEL2
   385	
   386	// ---------------------------------------
   387	// func p256FromMont(res, in []byte)
      	//
      	// Reads 32 bytes from in (T1 = bytes 0-15, T0 = bytes 16-31), runs four
      	// identical reduction rounds followed by one conditional subtraction of
      	// P256, and writes 32 bytes to res. Only the data pointers of the two
      	// slices are used; their lengths are not checked here.
      	// NOTE(review): four rounds that each shift by 64 bits suggest the result is
      	// in * 2^-256 mod P256 (conversion out of the Montgomery domain) - confirm
      	// against the Go caller.
   388	#define res_ptr R1
   389	#define x_ptr   R2
   390	#define CPOOL   R4
   391	
   392	#define T0   V0
   393	#define T1   V1
   394	#define T2   V2
   395	#define TT0  V3
   396	#define TT1  V4
   397	
   398	#define ZER   V6
   399	#define SEL1  V7
   400	#define SEL2  V8
   401	#define CAR1  V9
   402	#define CAR2  V10
   403	#define RED1  V11
   404	#define RED2  V12
   405	#define PL    V13
   406	#define PH    V14
   407	
   408	TEXT ·p256FromMont(SB), NOSPLIT, $0
   409		MOVD res+0(FP), res_ptr
   410		MOVD in+24(FP), x_ptr
   411	
   412		VZERO T2
   413		VZERO ZER
   414		MOVD  $p256<>+0x00(SB), CPOOL
   415		VL    16(CPOOL), PL   // low 16 bytes of P256
   416		VL    0(CPOOL), PH    // high 16 bytes of P256
   417		VL    48(CPOOL), SEL2 // VPERM mask annotated "0 d1 d0 0" in the data section
   418		VL    64(CPOOL), SEL1 // VPERM mask annotated "d1 d0 d1 d0" in the data section
   419	
   420		VL (1*16)(x_ptr), T0
   421		VL (0*16)(x_ptr), T1
   422	
      		// Each round below: build RED2/RED1 from T with VPERM and VSQ, shift
      		// T1||T0 right by 8 bytes (VSLDB), then add RED2||RED1 into T1||T0 and
      		// add the carry out of that addition into T2.
   423		// First round
   424		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   425		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   426		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   427	
   428		VSLDB $8, T1, T0, T0
   429		VSLDB $8, T2, T1, T1
   430	
   431		VACCQ  T0, RED1, CAR1
   432		VAQ    T0, RED1, T0
   433		VACCCQ T1, RED2, CAR1, CAR2
   434		VACQ   T1, RED2, CAR1, T1
   435		VAQ    T2, CAR2, T2
   436	
   437		// Second round
   438		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   439		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   440		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   441	
   442		VSLDB $8, T1, T0, T0
   443		VSLDB $8, T2, T1, T1
   444	
   445		VACCQ  T0, RED1, CAR1
   446		VAQ    T0, RED1, T0
   447		VACCCQ T1, RED2, CAR1, CAR2
   448		VACQ   T1, RED2, CAR1, T1
   449		VAQ    T2, CAR2, T2
   450	
   451		// Third round
   452		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   453		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   454		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   455	
   456		VSLDB $8, T1, T0, T0
   457		VSLDB $8, T2, T1, T1
   458	
   459		VACCQ  T0, RED1, CAR1
   460		VAQ    T0, RED1, T0
   461		VACCCQ T1, RED2, CAR1, CAR2
   462		VACQ   T1, RED2, CAR1, T1
   463		VAQ    T2, CAR2, T2
   464	
   465		// Last round
   466		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   467		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   468		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   469	
   470		VSLDB $8, T1, T0, T0
   471		VSLDB $8, T2, T1, T1
   472	
   473		VACCQ  T0, RED1, CAR1
   474		VAQ    T0, RED1, T0
   475		VACCCQ T1, RED2, CAR1, CAR2
   476		VACQ   T1, RED2, CAR1, T1
   477		VAQ    T2, CAR2, T2
   478	
   479		// ---------------------------------------------------
   480	
      		// Trial subtraction: TT1||TT0 = T1||T0 - P256, with the borrow folded
      		// into T2, which then serves as the select mask below.
   481		VSCBIQ  PL, T0, CAR1
   482		VSQ     PL, T0, TT0
   483		VSBCBIQ T1, PH, CAR1, CAR2
   484		VSBIQ   T1, PH, CAR1, TT1
   485		VSBIQ   T2, ZER, CAR2, T2
   486	
   487		// what output to use, TT1||TT0 or T1||T0?
   488		VSEL T0, TT0, T2, T0
   489		VSEL T1, TT1, T2, T1
   490	
   491		VST T0, (1*16)(res_ptr)
   492		VST T1, (0*16)(res_ptr)
   493		RET
   494	
   495	#undef res_ptr
   496	#undef x_ptr
   497	#undef CPOOL
   498	#undef T0
   499	#undef T1
   500	#undef T2
   501	#undef TT0
   502	#undef TT1
   503	#undef ZER
   504	#undef SEL1
   505	#undef SEL2
   506	#undef CAR1
   507	#undef CAR2
   508	#undef RED1
   509	#undef RED2
   510	#undef PL
   511	#undef PH
   512	
   513	// ---------------------------------------
   514	// func p256OrdMul(res, in1, in2 []byte)
      	//
      	// Reads 32 bytes from in1 (X1 = bytes 0-15, X0 = bytes 16-31) and 32 bytes
      	// from in2 (Y1, Y0 likewise), and writes 32 bytes to res. Only the data
      	// pointers of the slices are used; their lengths are not checked here.
      	// The body is eight unrolled rounds, one per 32-bit word of in2 (word
      	// elements 3..0 of Y0, then 3..0 of Y1). Each round multiplies X by that
      	// word and accumulates into T, derives MK0 from the low product word and
      	// K0, adds M*MK0 (M1||M0 is the value in p256ord<>), and shifts the running
      	// value right by one 32-bit word. A final conditional subtraction of
      	// M1||M0 selects the stored result.
      	// NOTE(review): this has the shape of a word-serial Montgomery
      	// multiplication, i.e. presumably res = in1*in2*2^-256 mod ord - confirm
      	// against the Go caller.
   515	#define res_ptr R1
   516	#define x_ptr R2
   517	#define y_ptr R3
   518	#define X0    V0
   519	#define X1    V1
   520	#define Y0    V2
   521	#define Y1    V3
   522	#define M0    V4
   523	#define M1    V5
   524	#define T0    V6
   525	#define T1    V7
   526	#define T2    V8
   527	#define YDIG  V9
   528	
   529	#define ADD1  V16
   530	#define ADD1H V17
   531	#define ADD2  V18
   532	#define ADD2H V19
   533	#define RED1  V20
   534	#define RED1H V21
   535	#define RED2  V22
   536	#define RED2H V23
   537	#define CAR1  V24
   538	#define CAR1M V25
   539	
   540	#define MK0   V30
   541	#define K0    V31
   542	TEXT ·p256OrdMul(SB), NOSPLIT, $0
   543		MOVD res+0(FP), res_ptr
   544		MOVD in1+24(FP), x_ptr
   545		MOVD in2+48(FP), y_ptr
   546	
   547		VZERO T2
   548		MOVD  $p256ordK0<>+0x00(SB), R4
   549	
      		// The next three directives are the hand-encoded instruction named in
      		// the comment: only word element 3 of K0 is loaded.
   550		// VLEF    $3, 0(R4), K0
   551		WORD $0xE7F40000
   552		BYTE $0x38
   553		BYTE $0x03
   554		MOVD $p256ord<>+0x00(SB), R4
   555		VL   16(R4), M0 // low 16 bytes of p256ord
   556		VL   0(R4), M1  // high 16 bytes of p256ord
   557	
   558		VL (1*16)(x_ptr), X0
   559		VL (0*16)(x_ptr), X1
   560		VL (1*16)(y_ptr), Y0
   561		VL (0*16)(y_ptr), Y1
   562	
   563		// ---------------------------------------------------------------------------/
      		// Round 1 of 8. T is still zero here, so plain multiplies (VMLF/VMLHF)
      		// are used where later rounds use multiply-and-add with T0/T1.
   564		VREPF $3, Y0, YDIG // YDIG = word element 3 of Y0 in every lane
   565		VMLF  X0, YDIG, ADD1
   566		VMLF  ADD1, K0, MK0
   567		VREPF $3, MK0, MK0 // broadcast element 3 of ADD1*K0, the only lane of K0 that was loaded
   568	
   569		VMLF  X1, YDIG, ADD2
   570		VMLHF X0, YDIG, ADD1H
   571		VMLHF X1, YDIG, ADD2H
   572	
   573		VMALF  M0, MK0, ADD1, RED1
   574		VMALHF M0, MK0, ADD1, RED1H
   575		VMALF  M1, MK0, ADD2, RED2
   576		VMALHF M1, MK0, ADD2, RED2H
   577	
   578		VSLDB $12, RED2, RED1, RED1
   579		VSLDB $12, T2, RED2, RED2
   580	
   581		VACCQ RED1, ADD1H, CAR1
   582		VAQ   RED1, ADD1H, T0
   583		VACCQ RED1H, T0, CAR1M
   584		VAQ   RED1H, T0, T0
   585	
   586		// << ready for next MK0
   587	
   588		VACQ   RED2, ADD2H, CAR1, T1
   589		VACCCQ RED2, ADD2H, CAR1, CAR1
   590		VACCCQ RED2H, T1, CAR1M, T2
   591		VACQ   RED2H, T1, CAR1M, T1
   592		VAQ    CAR1, T2, T2
   593	
   594		// ---------------------------------------------------
   595	/* *
   596	 * ---+--------+--------+
   597	 *  T2|   T1   |   T0   |
   598	 * ---+--------+--------+
   599	 *           *(add)*
   600	 *    +--------+--------+
   601	 *    |   X1   |   X0   |
   602	 *    +--------+--------+
   603	 *           *(mul)*
   604	 *    +--------+--------+
   605	 *    |  YDIG  |  YDIG  |
   606	 *    +--------+--------+
   607	 *           *(add)*
   608	 *    +--------+--------+
   609	 *    |   M1   |   M0   |
   610	 *    +--------+--------+
   611	 *           *(mul)*
   612	 *    +--------+--------+
   613	 *    |   MK0  |   MK0  |
   614	 *    +--------+--------+
   615	 *
   616	 *   ---------------------
   617	 *
   618	 *    +--------+--------+
   619	 *    |  ADD2  |  ADD1  |
   620	 *    +--------+--------+
   621	 *  +--------+--------+
   622	 *  | ADD2H  | ADD1H  |
   623	 *  +--------+--------+
   624	 *    +--------+--------+
   625	 *    |  RED2  |  RED1  |
   626	 *    +--------+--------+
   627	 *  +--------+--------+
   628	 *  | RED2H  | RED1H  |
   629	 *  +--------+--------+
   630	 */
   631		VREPF $2, Y0, YDIG // round 2 of 8: word element 2 of Y0
   632		VMALF X0, YDIG, T0, ADD1
   633		VMLF  ADD1, K0, MK0
   634		VREPF $3, MK0, MK0
   635	
   636		VMALF  X1, YDIG, T1, ADD2
   637		VMALHF X0, YDIG, T0, ADD1H
   638		VMALHF X1, YDIG, T1, ADD2H
   639	
   640		VMALF  M0, MK0, ADD1, RED1
   641		VMALHF M0, MK0, ADD1, RED1H
   642		VMALF  M1, MK0, ADD2, RED2
   643		VMALHF M1, MK0, ADD2, RED2H
   644	
   645		VSLDB $12, RED2, RED1, RED1
   646		VSLDB $12, T2, RED2, RED2
   647	
   648		VACCQ RED1, ADD1H, CAR1
   649		VAQ   RED1, ADD1H, T0
   650		VACCQ RED1H, T0, CAR1M
   651		VAQ   RED1H, T0, T0
   652	
   653		// << ready for next MK0
   654	
   655		VACQ   RED2, ADD2H, CAR1, T1
   656		VACCCQ RED2, ADD2H, CAR1, CAR1
   657		VACCCQ RED2H, T1, CAR1M, T2
   658		VACQ   RED2H, T1, CAR1M, T1
   659		VAQ    CAR1, T2, T2
   660	
   661		// ---------------------------------------------------
   662		VREPF $1, Y0, YDIG // round 3 of 8: word element 1 of Y0
   663		VMALF X0, YDIG, T0, ADD1
   664		VMLF  ADD1, K0, MK0
   665		VREPF $3, MK0, MK0
   666	
   667		VMALF  X1, YDIG, T1, ADD2
   668		VMALHF X0, YDIG, T0, ADD1H
   669		VMALHF X1, YDIG, T1, ADD2H
   670	
   671		VMALF  M0, MK0, ADD1, RED1
   672		VMALHF M0, MK0, ADD1, RED1H
   673		VMALF  M1, MK0, ADD2, RED2
   674		VMALHF M1, MK0, ADD2, RED2H
   675	
   676		VSLDB $12, RED2, RED1, RED1
   677		VSLDB $12, T2, RED2, RED2
   678	
   679		VACCQ RED1, ADD1H, CAR1
   680		VAQ   RED1, ADD1H, T0
   681		VACCQ RED1H, T0, CAR1M
   682		VAQ   RED1H, T0, T0
   683	
   684		// << ready for next MK0
   685	
   686		VACQ   RED2, ADD2H, CAR1, T1
   687		VACCCQ RED2, ADD2H, CAR1, CAR1
   688		VACCCQ RED2H, T1, CAR1M, T2
   689		VACQ   RED2H, T1, CAR1M, T1
   690		VAQ    CAR1, T2, T2
   691	
   692		// ---------------------------------------------------
   693		VREPF $0, Y0, YDIG // round 4 of 8: word element 0 of Y0
   694		VMALF X0, YDIG, T0, ADD1
   695		VMLF  ADD1, K0, MK0
   696		VREPF $3, MK0, MK0
   697	
   698		VMALF  X1, YDIG, T1, ADD2
   699		VMALHF X0, YDIG, T0, ADD1H
   700		VMALHF X1, YDIG, T1, ADD2H
   701	
   702		VMALF  M0, MK0, ADD1, RED1
   703		VMALHF M0, MK0, ADD1, RED1H
   704		VMALF  M1, MK0, ADD2, RED2
   705		VMALHF M1, MK0, ADD2, RED2H
   706	
   707		VSLDB $12, RED2, RED1, RED1
   708		VSLDB $12, T2, RED2, RED2
   709	
   710		VACCQ RED1, ADD1H, CAR1
   711		VAQ   RED1, ADD1H, T0
   712		VACCQ RED1H, T0, CAR1M
   713		VAQ   RED1H, T0, T0
   714	
   715		// << ready for next MK0
   716	
   717		VACQ   RED2, ADD2H, CAR1, T1
   718		VACCCQ RED2, ADD2H, CAR1, CAR1
   719		VACCCQ RED2H, T1, CAR1M, T2
   720		VACQ   RED2H, T1, CAR1M, T1
   721		VAQ    CAR1, T2, T2
   722	
   723		// ---------------------------------------------------
   724		VREPF $3, Y1, YDIG // round 5 of 8: word element 3 of Y1
   725		VMALF X0, YDIG, T0, ADD1
   726		VMLF  ADD1, K0, MK0
   727		VREPF $3, MK0, MK0
   728	
   729		VMALF  X1, YDIG, T1, ADD2
   730		VMALHF X0, YDIG, T0, ADD1H
   731		VMALHF X1, YDIG, T1, ADD2H
   732	
   733		VMALF  M0, MK0, ADD1, RED1
   734		VMALHF M0, MK0, ADD1, RED1H
   735		VMALF  M1, MK0, ADD2, RED2
   736		VMALHF M1, MK0, ADD2, RED2H
   737	
   738		VSLDB $12, RED2, RED1, RED1
   739		VSLDB $12, T2, RED2, RED2
   740	
   741		VACCQ RED1, ADD1H, CAR1
   742		VAQ   RED1, ADD1H, T0
   743		VACCQ RED1H, T0, CAR1M
   744		VAQ   RED1H, T0, T0
   745	
   746		// << ready for next MK0
   747	
   748		VACQ   RED2, ADD2H, CAR1, T1
   749		VACCCQ RED2, ADD2H, CAR1, CAR1
   750		VACCCQ RED2H, T1, CAR1M, T2
   751		VACQ   RED2H, T1, CAR1M, T1
   752		VAQ    CAR1, T2, T2
   753	
   754		// ---------------------------------------------------
   755		VREPF $2, Y1, YDIG // round 6 of 8: word element 2 of Y1
   756		VMALF X0, YDIG, T0, ADD1
   757		VMLF  ADD1, K0, MK0
   758		VREPF $3, MK0, MK0
   759	
   760		VMALF  X1, YDIG, T1, ADD2
   761		VMALHF X0, YDIG, T0, ADD1H
   762		VMALHF X1, YDIG, T1, ADD2H
   763	
   764		VMALF  M0, MK0, ADD1, RED1
   765		VMALHF M0, MK0, ADD1, RED1H
   766		VMALF  M1, MK0, ADD2, RED2
   767		VMALHF M1, MK0, ADD2, RED2H
   768	
   769		VSLDB $12, RED2, RED1, RED1
   770		VSLDB $12, T2, RED2, RED2
   771	
   772		VACCQ RED1, ADD1H, CAR1
   773		VAQ   RED1, ADD1H, T0
   774		VACCQ RED1H, T0, CAR1M
   775		VAQ   RED1H, T0, T0
   776	
   777		// << ready for next MK0
   778	
   779		VACQ   RED2, ADD2H, CAR1, T1
   780		VACCCQ RED2, ADD2H, CAR1, CAR1
   781		VACCCQ RED2H, T1, CAR1M, T2
   782		VACQ   RED2H, T1, CAR1M, T1
   783		VAQ    CAR1, T2, T2
   784	
   785		// ---------------------------------------------------
   786		VREPF $1, Y1, YDIG // round 7 of 8: word element 1 of Y1
   787		VMALF X0, YDIG, T0, ADD1
   788		VMLF  ADD1, K0, MK0
   789		VREPF $3, MK0, MK0
   790	
   791		VMALF  X1, YDIG, T1, ADD2
   792		VMALHF X0, YDIG, T0, ADD1H
   793		VMALHF X1, YDIG, T1, ADD2H
   794	
   795		VMALF  M0, MK0, ADD1, RED1
   796		VMALHF M0, MK0, ADD1, RED1H
   797		VMALF  M1, MK0, ADD2, RED2
   798		VMALHF M1, MK0, ADD2, RED2H
   799	
   800		VSLDB $12, RED2, RED1, RED1
   801		VSLDB $12, T2, RED2, RED2
   802	
   803		VACCQ RED1, ADD1H, CAR1
   804		VAQ   RED1, ADD1H, T0
   805		VACCQ RED1H, T0, CAR1M
   806		VAQ   RED1H, T0, T0
   807	
   808		// << ready for next MK0
   809	
   810		VACQ   RED2, ADD2H, CAR1, T1
   811		VACCCQ RED2, ADD2H, CAR1, CAR1
   812		VACCCQ RED2H, T1, CAR1M, T2
   813		VACQ   RED2H, T1, CAR1M, T1
   814		VAQ    CAR1, T2, T2
   815	
   816		// ---------------------------------------------------
   817		VREPF $0, Y1, YDIG // round 8 of 8: word element 0 of Y1
   818		VMALF X0, YDIG, T0, ADD1
   819		VMLF  ADD1, K0, MK0
   820		VREPF $3, MK0, MK0
   821	
   822		VMALF  X1, YDIG, T1, ADD2
   823		VMALHF X0, YDIG, T0, ADD1H
   824		VMALHF X1, YDIG, T1, ADD2H
   825	
   826		VMALF  M0, MK0, ADD1, RED1
   827		VMALHF M0, MK0, ADD1, RED1H
   828		VMALF  M1, MK0, ADD2, RED2
   829		VMALHF M1, MK0, ADD2, RED2H
   830	
   831		VSLDB $12, RED2, RED1, RED1
   832		VSLDB $12, T2, RED2, RED2
   833	
   834		VACCQ RED1, ADD1H, CAR1
   835		VAQ   RED1, ADD1H, T0
   836		VACCQ RED1H, T0, CAR1M
   837		VAQ   RED1H, T0, T0
   838	
   839		// << ready for next MK0
   840	
   841		VACQ   RED2, ADD2H, CAR1, T1
   842		VACCCQ RED2, ADD2H, CAR1, CAR1
   843		VACCCQ RED2H, T1, CAR1M, T2
   844		VACQ   RED2H, T1, CAR1M, T1
   845		VAQ    CAR1, T2, T2
   846	
   847		// ---------------------------------------------------
   848	
      		// Trial subtraction: ADD2||ADD1 = T1||T0 - M1||M0, with the borrow
      		// folded into T2, which then serves as the select mask below. RED1 is
      		// reused here as a zero vector.
   849		VZERO   RED1
   850		VSCBIQ  M0, T0, CAR1
   851		VSQ     M0, T0, ADD1
   852		VSBCBIQ T1, M1, CAR1, CAR1M
   853		VSBIQ   T1, M1, CAR1, ADD2
   854		VSBIQ   T2, RED1, CAR1M, T2
   855	
   856		// what output to use, ADD2||ADD1 or T1||T0?
   857		VSEL T0, ADD1, T2, T0
   858		VSEL T1, ADD2, T2, T1
   859	
   860		VST T0, (1*16)(res_ptr)
   861		VST T1, (0*16)(res_ptr)
   862		RET
   863	
   864	#undef res_ptr
   865	#undef x_ptr
   866	#undef y_ptr
   867	#undef X0
   868	#undef X1
   869	#undef Y0
   870	#undef Y1
   871	#undef M0
   872	#undef M1
   873	#undef T0
   874	#undef T1
   875	#undef T2
   876	#undef YDIG
   877	
   878	#undef ADD1
   879	#undef ADD1H
   880	#undef ADD2
   881	#undef ADD2H
   882	#undef RED1
   883	#undef RED1H
   884	#undef RED2
   885	#undef RED2H
   886	#undef CAR1
   887	#undef CAR1M
   888	
   889	#undef MK0
   890	#undef K0
   891	
   892	// ---------------------------------------
   893	// p256MulInternal
   894	// V0-V3,V30,V31 - Not Modified
   895	// V4-V15 - Volatile
   896	
   897	#define CPOOL   R4
   898	
   899	// Parameters
   900	#define X0    V0 // Not modified
   901	#define X1    V1 // Not modified
   902	#define Y0    V2 // Not modified
   903	#define Y1    V3 // Not modified
   904	#define T0    V4
   905	#define T1    V5
   906	#define P0    V30 // Not modified
   907	#define P1    V31 // Not modified
   908	
   909	// Temporaries
   910	#define YDIG  V6 // Overloaded with CAR2, ZER
   911	#define ADD1H V7 // Overloaded with ADD3H
   912	#define ADD2H V8 // Overloaded with ADD4H
   913	#define ADD3  V9 // Overloaded with SEL2,SEL5
   914	#define ADD4  V10 // Overloaded with SEL3,SEL6
   915	#define RED1  V11 // Overloaded with CAR2
   916	#define RED2  V12
   917	#define RED3  V13 // Overloaded with SEL1
   918	#define T2    V14
   919	// Overloaded temporaries
   920	#define ADD1  V4 // Overloaded with T0
   921	#define ADD2  V5 // Overloaded with T1
   922	#define ADD3H V7 // Overloaded with ADD1H
   923	#define ADD4H V8 // Overloaded with ADD2H
   924	#define ZER   V6 // Overloaded with YDIG, CAR2
   925	#define CAR1  V6 // Overloaded with YDIG, ZER
   926	#define CAR2  V11 // Overloaded with RED1
   927	// Constant Selects
   928	#define SEL1  V13 // Overloaded with RED3
   929	#define SEL2  V9 // Overloaded with ADD3,SEL5
   930	#define SEL3  V10 // Overloaded with ADD4,SEL6
   931	#define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
   932	#define SEL5  V9 // Overloaded with ADD3,SEL2
   933	#define SEL6  V10 // Overloaded with ADD4,SEL3
   934	
   935	/* *
   936	 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   937	 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   938	 * With you, SIMD be...
   939	 *
   940	 *                                           +--------+--------+
   941	 *                                  +--------|  RED2  |  RED1  |
   942	 *                                  |        +--------+--------+
   943	 *                                  |       ---+--------+--------+
   944	 *                                  |  +---- T2|   T1   |   T0   |--+
   945	 *                                  |  |    ---+--------+--------+  |
   946	 *                                  |  |                            |
   947	 *                                  |  |    ======================= |
   948	 *                                  |  |                            |
   949	 *                                  |  |       +--------+--------+<-+
   950	 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   951	 *                                  |  |       +--------+--------+  |     |
   952	 *                                  |  |     +--------+--------+<---+     |
   953	 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   954	 *                                  |  |     +--------+--------+  |       |
   955	 *                                  |  |     +--------+--------+<-+       |
   956	 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   957	 *                                  |  |     +--------+--------+  | |     |
   958	 *                                  |  |   +--------+--------+<---+ |     |
   959	 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   960	 *                                  |  |   +--------+--------+      | |   V
   961	 *                                  |  | ------------------------   | | +--------+
   962	 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   963	 *                                  |  |                            | | +--------+
   964	 *                                  |  +---->+--------+--------+    | |   |
   965	 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   966	 *                                  |        +--------+--------+    | |   |
   967	 *                                  +---->---+--------+--------+    | |   |
   968	 *                                         T2|   T1   |   T0   |----+ |   |
   969	 *                                        ---+--------+--------+    | |   |
   970	 *                                        ---+--------+--------+<---+ |   |
   971	 *                                    +--- T2|   T1   |   T0   |----------+
   972	 *                                    |   ---+--------+--------+      |   |
   973	 *                                    |  +--------+--------+<-------------+
   974	 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   975	 *                                    |  +--------+--------+     |    |   |
   976	 *                                    |  +--------+<----------------------+
   977	 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   978	 *                                    |  +--------+              |    |
   979	 *                                    +--->+--------+--------+   |    |
   980	 *                                         |   T1   |   T0   |--------+
   981	 *                                         +--------+--------+   |    |
   982	 *                                   --------------------------- |    |
   983	 *                                                               |    |
   984	 *                                       +--------+--------+<----+    |
   985	 *                                       |  RED2  |  RED1  |          |
   986	 *                                       +--------+--------+          |
   987	 *                                      ---+--------+--------+<-------+
   988	 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   989	 *                                      ---+--------+--------+
   990	 *
   991	 *                                                                *Mi obra de arte de siglo XXI @vpaprots
   992	 *
   993	 *
   994	 * First group is special, doesn't get the two inputs:
   995	 *                                             +--------+--------+<-+
   996	 *                                     +-------|  ADD2  |  ADD1  |--|-----+
   997	 *                                     |       +--------+--------+  |     |
   998	 *                                     |     +--------+--------+<---+     |
   999	 *                                     |     | ADD2H  | ADD1H  |--+       |
  1000	 *                                     |     +--------+--------+  |       |
  1001	 *                                     |     +--------+--------+<-+       |
  1002	 *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1003	 *                                     |     +--------+--------+  | |     |
  1004	 *                                     |   +--------+--------+<---+ |     |
  1005	 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1006	 *                                     |   +--------+--------+      | |   V
  1007	 *                                     | ------------------------   | | +--------+
  1008	 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1009	 *                                     |                            | | +--------+
  1010	 *                                     +---->+--------+--------+    | |   |
  1011	 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1012	 *                                           +--------+--------+    | |   |
  1013	 *                                        ---+--------+--------+<---+ |   |
  1014	 *                                    +--- T2|   T1   |   T0   |----------+
  1015	 *                                    |   ---+--------+--------+      |   |
  1016	 *                                    |  +--------+--------+<-------------+
  1017	 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1018	 *                                    |  +--------+--------+     |    |   |
  1019	 *                                    |  +--------+<----------------------+
  1020	 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1021	 *                                    |  +--------+              |    |
  1022	 *                                    +--->+--------+--------+   |    |
  1023	 *                                         |   T1   |   T0   |--------+
  1024	 *                                         +--------+--------+   |    |
  1025	 *                                   --------------------------- |    |
  1026	 *                                                               |    |
  1027	 *                                       +--------+--------+<----+    |
  1028	 *                                       |  RED2  |  RED1  |          |
  1029	 *                                       +--------+--------+          |
  1030	 *                                      ---+--------+--------+<-------+
  1031	 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1032	 *                                      ---+--------+--------+
  1033	 *
  1034	 * The last 'group' needs RED2||RED1 shifted less
  1035	 */
  1036	TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 // in: X1||X0, Y1||Y0, P1||P0, CPOOL (set by the caller); out: T1||T0
  1037		VL 32(CPOOL), SEL1
  1038		VL 48(CPOOL), SEL2
  1039		VL 64(CPOOL), SEL3
  1040		VL 80(CPOOL), SEL4
  1041		// SEL1..SEL4 are the VPERM selector vectors stored at constant-pool offsets 32..80.
  1042		// ---------------------------------------------------
  1043		// Group 1: elements 3 and 2 of Y0. First group, so there is no RED1/RED2 from an earlier group to add in.
  1044		VREPF $3, Y0, YDIG
  1045		VMLHF X0, YDIG, ADD1H
  1046		VMLHF X1, YDIG, ADD2H
  1047		VMLF  X0, YDIG, ADD1
  1048		VMLF  X1, YDIG, ADD2
  1049	
  1050		VREPF  $2, Y0, YDIG
  1051		VMALF  X0, YDIG, ADD1H, ADD3
  1052		VMALF  X1, YDIG, ADD2H, ADD4
  1053		VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1054		VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1055	
  1056		VZERO ZER
  1057		VL    32(CPOOL), SEL1
  1058		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1059	
  1060		VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1061		VSLDB $12, ZER, ADD2, T1  // ADD2 Free
  1062	
  1063		VACCQ  T0, ADD3, CAR1
  1064		VAQ    T0, ADD3, T0       // ADD3 Free
  1065		VACCCQ T1, ADD4, CAR1, T2
  1066		VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
  1067	
  1068		VL    48(CPOOL), SEL2
  1069		VL    64(CPOOL), SEL3
  1070		VL    80(CPOOL), SEL4
  1071		VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1072		VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1073		VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1074		VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1075	
  1076		VSLDB $12, T1, T0, T0
  1077		VSLDB $12, T2, T1, T1
  1078	
  1079		VACCQ  T0, ADD3H, CAR1
  1080		VAQ    T0, ADD3H, T0
  1081		VACCCQ T1, ADD4H, CAR1, T2
  1082		VACQ   T1, ADD4H, CAR1, T1
  1083	
  1084		// ---------------------------------------------------
  1085		// Group 2: elements 1 and 0 of Y0. RED2||RED1 produced by group 1 is added into T1||T0 below.
  1086		VREPF  $1, Y0, YDIG
  1087		VMALHF X0, YDIG, T0, ADD1H
  1088		VMALHF X1, YDIG, T1, ADD2H
  1089		VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
  1090		VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
  1091	
  1092		VREPF  $0, Y0, YDIG
  1093		VMALF  X0, YDIG, ADD1H, ADD3
  1094		VMALF  X1, YDIG, ADD2H, ADD4
  1095		VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
  1096		VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
  1097	
  1098		VZERO ZER
  1099		VL    32(CPOOL), SEL1
  1100		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1101	
  1102		VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
  1103		VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1104	
  1105		VACCQ  T0, RED1, CAR1
  1106		VAQ    T0, RED1, T0
  1107		VACCCQ T1, RED2, CAR1, T2
  1108		VACQ   T1, RED2, CAR1, T1
  1109	
  1110		VACCQ  T0, ADD3, CAR1
  1111		VAQ    T0, ADD3, T0
  1112		VACCCQ T1, ADD4, CAR1, CAR2
  1113		VACQ   T1, ADD4, CAR1, T1
  1114		VAQ    T2, CAR2, T2
  1115	
  1116		VL    48(CPOOL), SEL2
  1117		VL    64(CPOOL), SEL3
  1118		VL    80(CPOOL), SEL4
  1119		VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1120		VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1121		VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1122		VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1123	
  1124		VSLDB $12, T1, T0, T0
  1125		VSLDB $12, T2, T1, T1
  1126	
  1127		VACCQ  T0, ADD3H, CAR1
  1128		VAQ    T0, ADD3H, T0
  1129		VACCCQ T1, ADD4H, CAR1, T2
  1130		VACQ   T1, ADD4H, CAR1, T1
  1131	
  1132		// ---------------------------------------------------
  1133		// Group 3: elements 3 and 2 of Y1. Same instruction sequence as group 2.
  1134		VREPF  $3, Y1, YDIG
  1135		VMALHF X0, YDIG, T0, ADD1H
  1136		VMALHF X1, YDIG, T1, ADD2H
  1137		VMALF  X0, YDIG, T0, ADD1
  1138		VMALF  X1, YDIG, T1, ADD2
  1139	
  1140		VREPF  $2, Y1, YDIG
  1141		VMALF  X0, YDIG, ADD1H, ADD3
  1142		VMALF  X1, YDIG, ADD2H, ADD4
  1143		VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1144		VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1145	
  1146		VZERO ZER
  1147		VL    32(CPOOL), SEL1
  1148		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1149	
  1150		VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1151		VSLDB $12, T2, ADD2, T1   // ADD2 Free
  1152	
  1153		VACCQ  T0, RED1, CAR1
  1154		VAQ    T0, RED1, T0
  1155		VACCCQ T1, RED2, CAR1, T2
  1156		VACQ   T1, RED2, CAR1, T1
  1157	
  1158		VACCQ  T0, ADD3, CAR1
  1159		VAQ    T0, ADD3, T0
  1160		VACCCQ T1, ADD4, CAR1, CAR2
  1161		VACQ   T1, ADD4, CAR1, T1
  1162		VAQ    T2, CAR2, T2
  1163	
  1164		VL    48(CPOOL), SEL2
  1165		VL    64(CPOOL), SEL3
  1166		VL    80(CPOOL), SEL4
  1167		VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1168		VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1169		VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1170		VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1171	
  1172		VSLDB $12, T1, T0, T0
  1173		VSLDB $12, T2, T1, T1
  1174	
  1175		VACCQ  T0, ADD3H, CAR1
  1176		VAQ    T0, ADD3H, T0
  1177		VACCCQ T1, ADD4H, CAR1, T2
  1178		VACQ   T1, ADD4H, CAR1, T1
  1179	
  1180		// ---------------------------------------------------
  1181		// Group 4: elements 1 and 0 of Y1. Last group: RED2||RED1 is built with SEL5/SEL6 and added before returning.
  1182		VREPF  $1, Y1, YDIG
  1183		VMALHF X0, YDIG, T0, ADD1H
  1184		VMALHF X1, YDIG, T1, ADD2H
  1185		VMALF  X0, YDIG, T0, ADD1
  1186		VMALF  X1, YDIG, T1, ADD2
  1187	
  1188		VREPF  $0, Y1, YDIG
  1189		VMALF  X0, YDIG, ADD1H, ADD3
  1190		VMALF  X1, YDIG, ADD2H, ADD4
  1191		VMALHF X0, YDIG, ADD1H, ADD3H
  1192		VMALHF X1, YDIG, ADD2H, ADD4H
  1193	
  1194		VZERO ZER
  1195		VL    32(CPOOL), SEL1
  1196		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1197	
  1198		VSLDB $12, ADD2, ADD1, T0
  1199		VSLDB $12, T2, ADD2, T1
  1200	
  1201		VACCQ  T0, RED1, CAR1
  1202		VAQ    T0, RED1, T0
  1203		VACCCQ T1, RED2, CAR1, T2
  1204		VACQ   T1, RED2, CAR1, T1
  1205	
  1206		VACCQ  T0, ADD3, CAR1
  1207		VAQ    T0, ADD3, T0
  1208		VACCCQ T1, ADD4, CAR1, CAR2
  1209		VACQ   T1, ADD4, CAR1, T1
  1210		VAQ    T2, CAR2, T2
  1211	
  1212		VL    96(CPOOL), SEL5
  1213		VL    112(CPOOL), SEL6
  1214		VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
  1215		VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
  1216		VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
  1217	
  1218		VSLDB $12, T1, T0, T0
  1219		VSLDB $12, T2, T1, T1
  1220	
  1221		VACCQ  T0, ADD3H, CAR1
  1222		VAQ    T0, ADD3H, T0
  1223		VACCCQ T1, ADD4H, CAR1, T2
  1224		VACQ   T1, ADD4H, CAR1, T1
  1225	
  1226		VACCQ  T0, RED1, CAR1
  1227		VAQ    T0, RED1, T0
  1228		VACCCQ T1, RED2, CAR1, CAR2
  1229		VACQ   T1, RED2, CAR1, T1
  1230		VAQ    T2, CAR2, T2
  1231	
  1232		// ---------------------------------------------------
  1233		// Final step: ADD2H||ADD1H = T1||T0 - P1||P0, with the borrow chain continued through T2.
  1234		VZERO   RED3
  1235		VSCBIQ  P0, T0, CAR1
  1236		VSQ     P0, T0, ADD1H
  1237		VSBCBIQ T1, P1, CAR1, CAR2
  1238		VSBIQ   T1, P1, CAR1, ADD2H
  1239		VSBIQ   T2, RED3, CAR2, T2
  1240	
  1241		// what output to use, ADD2H||ADD1H or T1||T0?
  1242		VSEL T0, ADD1H, T2, T0
  1243		VSEL T1, ADD2H, T2, T1
  1244		RET
  1245	
  1246	#undef CPOOL
  1247	
  1248	#undef X0
  1249	#undef X1
  1250	#undef Y0
  1251	#undef Y1
  1252	#undef T0
  1253	#undef T1
  1254	#undef P0
  1255	#undef P1
  1256	
  1257	#undef SEL1
  1258	#undef SEL2
  1259	#undef SEL3
  1260	#undef SEL4
  1261	#undef SEL5
  1262	#undef SEL6
  1263	
  1264	#undef YDIG
  1265	#undef ADD1H
  1266	#undef ADD2H
  1267	#undef ADD3
  1268	#undef ADD4
  1269	#undef RED1
  1270	#undef RED2
  1271	#undef RED3
  1272	#undef T2
  1273	#undef ADD1
  1274	#undef ADD2
  1275	#undef ADD3H
  1276	#undef ADD4H
  1277	#undef ZER
  1278	#undef CAR1
  1279	#undef CAR2
  1280	// p256SubInternal: T1||T0 = X1||X0 - Y1||Y0; SEL1 (built from the final borrow) then picks that difference or difference + PH||PL. Also writes ZER, CAR1, SEL1, TT0, TT1.
  1281	#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1282		VZERO   ZER                \
  1283		VSCBIQ  Y0, X0, CAR1       \
  1284		VSQ     Y0, X0, T0         \
  1285		VSBCBIQ X1, Y1, CAR1, SEL1 \
  1286		VSBIQ   X1, Y1, CAR1, T1   \
  1287		VSQ     SEL1, ZER, SEL1    \
  1288		                           \
  1289		VACCQ   T0, PL, CAR1       \
  1290		VAQ     T0, PL, TT0        \
  1291		VACQ    T1, PH, CAR1, TT1  \
  1292		                           \
  1293		VSEL    T0, TT0, SEL1, T0  \
  1294		VSEL    T1, TT1, SEL1, T1
  1295	// p256AddInternal: T1||T0 = X1||X0 + Y1||Y0 (carry out in T2); SEL1 then picks that sum or sum - PH||PL (TT1||TT0). Also writes T2, ZER, CAR1, CAR2, SEL1, TT0, TT1.
  1296	#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
  1297		VACCQ   X0, Y0, CAR1        \
  1298		VAQ     X0, Y0, T0          \
  1299		VACCCQ  X1, Y1, CAR1, T2    \
  1300		VACQ    X1, Y1, CAR1, T1    \
  1301		                            \
  1302		VZERO   ZER                 \
  1303		VSCBIQ  PL, T0, CAR1        \
  1304		VSQ     PL, T0, TT0         \
  1305		VSBCBIQ T1, PH, CAR1, CAR2  \
  1306		VSBIQ   T1, PH, CAR1, TT1   \
  1307		VSBIQ   T2, ZER, CAR2, SEL1 \
  1308		                            \
  1309		VSEL    T0, TT0, SEL1, T0   \
  1310		VSEL    T1, TT1, SEL1, T1
  1311	// p256HalfInternal: used for "Y3 = half*Y3". SEL1 (derived from X0 via VSBIQ) picks X1||X0 or X1||X0 + PH||PL (carry in T2); T2||T1||T0 is then shifted right one bit into T1||T0. Also writes T2, ZER, CAR1, SEL1, TT0, TT1.
  1312	#define p256HalfInternal(T1, T0, X1, X0) \
  1313		VZERO  ZER                \
  1314		VSBIQ  ZER, ZER, X0, SEL1 \
  1315		                          \
  1316		VACCQ  X0, PL, CAR1       \
  1317		VAQ    X0, PL, T0         \
  1318		VACCCQ X1, PH, CAR1, T2   \
  1319		VACQ   X1, PH, CAR1, T1   \
  1320		                          \
  1321		VSEL   X0, T0, SEL1, T0   \
  1322		VSEL   X1, T1, SEL1, T1   \
  1323		VSEL   ZER, T2, SEL1, T2  \
  1324		                          \
  1325		VSLDB  $15, T2, ZER, TT1  \
  1326		VSLDB  $15, T1, ZER, TT0  \
  1327		VREPIB $1, SEL1           \
  1328		VSRL   SEL1, T0, T0       \
  1329		VSRL   SEL1, T1, T1       \
  1330		VREPIB $7, SEL1           \
  1331		VSL    SEL1, TT0, TT0     \
  1332		VSL    SEL1, TT1, TT1     \
  1333		VO     T0, TT0, T0        \
  1334		VO     T1, TT1, T1
  1335	
  1336	// ---------------------------------------
  1337	// func p256MulAsm(res, in1, in2 []byte)
  1338	#define CPOOL   R4
  1339	#define res_ptr R1
  1340	#define x_ptr   R2
  1341	#define y_ptr   R3
  1342	
  1343	// Registers handed to / returned by p256MulInternal: X1||X0 and Y1||Y0 in, T1||T0 out
  1344	#define T0    V4
  1345	#define T1    V5
  1346	#define X0    V0
  1347	#define X1    V1
  1348	#define Y0    V2
  1349	#define Y1    V3
  1350	
  1351	// P1||P0: the P256 words at the start of p256mul<>
  1352	#define P1    V31
  1353	#define P0    V30
  1354	TEXT ·p256MulAsm(SB), NOSPLIT, $0
  1355		MOVD $p256mul<>+0x00(SB), CPOOL
  1356		VL   0(CPOOL), P1
  1357		VL   16(CPOOL), P0
  1358	
  1359		MOVD in1+24(FP), x_ptr
  1360		VL   0(x_ptr), X1
  1361		VL   16(x_ptr), X0
  1362		MOVD in2+48(FP), y_ptr
  1363		VL   0(y_ptr), Y1
  1364		VL   16(y_ptr), Y0
  1365	
  1366		MOVD res+0(FP), res_ptr
  1367		CALL p256MulInternal<>(SB)
  1368	
  1369		// T1||T0 now holds the product; write it back in the same half order as the inputs.
  1370		VST  T1, 0(res_ptr)
  1371		VST  T0, 16(res_ptr)
  1372		RET
  1373	
  1374	#undef res_ptr
  1375	#undef x_ptr
  1376	#undef y_ptr
  1377	#undef CPOOL
  1378	
  1379	#undef X0
  1380	#undef X1
  1381	#undef Y0
  1382	#undef Y1
  1383	#undef T0
  1384	#undef T1
  1385	#undef P0
  1386	#undef P1
  1387	
  1388	// Point add with P2 being affine point
  1389	// If sign == 1 -> P2 = -P2
  1390	// If sel == 0 -> P3 = P1
  1391	// If zero == 0 -> P3 = P2
  1392	// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
  1393	#define P3ptr   R1
  1394	#define P1ptr   R2
  1395	#define P2ptr   R3
  1396	#define CPOOL   R4
  1397	
  1398	// Temporaries in REGs
  1399	#define Y2L    V15
  1400	#define Y2H    V16
  1401	#define T1L    V17
  1402	#define T1H    V18
  1403	#define T2L    V19
  1404	#define T2H    V20
  1405	#define T3L    V21
  1406	#define T3H    V22
  1407	#define T4L    V23
  1408	#define T4H    V24
  1409	
  1410	// Temps for Sub and Add
  1411	#define TT0  V11
  1412	#define TT1  V12
  1413	#define T2   V13
  1414	
  1415	// p256MulAsm Parameters
  1416	#define X0    V0
  1417	#define X1    V1
  1418	#define Y0    V2
  1419	#define Y1    V3
  1420	#define T0    V4
  1421	#define T1    V5
  1422	
  1423	#define PL    V30
  1424	#define PH    V31
  1425	
  1426	// Names for zero/sel selects
  1427	#define X1L    V0
  1428	#define X1H    V1
  1429	#define Y1L    V2 // p256MulAsmParmY
  1430	#define Y1H    V3 // p256MulAsmParmY
  1431	#define Z1L    V4
  1432	#define Z1H    V5
  1433	#define X2L    V0
  1434	#define X2H    V1
  1435	#define Z2L    V4
  1436	#define Z2H    V5
  1437	#define X3L    V17 // T1L
  1438	#define X3H    V18 // T1H
  1439	#define Y3L    V21 // T3L
  1440	#define Y3H    V22 // T3H
  1441	#define Z3L    V28
  1442	#define Z3H    V29
  1443	
  1444	#define ZER   V6
  1445	#define SEL1  V7
  1446	#define CAR1  V8
  1447	#define CAR2  V9
  1448	/* *
  1449	 * Three operand formula:
  1450	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1451	 * T1 = Z1²
  1452	 * T2 = T1*Z1
  1453	 * T1 = T1*X2
  1454	 * T2 = T2*Y2
  1455	 * T1 = T1-X1
  1456	 * T2 = T2-Y1
  1457	 * Z3 = Z1*T1
  1458	 * T3 = T1²
  1459	 * T4 = T3*T1
  1460	 * T3 = T3*X1
  1461	 * T1 = 2*T3
  1462	 * X3 = T2²
  1463	 * X3 = X3-T1
  1464	 * X3 = X3-T4
  1465	 * T3 = T3-X3
  1466	 * T3 = T3*T2
  1467	 * T4 = T4*Y1
  1468	 * Y3 = T3-T4
  1469	
  1470	 * Three operand formulas, but with MulInternal X,Y used to store temps
  1471	X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1472	X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1473	X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1474	X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1475	SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1476	SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1477	X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1478	X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1479	X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1480	X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1481	ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1482	X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1483	SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1484	SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1485	SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1486	X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1487	X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1488	SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1489	
  1490		*/
  1491	TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
  1492		MOVD P3+0(FP), P3ptr
  1493		MOVD P1+8(FP), P1ptr
  1494		MOVD P2+16(FP), P2ptr
  1495		// PH||PL = the P256 words at the start of p256mul<>; CPOOL stays live for p256MulInternal.
  1496		MOVD $p256mul<>+0x00(SB), CPOOL
  1497		VL   16(CPOOL), PL
  1498		VL   0(CPOOL), PH
  1499	
  1500		//	if (sign == 1) {
  1501		//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1502		//	}
  1503	
  1504		VL 32(P2ptr), Y2H
  1505		VL 48(P2ptr), Y2L
  1506		// SEL1 = all ones in each doubleword where sign == 0 (VCEQG against ZER).
  1507		VLREPG sign+24(FP), SEL1
  1508		VZERO  ZER
  1509		VCEQG  SEL1, ZER, SEL1
  1510		// T1H||T1L = PH||PL - Y2H||Y2L, computed unconditionally.
  1511		VSCBIQ Y2L, PL, CAR1
  1512		VSQ    Y2L, PL, T1L
  1513		VSBIQ  PH, Y2H, CAR1, T1H
  1514		// Where SEL1 is set (sign == 0) Y2 is kept; otherwise Y2 becomes P-Y2.
  1515		VSEL Y2L, T1L, SEL1, Y2L
  1516		VSEL Y2H, T1H, SEL1, Y2H
  1517	
  1518	/* *
  1519	 * Three operand formula:
  1520	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1521	 */
  1522		// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1523		VL   64(P1ptr), X1       // Z1H
  1524		VL   80(P1ptr), X0       // Z1L
  1525		VLR  X0, Y0
  1526		VLR  X1, Y1
  1527		CALL p256MulInternal<>(SB)
  1528	
  1529		// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1530		VLR  T0, X0
  1531		VLR  T1, X1
  1532		CALL p256MulInternal<>(SB)
  1533		VLR  T0, T2L
  1534		VLR  T1, T2H
  1535	
  1536		// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1537		VL   0(P2ptr), Y1        // X2H
  1538		VL   16(P2ptr), Y0       // X2L
  1539		CALL p256MulInternal<>(SB)
  1540		VLR  T0, T1L
  1541		VLR  T1, T1H
  1542	
  1543		// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1544		VLR  T2L, X0
  1545		VLR  T2H, X1
  1546		VLR  Y2L, Y0
  1547		VLR  Y2H, Y1
  1548		CALL p256MulInternal<>(SB)
  1549	
  1550		// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1551		VL 32(P1ptr), Y1H
  1552		VL 48(P1ptr), Y1L
  1553		p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1554	
  1555		// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1556		VL 0(P1ptr), X1H
  1557		VL 16(P1ptr), X1L
  1558		p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1559	
  1560		// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1561		VL   64(P1ptr), X1       // Z1H
  1562		VL   80(P1ptr), X0       // Z1L
  1563		CALL p256MulInternal<>(SB)
  1564	
  1565		// VST T1, 64(P3ptr)
  1566		// VST T0, 80(P3ptr)
  1567		VLR T0, Z3L
  1568		VLR T1, Z3H
  1569	
  1570		// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1571		VLR  Y0, X0
  1572		VLR  Y1, X1
  1573		CALL p256MulInternal<>(SB)
  1574		VLR  T0, X0
  1575		VLR  T1, X1
  1576	
  1577		// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1578		CALL p256MulInternal<>(SB)
  1579		VLR  T0, T4L
  1580		VLR  T1, T4H
  1581	
  1582		// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1583		VL   0(P1ptr), Y1        // X1H
  1584		VL   16(P1ptr), Y0       // X1L
  1585		CALL p256MulInternal<>(SB)
  1586		VLR  T0, T3L
  1587		VLR  T1, T3H
  1588	
  1589		// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1590		p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1591	
  1592		// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1593		VLR  T2L, X0
  1594		VLR  T2H, X1
  1595		VLR  T2L, Y0
  1596		VLR  T2H, Y1
  1597		CALL p256MulInternal<>(SB)
  1598	
  1599		// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1600		p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1601	
  1602		// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1603		p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1604		VLR T0, X3L
  1605		VLR T1, X3H
  1606	
  1607		// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1608		p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1609	
  1610		// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1611		CALL p256MulInternal<>(SB)
  1612		VLR  T0, T3L
  1613		VLR  T1, T3H
  1614	
  1615		// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1616		VLR  T4L, X0
  1617		VLR  T4H, X1
  1618		VL   32(P1ptr), Y1       // Y1H
  1619		VL   48(P1ptr), Y0       // Y1L
  1620		CALL p256MulInternal<>(SB)
  1621	
  1622		// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  1623		p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  1624	
  1625		//	if (sel == 0) {
  1626		//		copy(P3.x[:], X1)
  1627		//		copy(P3.y[:], Y1)
  1628		//		copy(P3.z[:], Z1)
  1629		//	}
  1630	
  1631		VL 0(P1ptr), X1H
  1632		VL 16(P1ptr), X1L
  1633	
  1634		// Y1 already loaded, left over from addition
  1635		VL 64(P1ptr), Z1H
  1636		VL 80(P1ptr), Z1L
  1637		// SEL1 = all ones where sel == 0; the VSELs below then take the P1 coordinates.
  1638		VLREPG sel+32(FP), SEL1
  1639		VZERO  ZER
  1640		VCEQG  SEL1, ZER, SEL1
  1641	
  1642		VSEL X1L, X3L, SEL1, X3L
  1643		VSEL X1H, X3H, SEL1, X3H
  1644		VSEL Y1L, Y3L, SEL1, Y3L
  1645		VSEL Y1H, Y3H, SEL1, Y3H
  1646		VSEL Z1L, Z3L, SEL1, Z3L
  1647		VSEL Z1H, Z3H, SEL1, Z3H
  1648	
  1649		//	if (zero == 0) {
  1650		//		copy(P3.x[:], X2)
  1651		//		copy(P3.y[:], Y2)
  1652		//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  1653		//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
  1654		//	}
  1655		VL 0(P2ptr), X2H
  1656		VL 16(P2ptr), X2L
  1657	
  1658		// Y2 already loaded
  1659		VL 128(CPOOL), Z2H // NOTE(review): presumably the constant quoted in the pseudo-code above; verify against p256mul<>+0x80
  1660		VL 144(CPOOL), Z2L
  1661		// SEL1 = all ones where zero == 0; the VSELs below then take the P2 coordinates.
  1662		VLREPG zero+40(FP), SEL1
  1663		VZERO  ZER
  1664		VCEQG  SEL1, ZER, SEL1
  1665	
  1666		VSEL X2L, X3L, SEL1, X3L
  1667		VSEL X2H, X3H, SEL1, X3H
  1668		VSEL Y2L, Y3L, SEL1, Y3L
  1669		VSEL Y2H, Y3H, SEL1, Y3H
  1670		VSEL Z2L, Z3L, SEL1, Z3L
  1671		VSEL Z2H, Z3H, SEL1, Z3H
  1672	
  1673		// All done, store out the result!!!
  1674		VST X3H, 0(P3ptr)
  1675		VST X3L, 16(P3ptr)
  1676		VST Y3H, 32(P3ptr)
  1677		VST Y3L, 48(P3ptr)
  1678		VST Z3H, 64(P3ptr)
  1679		VST Z3L, 80(P3ptr)
  1680	
  1681		RET
  1682	
  1683	#undef P3ptr
  1684	#undef P1ptr
  1685	#undef P2ptr
  1686	#undef CPOOL
  1687	
  1688	#undef Y2L
  1689	#undef Y2H
  1690	#undef T1L
  1691	#undef T1H
  1692	#undef T2L
  1693	#undef T2H
  1694	#undef T3L
  1695	#undef T3H
  1696	#undef T4L
  1697	#undef T4H
  1698	
  1699	#undef TT0
  1700	#undef TT1
  1701	#undef T2
  1702	
  1703	#undef X0
  1704	#undef X1
  1705	#undef Y0
  1706	#undef Y1
  1707	#undef T0
  1708	#undef T1
  1709	
  1710	#undef PL
  1711	#undef PH
  1712	
  1713	#undef X1L
  1714	#undef X1H
  1715	#undef Y1L
  1716	#undef Y1H
  1717	#undef Z1L
  1718	#undef Z1H
  1719	#undef X2L
  1720	#undef X2H
  1721	#undef Z2L
  1722	#undef Z2H
  1723	#undef X3L
  1724	#undef X3H
  1725	#undef Y3L
  1726	#undef Y3H
  1727	#undef Z3L
  1728	#undef Z3H
  1729	
  1730	#undef ZER
  1731	#undef SEL1
  1732	#undef CAR1
  1733	#undef CAR2
  1734	
  1735	// p256PointDoubleAsm(P3, P1 *p256Point)
  1736	// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1737	// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1738	// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1739	#define P3ptr   R1
  1740	#define P1ptr   R2
  1741	#define CPOOL   R4
  1742	
  1743	// Temporaries in REGs
  1744	#define X3L    V15
  1745	#define X3H    V16
  1746	#define Y3L    V17
  1747	#define Y3H    V18
  1748	#define T1L    V19
  1749	#define T1H    V20
  1750	#define T2L    V21
  1751	#define T2H    V22
  1752	#define T3L    V23
  1753	#define T3H    V24
  1754	
  1755	#define X1L    V6
  1756	#define X1H    V7
  1757	#define Y1L    V8
  1758	#define Y1H    V9
  1759	#define Z1L    V10 // NOTE(review): Z1L/Z1H are never used as operands in the body below (Z1 is loaded straight into X/Y)
  1760	#define Z1H    V11 // NOTE(review): same register as TT0 below
  1761	
  1762	// Temps for Sub and Add
  1763	#define TT0  V11
  1764	#define TT1  V12
  1765	#define T2   V13
  1766	
  1767	// p256MulAsm Parameters
  1768	#define X0    V0
  1769	#define X1    V1
  1770	#define Y0    V2
  1771	#define Y1    V3
  1772	#define T0    V4
  1773	#define T1    V5
  1774	
  1775	#define PL    V30
  1776	#define PH    V31
  1777	
  1778	#define Z3L    V23 // same register as T3L; Z3 is stored directly from T1/T0 and this name is not used below
  1779	#define Z3H    V24 // same register as T3H
  1780	
  1781	#define ZER   V26
  1782	#define SEL1  V27
  1783	#define CAR1  V28
  1784	#define CAR2  V29
  1785	/*
  1786	 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1787	 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1788	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1789	 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1790	 * 	B  = 2Y₁
  1791	 * 	Z₃ = B×Z₁
  1792	 * 	C  = B²
  1793	 * 	D  = C×X₁
  1794	 * 	X₃ = A²-2D
  1795	 * 	Y₃ = (D-X₃)×A-C²/2
  1796	 *
  1797	 * Three-operand formula:
  1798	 *       T1 = Z1²
  1799	 *       T2 = X1-T1
  1800	 *       T1 = X1+T1
  1801	 *       T2 = T2*T1
  1802	 *       T2 = 3*T2
  1803	 *       Y3 = 2*Y1
  1804	 *       Z3 = Y3*Z1
  1805	 *       Y3 = Y3²
  1806	 *       T3 = Y3*X1
  1807	 *       Y3 = Y3²
  1808	 *       Y3 = half*Y3
  1809	 *       X3 = T2²
  1810	 *       T1 = 2*T3
  1811	 *       X3 = X3-T1
  1812	 *       T1 = T3-X3
  1813	 *       T1 = T1*T2
  1814	 *       Y3 = T1-Y3
  1815	 */
  1816	
  1817	TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
  1818		MOVD P3+0(FP), P3ptr
  1819		MOVD P1+8(FP), P1ptr
  1820		// PH||PL = the P256 words at the start of p256mul<>; CPOOL stays live for p256MulInternal.
  1821		MOVD $p256mul<>+0x00(SB), CPOOL
  1822		VL   16(CPOOL), PL
  1823		VL   0(CPOOL), PH
  1824	
  1825		// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  1826		VL   64(P1ptr), X1       // Z1H
  1827		VL   80(P1ptr), X0       // Z1L
  1828		VLR  X0, Y0
  1829		VLR  X1, Y1
  1830		CALL p256MulInternal<>(SB)
  1831	
  1832		// SUB(X<X1-T)            // T2 = X1-T1
  1833		VL 0(P1ptr), X1H
  1834		VL 16(P1ptr), X1L
  1835		p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  1836	
  1837		// ADD(Y<X1+T)            // T1 = X1+T1
  1838		p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  1839	
  1840		// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  1841		CALL p256MulInternal<>(SB)
  1842	
  1843		// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  1844		p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  1845		p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  1846	
  1847		// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  1848		VL 32(P1ptr), Y1H
  1849		VL 48(P1ptr), Y1L
  1850		p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  1851	
  1852		// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  1853		VL   64(P1ptr), Y1       // Z1H
  1854		VL   80(P1ptr), Y0       // Z1L
  1855		CALL p256MulInternal<>(SB)
  1856		VST  T1, 64(P3ptr)
  1857		VST  T0, 80(P3ptr)
  1858	
  1859		// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1860		VLR  X0, Y0
  1861		VLR  X1, Y1
  1862		CALL p256MulInternal<>(SB)
  1863	
  1864		// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  1865		VLR  T0, X0
  1866		VLR  T1, X1
  1867		VL   0(P1ptr), Y1
  1868		VL   16(P1ptr), Y0
  1869		CALL p256MulInternal<>(SB)
  1870		VLR  T0, T3L
  1871		VLR  T1, T3H
  1872	
  1873		// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1874		VLR  X0, Y0
  1875		VLR  X1, Y1
  1876		CALL p256MulInternal<>(SB)
  1877	
  1878		// HAL(Y3<T)              // Y3 = half*Y3
  1879		p256HalfInternal(Y3H,Y3L, T1,T0)
  1880	
  1881		// X=T2; Y=T2; MUL; T-    // X3 = T2²
  1882		VLR  T2L, X0
  1883		VLR  T2H, X1
  1884		VLR  T2L, Y0
  1885		VLR  T2H, Y1
  1886		CALL p256MulInternal<>(SB)
  1887	
  1888		// ADD(T1<T3+T3)          // T1 = 2*T3
  1889		p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  1890	
  1891		// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  1892		p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  1893		VST X3H, 0(P3ptr)
  1894		VST X3L, 16(P3ptr)
  1895	
  1896		// SUB(X<T3-X3)           // T1 = T3-X3
  1897		p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  1898	
  1899		// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  1900		CALL p256MulInternal<>(SB)
  1901	
  1902		// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  1903		p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  1904	
  1905		VST Y3H, 32(P3ptr)
  1906		VST Y3L, 48(P3ptr)
  1907		RET
  1908	
  1909	#undef P3ptr
  1910	#undef P1ptr
  1911	#undef CPOOL
  1912	#undef X3L
  1913	#undef X3H
  1914	#undef Y3L
  1915	#undef Y3H
  1916	#undef T1L
  1917	#undef T1H
  1918	#undef T2L
  1919	#undef T2H
  1920	#undef T3L
  1921	#undef T3H
  1922	#undef X1L
  1923	#undef X1H
  1924	#undef Y1L
  1925	#undef Y1H
  1926	#undef Z1L
  1927	#undef Z1H
  1928	#undef TT0
  1929	#undef TT1
  1930	#undef T2
  1931	#undef X0
  1932	#undef X1
  1933	#undef Y0
  1934	#undef Y1
  1935	#undef T0
  1936	#undef T1
  1937	#undef PL
  1938	#undef PH
  1939	#undef Z3L
  1940	#undef Z3H
  1941	#undef ZER
  1942	#undef SEL1
  1943	#undef CAR1
  1944	#undef CAR2
  1945	
  1946	// p256PointAddAsm(P3, P1, P2 *p256Point)
  1947	#define P3ptr   R1
  1948	#define P1ptr   R2
  1949	#define P2ptr   R3
  1950	#define CPOOL   R4
  1951	
  1952	// Temporaries in REGs
  1953	#define T1L   V16
  1954	#define T1H   V17
  1955	#define T2L   V18
  1956	#define T2H   V19
  1957	#define U1L   V20
  1958	#define U1H   V21
  1959	#define S1L   V22
  1960	#define S1H   V23
  1961	#define HL    V24
  1962	#define HH    V25
  1963	#define RL    V26
  1964	#define RH    V27
  1965	
  1966	// Temps for Sub and Add
  1967	#define ZER   V6
  1968	#define SEL1  V7
  1969	#define CAR1  V8
  1970	#define CAR2  V9
  1971	#define TT0  V11
  1972	#define TT1  V12
  1973	#define T2   V13
  1974	
  1975	// p256MulAsm Parameters
  1976	#define X0    V0
  1977	#define X1    V1
  1978	#define Y0    V2
  1979	#define Y1    V3
  1980	#define T0    V4
  1981	#define T1    V5
  1982	
  1983	#define PL    V30
  1984	#define PH    V31
  1985	/*
  1986	 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  1987	 *
  1988	 * A = X₁×Z₂²
  1989	 * B = Y₁×Z₂³
  1990	 * C = X₂×Z₁²-A
  1991	 * D = Y₂×Z₁³-B
  1992	 * X₃ = D² - 2A×C² - C³
  1993	 * Y₃ = D×(A×C² - X₃) - B×C³
  1994	 * Z₃ = Z₁×Z₂×C
  1995	 *
  1996	 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  1997	 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  1998	 *
  1999	 * T1 = Z1*Z1
  2000	 * T2 = Z2*Z2
  2001	 * U1 = X1*T2
  2002	 * H  = X2*T1
  2003	 * H  = H-U1
  2004	 * Z3 = Z1*Z2
  2005	 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2006	 *
  2007	 * S1 = Z2*T2
  2008	 * S1 = Y1*S1
  2009	 * R  = Z1*T1
  2010	 * R  = Y2*R
  2011	 * R  = R-S1
  2012	 *
  2013	 * T1 = H*H
  2014	 * T2 = H*T1
  2015	 * U1 = U1*T1
  2016	 *
  2017	 * X3 = R*R
  2018	 * X3 = X3-T2
  2019	 * T1 = 2*U1
  2020	 * X3 = X3-T1 << store-out X3 result reg
  2021	 *
  2022	 * T2 = S1*T2
  2023	 * Y3 = U1-X3
  2024	 * Y3 = R*Y3
  2025	 * Y3 = Y3-T2 << store-out Y3 result reg
  2026	
  2027	 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2028		// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2029		// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2030		// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2031		// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2032		// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2033		// SUB(H<H-T)            // H  = H-U1
  2034		// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2035		// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2036		// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2037		// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2038		// SUB(R<T-S1)           // R  = R-S1
  2039		// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2040		// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2041		// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2042		// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2043		// SUB(T<T-T2)           // X3 = X3-T2
  2044		// ADD(X<U1+U1)          // T1 = 2*U1
  2045		// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2046		// SUB(Y<U1-T)           // Y3 = U1-X3
  2047		// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2048		// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2049		// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2050		*/
  2051	TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 // P3 = P1 + P2 via the step list in the comment above; straight-line code, no branches
  2052		MOVD P3+0(FP), P3ptr // 1st argument: pointer to the result point (only written)
  2053		MOVD P1+8(FP), P1ptr // 2nd argument: pointer to the first input point
  2054		MOVD P2+16(FP), P2ptr // 3rd argument: pointer to the second input point
  2055	
  2056		MOVD $p256mul<>+0x00(SB), CPOOL // CPOOL = base of the p256mul<> constant table
  2057		VL   16(CPOOL), PL // PL = table bytes 16..31: low 128 bits of the P256 prime
  2058		VL   0(CPOOL), PH // PH = table bytes 0..15: high 128 bits of the P256 prime
  2059	
  2060		// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2061		VL   64(P1ptr), X1       // Z1H
  2062		VL   80(P1ptr), X0       // Z1L
  2063		VLR  X0, Y0 // Y = X, so the multiply below squares Z1
  2064		VLR  X1, Y1
  2065		CALL p256MulInternal<>(SB) // T (T1:T0) = X*Y; presumably reduced mod P256 using PH/PL -- helper body not visible here
  2066	
  2067		// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2068		VLR  T0, Y0 // Y = Z1*Z1; X is not reloaded, so this relies on the call leaving X (= Z1) intact
  2069		VLR  T1, Y1
  2070		CALL p256MulInternal<>(SB)
  2071		VLR  T0, RL // keep Z1*Z1*Z1 in R for the later Y2*R multiply
  2072		VLR  T1, RH
  2073	
  2074		// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2075		VL   0(P2ptr), X1        // X2H
  2076		VL   16(P2ptr), X0       // X2L (Y is not reloaded: relies on Y still holding Z1*Z1)
  2077		CALL p256MulInternal<>(SB)
  2078		VLR  T0, HL
  2079		VLR  T1, HH
  2080	
  2081		// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2082		VL   64(P2ptr), X1       // Z2H
  2083		VL   80(P2ptr), X0       // Z2L
  2084		VLR  X0, Y0
  2085		VLR  X1, Y1
  2086		CALL p256MulInternal<>(SB)
  2087	
  2088		// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2089		VLR  T0, Y0 // Y = Z2*Z2; X still holds Z2
  2090		VLR  T1, Y1
  2091		CALL p256MulInternal<>(SB)
  2092		VLR  T0, S1L
  2093		VLR  T1, S1H
  2094	
  2095		// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2096		VL   0(P1ptr), X1        // X1H
  2097		VL   16(P1ptr), X0       // X1L (Y still holds Z2*Z2)
  2098		CALL p256MulInternal<>(SB)
  2099		VLR  T0, U1L
  2100		VLR  T1, U1H
  2101	
  2102		// SUB(H<H-T)            // H  = H-U1
  2103		p256SubInternal(HH,HL,HH,HL,T1,T0) // T still holds U1 from the multiply just above
  2104	
  2105		// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2106		VL   64(P1ptr), X1       // Z1H
  2107		VL   80(P1ptr), X0       // Z1L
  2108		VL   64(P2ptr), Y1       // Z2H
  2109		VL   80(P2ptr), Y0       // Z2L
  2110		CALL p256MulInternal<>(SB)
  2111	
  2112		// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2113		VLR  T0, X0
  2114		VLR  T1, X1
  2115		VLR  HL, Y0
  2116		VLR  HH, Y1
  2117		CALL p256MulInternal<>(SB)
  2118		VST  T1, 64(P3ptr) // Z3H; no later load reads offset 64/80 of P1 or P2, so overwriting an aliased Z1/Z2 here is harmless
  2119		VST  T0, 80(P3ptr) // Z3L
  2120	
  2121		// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2122		VL   32(P1ptr), X1 // Y1H
  2123		VL   48(P1ptr), X0 // Y1L
  2124		VLR  S1L, Y0
  2125		VLR  S1H, Y1
  2126		CALL p256MulInternal<>(SB)
  2127		VLR  T0, S1L
  2128		VLR  T1, S1H
  2129	
  2130		// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2131		VL   32(P2ptr), X1 // Y2H
  2132		VL   48(P2ptr), X0 // Y2L
  2133		VLR  RL, Y0
  2134		VLR  RH, Y1
  2135		CALL p256MulInternal<>(SB)
  2136	
  2137		// SUB(R<T-S1)           // R  = T-S1
  2138		p256SubInternal(RH,RL,T1,T0,S1H,S1L) // R = (Y2*R) - S1; the product is taken straight from T
  2139	
  2140		// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2141		VLR  HL, X0
  2142		VLR  HH, X1
  2143		VLR  HL, Y0
  2144		VLR  HH, Y1
  2145		CALL p256MulInternal<>(SB)
  2146	
  2147		// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2148		VLR  T0, Y0 // Y = H*H; X still holds H
  2149		VLR  T1, Y1
  2150		CALL p256MulInternal<>(SB)
  2151		VLR  T0, T2L
  2152		VLR  T1, T2H
  2153	
  2154		// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2155		VLR  U1L, X0 // Y still holds H*H
  2156		VLR  U1H, X1
  2157		CALL p256MulInternal<>(SB)
  2158		VLR  T0, U1L
  2159		VLR  T1, U1H
  2160	
  2161		// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2162		VLR  RL, X0
  2163		VLR  RH, X1
  2164		VLR  RL, Y0
  2165		VLR  RH, Y1
  2166		CALL p256MulInternal<>(SB)
  2167	
  2168		// SUB(T<T-T2)           // X3 = X3-T2
  2169		p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2170	
  2171		// ADD(X<U1+U1)          // T1 = 2*U1
  2172		p256AddInternal(X1,X0,U1H,U1L,U1H,U1L) // the X registers are used as scratch for 2*U1
  2173	
  2174		// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2175		p256SubInternal(T1,T0,T1,T0,X1,X0)
  2176		VST T1, 0(P3ptr) // X3H; no loads from P1 or P2 follow this store
  2177		VST T0, 16(P3ptr) // X3L
  2178	
  2179		// SUB(Y<U1-T)           // Y3 = U1-X3
  2180		p256SubInternal(Y1,Y0,U1H,U1L,T1,T0) // T still holds X3; the difference goes straight into Y for the next multiply
  2181	
  2182		// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2183		VLR  RL, X0
  2184		VLR  RH, X1
  2185		CALL p256MulInternal<>(SB)
  2186		VLR  T0, U1L // U1 is reused to hold R*(U1-X3)
  2187		VLR  T1, U1H
  2188	
  2189		// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2190		VLR  S1L, X0
  2191		VLR  S1H, X1
  2192		VLR  T2L, Y0
  2193		VLR  T2H, Y1
  2194		CALL p256MulInternal<>(SB)
  2195	
  2196		// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2197		p256SubInternal(T1,T0,U1H,U1L,T1,T0) // T = R*(U1-X3) - S1*T2
  2198		VST T1, 32(P3ptr) // Y3H
  2199		VST T0, 48(P3ptr) // Y3L
  2200	
  2201		RET

View as plain text