
Text file src/crypto/sha256/sha256block_amd64.s

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	// SHA256 block routine. See sha256block.go for Go equivalent.
     8	//
     9	// The algorithm is detailed in FIPS 180-4:
    10	//
    11	//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12	
    13	// The AVX2 version is described in an Intel white paper:
    14	// "Fast SHA-256 Implementations on Intel Architecture Processors"
    15	// To find it, go to http://www.intel.com/p/en_US/embedded
    16	// and search for that title.
    17	// AVX2 version by Intel, same algorithm as code in Linux kernel:
    18	// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
    19	// by
    20	//     James Guilford <james.guilford@intel.com>
    21	//     Kirk Yap <kirk.s.yap@intel.com>
    22	//     Tim Chen <tim.c.chen@linux.intel.com>
    23	
    24	// Wt = Mt; for 0 <= t <= 15
    25	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    26	//
    27	// a = H0
    28	// b = H1
    29	// c = H2
    30	// d = H3
    31	// e = H4
    32	// f = H5
    33	// g = H6
    34	// h = H7
    35	//
    36	// for t = 0 to 63 {
    37	//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    38	//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    39	//    h = g
    40	//    g = f
    41	//    f = e
    42	//    e = d + T1
    43	//    d = c
    44	//    c = b
    45	//    b = a
    46	//    a = T1 + T2
    47	// }
    48	//
    49	// H0 = a + H0
    50	// H1 = b + H1
    51	// H2 = c + H2
    52	// H3 = d + H3
    53	// H4 = e + H4
    54	// H5 = f + H5
    55	// H6 = g + H6
    56	// H7 = h + H7
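//
// For reference, a rough Go sketch of this whole routine follows (commented
// out; compare sha256block.go). blockGo, the state parameter h and the
// constant table k are illustrative names, not identifiers used by this
// package.
//
//	import (
//		"encoding/binary"
//		"math/bits"
//	)
//
//	// blockGo folds one or more 64-byte blocks of p into the state h.
//	func blockGo(h *[8]uint32, p []byte, k *[64]uint32) {
//		var w [64]uint32
//		for len(p) >= 64 {
//			// Wt = Mt; for 0 <= t <= 15 (big-endian load).
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint32(p[t*4:])
//			}
//			// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63.
//			for t := 16; t < 64; t++ {
//				v1, v2 := w[t-2], w[t-15]
//				s1 := bits.RotateLeft32(v1, -17) ^ bits.RotateLeft32(v1, -19) ^ (v1 >> 10)
//				s0 := bits.RotateLeft32(v2, -7) ^ bits.RotateLeft32(v2, -18) ^ (v2 >> 3)
//				w[t] = s1 + w[t-7] + s0 + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 64; t++ {
//				t1 := hh + (bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)) +
//					((e & f) ^ (^e & g)) + k[t] + w[t]
//				t2 := (bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)) +
//					((a & b) ^ (a & c) ^ (b & c))
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0], h[1], h[2], h[3] = h[0]+a, h[1]+b, h[2]+c, h[3]+d
//			h[4], h[5], h[6], h[7] = h[4]+e, h[5]+f, h[6]+g, h[7]+hh
//			p = p[64:]
//		}
//	}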
    57	
    58	// Wt = Mt; for 0 <= t <= 15
    59	#define MSGSCHEDULE0(index) \
    60		MOVL	(index*4)(SI), AX; \
    61		BSWAPL	AX; \
    62		MOVL	AX, (index*4)(BP)
    63	
    64	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    65	//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    66	//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    67	#define MSGSCHEDULE1(index) \
    68		MOVL	((index-2)*4)(BP), AX; \
    69		MOVL	AX, CX; \
    70		RORL	$17, AX; \
    71		MOVL	CX, DX; \
    72		RORL	$19, CX; \
    73		SHRL	$10, DX; \
    74		MOVL	((index-15)*4)(BP), BX; \
    75		XORL	CX, AX; \
    76		MOVL	BX, CX; \
    77		XORL	DX, AX; \
    78		RORL	$7, BX; \
    79		MOVL	CX, DX; \
    80		SHRL	$3, DX; \
    81		RORL	$18, CX; \
    82		ADDL	((index-7)*4)(BP), AX; \
    83		XORL	CX, BX; \
    84		XORL	DX, BX; \
    85		ADDL	((index-16)*4)(BP), BX; \
    86		ADDL	BX, AX; \
    87		MOVL	AX, ((index)*4)(BP)
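
// For reference, a commented-out Go form of this step (bits.RotateLeft32 with
// a negative count is a right rotate; w and t are illustrative names):
//
//	v1, v2 := w[t-2], w[t-15]
//	sigma1 := bits.RotateLeft32(v1, -17) ^ bits.RotateLeft32(v1, -19) ^ (v1 >> 10)
//	sigma0 := bits.RotateLeft32(v2, -7) ^ bits.RotateLeft32(v2, -18) ^ (v2 >> 3)
//	w[t] = sigma1 + w[t-7] + sigma0 + w[t-16]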
    88	
    89	// Calculate T1 in AX - uses AX, CX and DX registers.
    90	// h is also used as an accumulator. Wt is passed in AX.
    91	//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    92	//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
    93	//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    94	#define SHA256T1(const, e, f, g, h) \
    95		ADDL	AX, h; \
    96		MOVL	e, AX; \
    97		ADDL	$const, h; \
    98		MOVL	e, CX; \
    99		RORL	$6, AX; \
   100		MOVL	e, DX; \
   101		RORL	$11, CX; \
   102		XORL	CX, AX; \
   103		MOVL	e, CX; \
   104		RORL	$25, DX; \
   105		ANDL	f, CX; \
   106		XORL	AX, DX; \
   107		MOVL	e, AX; \
   108		NOTL	AX; \
   109		ADDL	DX, h; \
   110		ANDL	g, AX; \
   111		XORL	CX, AX; \
   112		ADDL	h, AX
   113	
   114	// Calculate T2 in BX - uses BX, CX, DX and DI registers.
   115	//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   116	//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   117	//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   118	#define SHA256T2(a, b, c) \
   119		MOVL	a, DI; \
   120		MOVL	c, BX; \
   121		RORL	$2, DI; \
   122		MOVL	a, DX; \
   123		ANDL	b, BX; \
   124		RORL	$13, DX; \
   125		MOVL	a, CX; \
   126		ANDL	c, CX; \
   127		XORL	DX, DI; \
   128		XORL	CX, BX; \
   129		MOVL	a, DX; \
   130		MOVL	b, CX; \
   131		RORL	$22, DX; \
   132		ANDL	a, CX; \
   133		XORL	CX, BX; \
   134		XORL	DX, DI; \
   135		ADDL	DI, BX
   136	
   137	// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   138	// The values for e and a are stored in d and h, ready for rotation.
   139	#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
   140		SHA256T1(const, e, f, g, h); \
   141		SHA256T2(a, b, c); \
   142		MOVL	BX, h; \
   143		ADDL	AX, d; \
   144		ADDL	AX, h
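
// For reference, a commented-out Go sketch of what one SHA256ROUND performs
// (k and w stand for the round constant and schedule word; illustrative only):
//
//	t1 := h + (bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)) +
//		((e & f) ^ (^e & g)) + k + w
//	t2 := (bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)) +
//		((a & b) ^ (a & c) ^ (b & c))
//	d += t1     // this register holds e for the next round
//	h = t1 + t2 // this register holds a for the next round
//
// The callers below then rotate the register names instead of moving values.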
   145	
   146	#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
   147		MSGSCHEDULE0(index); \
   148		SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   149	
   150	#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
   151		MSGSCHEDULE1(index); \
   152		SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   153	
   154	
   155	// Definitions for AVX2 version
   156	
   157	// addm (mem), reg
   158	// Add reg to mem using a reg-to-mem add, then copy the sum back into reg
   159	#define addm(P1, P2) \
   160		ADDL P2, P1; \
   161		MOVL P1, P2
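
// In Go terms, addm(mem, reg) does: mem += reg; reg = mem, so the updated
// digest word ends up both in memory and back in the register.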
   162	
   163	#define XDWORD0 Y4
   164	#define XDWORD1 Y5
   165	#define XDWORD2 Y6
   166	#define XDWORD3 Y7
   167	
   168	#define XWORD0 X4
   169	#define XWORD1 X5
   170	#define XWORD2 X6
   171	#define XWORD3 X7
   172	
   173	#define XTMP0 Y0
   174	#define XTMP1 Y1
   175	#define XTMP2 Y2
   176	#define XTMP3 Y3
   177	#define XTMP4 Y8
   178	#define XTMP5 Y11
   179	
   180	#define XFER  Y9
   181	
   182	#define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
   183	#define X_BYTE_FLIP_MASK X13
   184	
   185	#define NUM_BYTES DX
   186	#define INP	DI
   187	
   188	#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
   189	
   190	#define a AX
   191	#define b BX
   192	#define c CX
   193	#define d R8
   194	#define e DX
   195	#define f R9
   196	#define g R10
   197	#define h R11
   198	
   199	#define old_h R11
   200	
   201	#define TBL BP
   202	
   203	#define SRND SI // SRND is same register as CTX
   204	
   205	#define T1 R12
   206	
   207	#define y0 R13
   208	#define y1 R14
   209	#define y2 R15
   210	#define y3 DI
   211	
   212	// Offsets
   213	#define XFER_SIZE 2*64*4
   214	#define INP_END_SIZE 8
   215	#define INP_SIZE 8
   216	#define TMP_SIZE 4
   217	
   218	#define _XFER 0
   219	#define _INP_END _XFER + XFER_SIZE
   220	#define _INP _INP_END + INP_END_SIZE
   221	#define _TMP _INP + INP_SIZE
   222	#define STACK_SIZE _TMP + TMP_SIZE
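
// With the sizes above, the layout resolves to: _XFER at 0 (512 bytes of
// k+w values, two blocks' worth), _INP_END at 512, _INP at 520, _TMP at 528,
// and STACK_SIZE = 532; the TEXT directive below reserves a slightly larger
// $536 frame.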
   223	
   224	#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   225		;                                     \ // #############################  RND N + 0 ############################//
   226		MOVL     a, y3;                       \ // y3 = a					// MAJA
   227		RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
   228		RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
   229		;                                     \
   230		ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
   231		ORL      c, y3;                       \ // y3 = a|c				// MAJA
   232		VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
   233		MOVL     f, y2;                       \ // y2 = f				// CH
   234		RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
   235		;                                     \
   236		XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
   237		XORL     g, y2;                       \ // y2 = f^g                              	// CH
   238		VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
   239		RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   240		;                                     \
   241		ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
   242		XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   243		RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
   244		ADDL     h, d;                        \ // d = k + w + h + d                     	// --
   245		;                                     \
   246		ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
   247		VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
   248		XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   249		RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   250		;                                     \
   251		XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   252		VPSRLD   $7, XTMP1, XTMP2;            \
   253		XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   254		MOVL     a, T1;                       \ // T1 = a								// MAJB
   255		ANDL     c, T1;                       \ // T1 = a&c								// MAJB
   256		;                                     \
   257		ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
   258		VPSLLD   $(32-7), XTMP1, XTMP3;       \
   259		ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   260		ADDL     y1, h;                       \ // h = k + w + h + S0					// --
   261		;                                     \
   262		ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   263		VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
   264		;                                     \
   265		VPSRLD   $18, XTMP1, XTMP2;           \
   266		ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   267		ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
   268	
   269	#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   270		;                                    \ // ################################### RND N + 1 ############################
   271		;                                    \
   272		MOVL    a, y3;                       \ // y3 = a                       // MAJA
   273		RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   274		RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   275		ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
   276		ORL     c, y3;                       \ // y3 = a|c						// MAJA
   277		;                                    \
   278		VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
   279		MOVL    f, y2;                       \ // y2 = f						// CH
   280		RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   281		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   282		XORL    g, y2;                       \ // y2 = f^g						// CH
   283		;                                    \
   284		RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   285		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   286		RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   287		ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   288		ADDL    h, d;                        \ // d = k + w + h + d				// --
   289		;                                    \
   290		VPSLLD  $(32-18), XTMP1, XTMP1;      \
   291		ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   292		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   293		;                                    \
   294		VPXOR   XTMP1, XTMP3, XTMP3;         \
   295		RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   296		XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
   297		;                                    \
   298		VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
   299		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   300		MOVL    a, T1;                       \ // T1 = a						// MAJB
   301		ANDL    c, T1;                       \ // T1 = a&c						// MAJB
   302		ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   303		;                                    \
   304		VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
   305		VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
   306		ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   307		ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   308		;                                    \
   309		VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
   310		ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   311		ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   312		ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
   313		;                                    \
   314		VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
   315	
   316	#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   317		;                                    \ // ################################### RND N + 2 ############################
   318		;                                    \
   319		MOVL    a, y3;                       \ // y3 = a							// MAJA
   320		RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
   321		ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
   322		;                                    \
   323		VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
   324		RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
   325		ORL     c, y3;                       \ // y3 = a|c                         // MAJA
   326		MOVL    f, y2;                       \ // y2 = f                           // CH
   327		XORL    g, y2;                       \ // y2 = f^g                         // CH
   328		;                                    \
   329		RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
   330		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
   331		VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
   332		ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   333		;                                    \
   334		RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
   335		VPXOR   XTMP3, XTMP2, XTMP2;         \
   336		ADDL    h, d;                        \ // d = k + w + h + d				// --
   337		ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
   338		;                                    \
   339		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   340		RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   341		VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
   342		XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   343		;                                    \
   344		MOVL    f, _TMP(SP);                 \
   345		MOVQ    $shuff_00BA<>(SB), f;        \ // f is used to keep SHUF_00BA
   346		VPSHUFB (f), XTMP4, XTMP4;           \ // XTMP4 = s1 {00BA}
   347		MOVL    _TMP(SP), f;                 \ // f is restored
   348		;                                    \
   349		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   350		RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   351		VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
   352		;                                    \
   353		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   354		MOVL    a, T1;                       \ // T1 = a                                // MAJB
   355		ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
   356		ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
   357		VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
   358		;                                    \
   359		ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   360		ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   361		ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   362		ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   363		;                                    \
   364		ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
   365	
   366	#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   367		;                                    \ // ################################### RND N + 3 ############################
   368		;                                    \
   369		MOVL    a, y3;                       \ // y3 = a						// MAJA
   370		RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   371		RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   372		ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
   373		ORL     c, y3;                       \ // y3 = a|c                     // MAJA
   374		;                                    \
   375		VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
   376		MOVL    f, y2;                       \ // y2 = f						// CH
   377		RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   378		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   379		XORL    g, y2;                       \ // y2 = f^g						// CH
   380		;                                    \
   381		VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
   382		RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   383		ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
   384		ADDL    h, d;                        \ // d = k + w + h + d			// --
   385		ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   386		;                                    \
   387		VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
   388		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   389		XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   390		;                                    \
   391		VPXOR   XTMP3, XTMP2, XTMP2;         \
   392		RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
   393		ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   394		;                                    \
   395		VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
   396		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   397		ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   398		;                                    \
   399		RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   400		;                                    \
   401		MOVL    f, _TMP(SP);                 \ // Save f
   402		MOVQ    $shuff_DC00<>(SB), f;        \ // f is used to keep SHUF_DC00
   403		VPSHUFB (f), XTMP5, XTMP5;           \ // XTMP5 = s1 {DC00}
   404		MOVL    _TMP(SP), f;                 \ // Restore f
   405		;                                    \
   406		VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   407		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   408		MOVL    a, T1;                       \ // T1 = a							// MAJB
   409		ANDL    c, T1;                       \ // T1 = a&c							// MAJB
   410		ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
   411		;                                    \
   412		ADDL    y1, h;                       \ // h = k + w + h + S0				// --
   413		ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   414		ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
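
// Taken together, the four ROUND_AND_SCHED macros above perform four rounds
// and extend the message schedule by four words, with the same work done in
// both 128-bit lanes (the two lanes carry two consecutive message blocks).
// A commented-out scalar Go sketch of the schedule part (w and t are
// illustrative names):
//
//	for i := 0; i < 4; i++ {
//		v1, v2 := w[t+i-2], w[t+i-15]
//		s1 := bits.RotateLeft32(v1, -17) ^ bits.RotateLeft32(v1, -19) ^ (v1 >> 10)
//		s0 := bits.RotateLeft32(v2, -7) ^ bits.RotateLeft32(v2, -18) ^ (v2 >> 3)
//		w[t+i] = s1 + w[t+i-7] + s0 + w[t+i-16]
//	}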
   415	
   416	#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
   417		;                                  \ // ################################### RND N + 0 ###########################
   418		MOVL  f, y2;                       \ // y2 = f					// CH
   419		RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   420		RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   421		XORL  g, y2;                       \ // y2 = f^g					// CH
   422		;                                  \
   423		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
   424		RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
   425		ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
   426		;                                  \
   427		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   428		RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
   429		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   430		RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
   431		MOVL  a, y3;                       \ // y3 = a							// MAJA
   432		;                                  \
   433		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
   434		RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
   435		ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   436		ORL   c, y3;                       \ // y3 = a|c							// MAJA
   437		;                                  \
   438		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   439		MOVL  a, T1;                       \ // T1 = a							// MAJB
   440		ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
   441		ANDL  c, T1;                       \ // T1 = a&c							// MAJB
   442		ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
   443		;                                  \
   444		ADDL  h, d;                        \ // d = k + w + h + d					// --
   445		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   446		ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   447		ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
   448	
   449	#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
   450		;                                  \ // ################################### RND N + 1 ###########################
   451		ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
   452		MOVL  f, y2;                       \ // y2 = f                                // CH
   453		RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   454		RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   455		XORL  g, y2;                       \ // y2 = f^g                             // CH
   456		;                                  \
   457		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   458		RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   459		ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
   460		ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
   461		;                                  \
   462		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   463		RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   464		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   465		RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   466		MOVL  a, y3;                       \ // y3 = a                               // MAJA
   467		;                                  \
   468		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   469		RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   470		ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   471		ORL   c, y3;                       \ // y3 = a|c                             // MAJA
   472		;                                  \
   473		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   474		MOVL  a, T1;                       \ // T1 = a                               // MAJB
   475		ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
   476		ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
   477		ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
   478		;                                  \
   479		ADDL  h, d;                        \ // d = k + w + h + d                    // --
   480		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)           // MAJ
   481		ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
   482		;                                  \
   483		ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   484	
   485	#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
   486		;                                  \ // ################################### RND N + 2 ##############################
   487		ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   488		MOVL  f, y2;                       \ // y2 = f								// CH
   489		RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   490		RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   491		XORL  g, y2;                       \ // y2 = f^g								// CH
   492		;                                  \
   493		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   494		RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   495		ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   496		ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   497		;                                  \
   498		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   499		RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   500		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   501		RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   502		MOVL  a, y3;                       \ // y3 = a								// MAJA
   503		;                                  \
   504		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   505		RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   506		ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   507		ORL   c, y3;                       \ // y3 = a|c								// MAJA
   508		;                                  \
   509		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   510		MOVL  a, T1;                       \ // T1 = a								// MAJB
   511		ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   512		ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   513		ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   514		;                                  \
   515		ADDL  h, d;                        \ // d = k + w + h + d					// --
   516		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   517		ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   518		;                                  \
   519		ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   520	
   521	#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
   522		;                                  \ // ################################### RND N + 3 ###########################
   523		ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   524		MOVL  f, y2;                       \ // y2 = f								// CH
   525		RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   526		RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   527		XORL  g, y2;                       \ // y2 = f^g								// CH
   528		;                                  \
   529		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   530		RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   531		ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   532		ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   533		;                                  \
   534		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   535		RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   536		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   537		RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   538		MOVL  a, y3;                       \ // y3 = a								// MAJA
   539		;                                  \
   540		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   541		RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   542		ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   543		ORL   c, y3;                       \ // y3 = a|c								// MAJA
   544		;                                  \
   545		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   546		MOVL  a, T1;                       \ // T1 = a								// MAJB
   547		ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   548		ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   549		ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   550		;                                  \
   551		ADDL  h, d;                        \ // d = k + w + h + d					// --
   552		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   553		ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   554		;                                  \
   555		ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
   556		;                                  \
   557		ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   558		;                                  \
   559		ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
   560	
   561	TEXT ·block(SB), 0, $536-32
   562		CMPB ·useAVX2(SB), $1
   563		JE   avx2
   564	
   565		MOVQ p_base+8(FP), SI
   566		MOVQ p_len+16(FP), DX
   567		SHRQ $6, DX
   568		SHLQ $6, DX
   569	
   570		LEAQ (SI)(DX*1), DI
   571		MOVQ DI, 256(SP)
   572		CMPQ SI, DI
   573		JEQ  end
   574	
   575		MOVQ dig+0(FP), BP
   576		MOVL (0*4)(BP), R8  // a = H0
   577		MOVL (1*4)(BP), R9  // b = H1
   578		MOVL (2*4)(BP), R10 // c = H2
   579		MOVL (3*4)(BP), R11 // d = H3
   580		MOVL (4*4)(BP), R12 // e = H4
   581		MOVL (5*4)(BP), R13 // f = H5
   582		MOVL (6*4)(BP), R14 // g = H6
   583		MOVL (7*4)(BP), R15 // h = H7
   584	
   585	loop:
   586		MOVQ SP, BP
   587	
   588		SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
   589		SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
   590		SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
   591		SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
   592		SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
   593		SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
   594		SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
   595		SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
   596		SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
   597		SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
   598		SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
   599		SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
   600		SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
   601		SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
   602		SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
   603		SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
   604	
   605		SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
   606		SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
   607		SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
   608		SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
   609		SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
   610		SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
   611		SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
   612		SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
   613		SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
   614		SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
   615		SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
   616		SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
   617		SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
   618		SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
   619		SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
   620		SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
   621		SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
   622		SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
   623		SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
   624		SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
   625		SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
   626		SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
   627		SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
   628		SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
   629		SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
   630		SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
   631		SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
   632		SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
   633		SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
   634		SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
   635		SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
   636		SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
   637		SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
   638		SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
   639		SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
   640		SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
   641		SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
   642		SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
   643		SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
   644		SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
   645		SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
   646		SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
   647		SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
   648		SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
   649		SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
   650		SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
   651		SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
   652		SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
   653	
   654		MOVQ dig+0(FP), BP
   655		ADDL (0*4)(BP), R8  // H0 = a + H0
   656		MOVL R8, (0*4)(BP)
   657		ADDL (1*4)(BP), R9  // H1 = b + H1
   658		MOVL R9, (1*4)(BP)
   659		ADDL (2*4)(BP), R10 // H2 = c + H2
   660		MOVL R10, (2*4)(BP)
   661		ADDL (3*4)(BP), R11 // H3 = d + H3
   662		MOVL R11, (3*4)(BP)
   663		ADDL (4*4)(BP), R12 // H4 = e + H4
   664		MOVL R12, (4*4)(BP)
   665		ADDL (5*4)(BP), R13 // H5 = f + H5
   666		MOVL R13, (5*4)(BP)
   667		ADDL (6*4)(BP), R14 // H6 = g + H6
   668		MOVL R14, (6*4)(BP)
   669		ADDL (7*4)(BP), R15 // H7 = h + H7
   670		MOVL R15, (7*4)(BP)
   671	
   672		ADDQ $64, SI
   673		CMPQ SI, 256(SP)
   674		JB   loop
   675	
   676	end:
   677		RET
   678	
   679	avx2:
   680		MOVQ dig+0(FP), CTX          // d.h[8]
   681		MOVQ p_base+8(FP), INP
   682		MOVQ p_len+16(FP), NUM_BYTES
   683	
   684		LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   685		MOVQ NUM_BYTES, _INP_END(SP)
   686	
   687		CMPQ NUM_BYTES, INP
   688		JE   avx2_only_one_block
   689	
   690		// Load initial digest
   691		MOVL 0(CTX), a  // a = H0
   692		MOVL 4(CTX), b  // b = H1
   693		MOVL 8(CTX), c  // c = H2
   694		MOVL 12(CTX), d // d = H3
   695		MOVL 16(CTX), e // e = H4
   696		MOVL 20(CTX), f // f = H5
   697		MOVL 24(CTX), g // g = H6
   698		MOVL 28(CTX), h // h = H7
   699	
   700	avx2_loop0: // each iteration works on one block (512 bits)
   701	
   702		VMOVDQU (0*32)(INP), XTMP0
   703		VMOVDQU (1*32)(INP), XTMP1
   704		VMOVDQU (2*32)(INP), XTMP2
   705		VMOVDQU (3*32)(INP), XTMP3
   706	
   707		MOVQ    $flip_mask<>(SB), BP // BYTE_FLIP_MASK
   708		VMOVDQU (BP), BYTE_FLIP_MASK
   709	
   710		// Apply Byte Flip Mask: LE -> BE
   711		VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   712		VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   713		VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   714		VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   715	
   716		// Transpose data into high/low parts
   717		VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   718		VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   719		VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   720		VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
   721	
   722		MOVQ $K256<>(SB), TBL // Load the address of the table of round constants
   723	
   724	avx2_last_block_enter:
   725		ADDQ $64, INP
   726		MOVQ INP, _INP(SP)
   727		XORQ SRND, SRND
   728	
   729	avx2_loop1: // for w0 - w47
   730		// Do 4 rounds and scheduling
   731		VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   732		VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   733		ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   734		ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   735		ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   736		ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   737	
   738		// Do 4 rounds and scheduling
   739		VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   740		VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   741		ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   742		ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   743		ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   744		ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   745	
   746		// Do 4 rounds and scheduling
   747		VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
   748		VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
   749		ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   750		ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   751		ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   752		ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   753	
   754		// Do 4 rounds and scheduling
   755		VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
   756		VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
   757		ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   758		ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   759		ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   760		ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   761	
   762		ADDQ $4*32, SRND
   763		CMPQ SRND, $3*4*32
   764		JB   avx2_loop1
   765	
   766	avx2_loop2:
   767		// w48 - w63 processed with no scheduling (last 16 rounds)
   768		VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   769		VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   770		DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
   771		DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
   772		DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
   773		DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
   774	
   775		VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   776		VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   777		DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
   778		DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
   779		DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
   780		DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
   781	
   782		ADDQ $2*32, SRND
   783	
   784		VMOVDQU XDWORD2, XDWORD0
   785		VMOVDQU XDWORD3, XDWORD1
   786	
   787		CMPQ SRND, $4*4*32
   788		JB   avx2_loop2
   789	
   790		MOVQ dig+0(FP), CTX // d.h[8]
   791		MOVQ _INP(SP), INP
   792	
   793		addm(  0(CTX), a)
   794		addm(  4(CTX), b)
   795		addm(  8(CTX), c)
   796		addm( 12(CTX), d)
   797		addm( 16(CTX), e)
   798		addm( 20(CTX), f)
   799		addm( 24(CTX), g)
   800		addm( 28(CTX), h)
   801	
   802		CMPQ _INP_END(SP), INP
   803		JB   done_hash
   804	
   805		XORQ SRND, SRND
   806	
   807	avx2_loop3: // Do second block using previously scheduled results
   808		DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
   809		DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
   810		DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
   811		DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
   812	
   813		DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
   814		DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
   815		DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
   816		DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
   817	
   818		ADDQ $2*32, SRND
   819		CMPQ SRND, $4*4*32
   820		JB   avx2_loop3
   821	
   822		MOVQ dig+0(FP), CTX // d.h[8]
   823		MOVQ _INP(SP), INP
   824		ADDQ $64, INP
   825	
   826		addm(  0(CTX), a)
   827		addm(  4(CTX), b)
   828		addm(  8(CTX), c)
   829		addm( 12(CTX), d)
   830		addm( 16(CTX), e)
   831		addm( 20(CTX), f)
   832		addm( 24(CTX), g)
   833		addm( 28(CTX), h)
   834	
   835		CMPQ _INP_END(SP), INP
   836		JA   avx2_loop0
   837		JB   done_hash
   838	
   839	avx2_do_last_block:
   840	
   841		VMOVDQU 0(INP), XWORD0
   842		VMOVDQU 16(INP), XWORD1
   843		VMOVDQU 32(INP), XWORD2
   844		VMOVDQU 48(INP), XWORD3
   845	
   846		MOVQ    $flip_mask<>(SB), BP
   847		VMOVDQU (BP), X_BYTE_FLIP_MASK
   848	
   849		VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   850		VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   851		VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   852		VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   853	
   854		MOVQ $K256<>(SB), TBL
   855	
   856		JMP avx2_last_block_enter
   857	
   858	avx2_only_one_block:
   859		// Load initial digest
   860		MOVL 0(CTX), a  // a = H0
   861		MOVL 4(CTX), b  // b = H1
   862		MOVL 8(CTX), c  // c = H2
   863		MOVL 12(CTX), d // d = H3
   864		MOVL 16(CTX), e // e = H4
   865		MOVL 20(CTX), f // f = H5
   866		MOVL 24(CTX), g // g = H6
   867		MOVL 28(CTX), h // h = H7
   868	
   869		JMP avx2_do_last_block
   870	
   871	done_hash:
   872		VZEROUPPER
   873		RET
   874	
   875	// shuffle byte order from LE to BE
   876	DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   877	DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   878	DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   879	DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   880	GLOBL flip_mask<>(SB), 8, $32
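
// A commented-out Go sketch of the effect of shuffling a 16-byte lane through
// this mask with VPSHUFB: it is equivalent to re-reading each 32-bit word
// big-endian (block and w are illustrative names):
//
//	for i := 0; i < 4; i++ {
//		w[i] = binary.BigEndian.Uint32(block[i*4:])
//	}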
   881	
   882	// shuffle xBxA -> 00BA
   883	DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
   884	DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   885	DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
   886	DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   887	GLOBL shuff_00BA<>(SB), 8, $32
   888	
   889	// shuffle xDxC -> DC00
   890	DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
   891	DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
   892	DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   893	DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
   894	GLOBL shuff_DC00<>(SB), 8, $32
   895	
   896	// Round specific constants
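// Each group of four constants is stored twice, once per 128-bit lane, so
// that both lanes (two message blocks processed side by side) see the same
// K values.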
   897	DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
   898	DATA K256<>+0x04(SB)/4, $0x71374491 // k2
   899	DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
   900	DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
   901	DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
   902	DATA K256<>+0x14(SB)/4, $0x71374491 // k2
   903	DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
   904	DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
   905	
   906	DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
   907	DATA K256<>+0x24(SB)/4, $0x59f111f1
   908	DATA K256<>+0x28(SB)/4, $0x923f82a4
   909	DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
   910	DATA K256<>+0x30(SB)/4, $0x3956c25b
   911	DATA K256<>+0x34(SB)/4, $0x59f111f1
   912	DATA K256<>+0x38(SB)/4, $0x923f82a4
   913	DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
   914	
   915	DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
   916	DATA K256<>+0x44(SB)/4, $0x12835b01
   917	DATA K256<>+0x48(SB)/4, $0x243185be
   918	DATA K256<>+0x4c(SB)/4, $0x550c7dc3
   919	DATA K256<>+0x50(SB)/4, $0xd807aa98
   920	DATA K256<>+0x54(SB)/4, $0x12835b01
   921	DATA K256<>+0x58(SB)/4, $0x243185be
   922	DATA K256<>+0x5c(SB)/4, $0x550c7dc3
   923	
   924	DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
   925	DATA K256<>+0x64(SB)/4, $0x80deb1fe
   926	DATA K256<>+0x68(SB)/4, $0x9bdc06a7
   927	DATA K256<>+0x6c(SB)/4, $0xc19bf174
   928	DATA K256<>+0x70(SB)/4, $0x72be5d74
   929	DATA K256<>+0x74(SB)/4, $0x80deb1fe
   930	DATA K256<>+0x78(SB)/4, $0x9bdc06a7
   931	DATA K256<>+0x7c(SB)/4, $0xc19bf174
   932	
   933	DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
   934	DATA K256<>+0x84(SB)/4, $0xefbe4786
   935	DATA K256<>+0x88(SB)/4, $0x0fc19dc6
   936	DATA K256<>+0x8c(SB)/4, $0x240ca1cc
   937	DATA K256<>+0x90(SB)/4, $0xe49b69c1
   938	DATA K256<>+0x94(SB)/4, $0xefbe4786
   939	DATA K256<>+0x98(SB)/4, $0x0fc19dc6
   940	DATA K256<>+0x9c(SB)/4, $0x240ca1cc
   941	
   942	DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
   943	DATA K256<>+0xa4(SB)/4, $0x4a7484aa
   944	DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
   945	DATA K256<>+0xac(SB)/4, $0x76f988da
   946	DATA K256<>+0xb0(SB)/4, $0x2de92c6f
   947	DATA K256<>+0xb4(SB)/4, $0x4a7484aa
   948	DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
   949	DATA K256<>+0xbc(SB)/4, $0x76f988da
   950	
   951	DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
   952	DATA K256<>+0xc4(SB)/4, $0xa831c66d
   953	DATA K256<>+0xc8(SB)/4, $0xb00327c8
   954	DATA K256<>+0xcc(SB)/4, $0xbf597fc7
   955	DATA K256<>+0xd0(SB)/4, $0x983e5152
   956	DATA K256<>+0xd4(SB)/4, $0xa831c66d
   957	DATA K256<>+0xd8(SB)/4, $0xb00327c8
   958	DATA K256<>+0xdc(SB)/4, $0xbf597fc7
   959	
   960	DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
   961	DATA K256<>+0xe4(SB)/4, $0xd5a79147
   962	DATA K256<>+0xe8(SB)/4, $0x06ca6351
   963	DATA K256<>+0xec(SB)/4, $0x14292967
   964	DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
   965	DATA K256<>+0xf4(SB)/4, $0xd5a79147
   966	DATA K256<>+0xf8(SB)/4, $0x06ca6351
   967	DATA K256<>+0xfc(SB)/4, $0x14292967
   968	
   969	DATA K256<>+0x100(SB)/4, $0x27b70a85
   970	DATA K256<>+0x104(SB)/4, $0x2e1b2138
   971	DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
   972	DATA K256<>+0x10c(SB)/4, $0x53380d13
   973	DATA K256<>+0x110(SB)/4, $0x27b70a85
   974	DATA K256<>+0x114(SB)/4, $0x2e1b2138
   975	DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
   976	DATA K256<>+0x11c(SB)/4, $0x53380d13
   977	
   978	DATA K256<>+0x120(SB)/4, $0x650a7354
   979	DATA K256<>+0x124(SB)/4, $0x766a0abb
   980	DATA K256<>+0x128(SB)/4, $0x81c2c92e
   981	DATA K256<>+0x12c(SB)/4, $0x92722c85
   982	DATA K256<>+0x130(SB)/4, $0x650a7354
   983	DATA K256<>+0x134(SB)/4, $0x766a0abb
   984	DATA K256<>+0x138(SB)/4, $0x81c2c92e
   985	DATA K256<>+0x13c(SB)/4, $0x92722c85
   986	
   987	DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
   988	DATA K256<>+0x144(SB)/4, $0xa81a664b
   989	DATA K256<>+0x148(SB)/4, $0xc24b8b70
   990	DATA K256<>+0x14c(SB)/4, $0xc76c51a3
   991	DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
   992	DATA K256<>+0x154(SB)/4, $0xa81a664b
   993	DATA K256<>+0x158(SB)/4, $0xc24b8b70
   994	DATA K256<>+0x15c(SB)/4, $0xc76c51a3
   995	
   996	DATA K256<>+0x160(SB)/4, $0xd192e819
   997	DATA K256<>+0x164(SB)/4, $0xd6990624
   998	DATA K256<>+0x168(SB)/4, $0xf40e3585
   999	DATA K256<>+0x16c(SB)/4, $0x106aa070
  1000	DATA K256<>+0x170(SB)/4, $0xd192e819
  1001	DATA K256<>+0x174(SB)/4, $0xd6990624
  1002	DATA K256<>+0x178(SB)/4, $0xf40e3585
  1003	DATA K256<>+0x17c(SB)/4, $0x106aa070
  1004	
  1005	DATA K256<>+0x180(SB)/4, $0x19a4c116
  1006	DATA K256<>+0x184(SB)/4, $0x1e376c08
  1007	DATA K256<>+0x188(SB)/4, $0x2748774c
  1008	DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
  1009	DATA K256<>+0x190(SB)/4, $0x19a4c116
  1010	DATA K256<>+0x194(SB)/4, $0x1e376c08
  1011	DATA K256<>+0x198(SB)/4, $0x2748774c
  1012	DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
  1013	
  1014	DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
  1015	DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
  1016	DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
  1017	DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
  1018	DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
  1019	DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
  1020	DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
  1021	DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
  1022	
  1023	DATA K256<>+0x1c0(SB)/4, $0x748f82ee
  1024	DATA K256<>+0x1c4(SB)/4, $0x78a5636f
  1025	DATA K256<>+0x1c8(SB)/4, $0x84c87814
  1026	DATA K256<>+0x1cc(SB)/4, $0x8cc70208
  1027	DATA K256<>+0x1d0(SB)/4, $0x748f82ee
  1028	DATA K256<>+0x1d4(SB)/4, $0x78a5636f
  1029	DATA K256<>+0x1d8(SB)/4, $0x84c87814
  1030	DATA K256<>+0x1dc(SB)/4, $0x8cc70208
  1031	
  1032	DATA K256<>+0x1e0(SB)/4, $0x90befffa
  1033	DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
  1034	DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
  1035	DATA K256<>+0x1ec(SB)/4, $0xc67178f2
  1036	DATA K256<>+0x1f0(SB)/4, $0x90befffa
  1037	DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
  1038	DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
  1039	DATA K256<>+0x1fc(SB)/4, $0xc67178f2
  1040	
  1041	GLOBL K256<>(SB), (NOPTR + RODATA), $512
