...
Run Format

Text file src/crypto/sha1/sha1block_amd64.s

Documentation: crypto/sha1

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// AVX2 version by Intel, same algorithm as code in Linux kernel:
     6	// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
     7	// Authors:
     8	// Ilya Albrekht <ilya.albrekht@intel.com>
     9	// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
    10	// Ronen Zohar <ronen.zohar@intel.com>
    11	// Chandramouli Narayanan <mouli@linux.intel.com>
    12	
    13	
    14	#include "textflag.h"
    15	
    16	// SHA-1 block routine. See sha1block.go for Go equivalent.
    17	//
    18	// There are 80 rounds of 4 types:
    19	//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
    20	//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
    21	//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
    22	//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
    23	//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
    24	//
    25	// Each round loads or shuffles the data, then computes a per-round
    26	// function of b, c, d, and then mixes the result into and rotates the
    27	// five registers a, b, c, d, e holding the intermediate results.
    28	//
    29	// The register rotation is implemented by rotating the arguments to
    30	// the round macros instead of by explicit move instructions.
    31	
// LOAD fetches big-endian message word 'index' from the current input
// block (SI), byte-swaps it to host order, and stores it into the 16-word
// ring buffer kept in the stack frame; the word is left in R10 for MIX.
    32	#define LOAD(index) \
    33		MOVL	(index*4)(SI), R10; \
    34		BSWAPL	R10; \
    35		MOVL	R10, (index*4)(SP)
    36	
// SHUFFLE expands the message schedule in place for rounds 16-79:
// w[index&0xf] = rol1(w[index-3] ^ w[index-8] ^ w[index-14] ^ w[index-16]);
// slot (index&0xf) holds w[index-16] on entry because the buffer is a
// 16-entry ring. The new word is left in R10 for the following MIX.
    37	#define SHUFFLE(index) \
    38		MOVL	(((index)&0xf)*4)(SP), R10; \
    39		XORL	(((index-3)&0xf)*4)(SP), R10; \
    40		XORL	(((index-8)&0xf)*4)(SP), R10; \
    41		XORL	(((index-14)&0xf)*4)(SP), R10; \
    42		ROLL	$1, R10; \
    43		MOVL	R10, (((index)&0xf)*4)(SP)
    44	
// FUNC1 is the Ch function used in rounds 0-19:
// R9 = (b & (c ^ d)) ^ d, which equals (b & c) | (~b & d).
    45	#define FUNC1(a, b, c, d, e) \
    46		MOVL	d, R9; \
    47		XORL	c, R9; \
    48		ANDL	b, R9; \
    49		XORL	d, R9
    50	
// FUNC2 is the Parity function used in rounds 20-39: R9 = b ^ c ^ d.
    51	#define FUNC2(a, b, c, d, e) \
    52		MOVL	b, R9; \
    53		XORL	c, R9; \
    54		XORL	d, R9
    55	
// FUNC3 is the Maj function used in rounds 40-59:
// R9 = (b & c) | (d & (b | c)); R8 is used as scratch.
    56	#define FUNC3(a, b, c, d, e) \
    57		MOVL	b, R8; \
    58		ORL	c, R8; \
    59		ANDL	d, R8; \
    60		MOVL	b, R9; \
    61		ANDL	c, R9; \
    62		ORL	R8, R9
    63		
// FUNC4 (rounds 60-79) is the same Parity function as FUNC2.
    64	#define FUNC4 FUNC2
    65	
// MIX folds the round-function value (R9) and message word (R10) into e:
// e += rol5(a) + F(b,c,d) + w + const, and rotates b left by 30 for the
// next round. R8 is scratch for rol5(a).
    66	#define MIX(a, b, c, d, e, const) \
    67		ROLL	$30, b; \
    68		ADDL	R9, e; \
    69		MOVL	a, R8; \
    70		ROLL	$5, R8; \
    71		LEAL	const(e)(R10*1), e; \
    72		ADDL	R8, e
    73	
// Rounds 0-15: load data, Ch function, K = 0x5A827999.
    74	#define ROUND1(a, b, c, d, e, index) \
    75		LOAD(index); \
    76		FUNC1(a, b, c, d, e); \
    77		MIX(a, b, c, d, e, 0x5A827999)
    78	
// Rounds 16-19: shuffle schedule, Ch function, K = 0x5A827999.
    79	#define ROUND1x(a, b, c, d, e, index) \
    80		SHUFFLE(index); \
    81		FUNC1(a, b, c, d, e); \
    82		MIX(a, b, c, d, e, 0x5A827999)
    83	
// Rounds 20-39: shuffle schedule, Parity function, K = 0x6ED9EBA1.
    84	#define ROUND2(a, b, c, d, e, index) \
    85		SHUFFLE(index); \
    86		FUNC2(a, b, c, d, e); \
    87		MIX(a, b, c, d, e, 0x6ED9EBA1)
    88	
// Rounds 40-59: shuffle schedule, Maj function, K = 0x8F1BBCDC.
    89	#define ROUND3(a, b, c, d, e, index) \
    90		SHUFFLE(index); \
    91		FUNC3(a, b, c, d, e); \
    92		MIX(a, b, c, d, e, 0x8F1BBCDC)
    93	
// Rounds 60-79: shuffle schedule, Parity function, K = 0xCA62C1D6.
    94	#define ROUND4(a, b, c, d, e, index) \
    95		SHUFFLE(index); \
    96		FUNC4(a, b, c, d, e); \
    97		MIX(a, b, c, d, e, 0xCA62C1D6)
    98	
// func blockAMD64(dig *digest, p []byte)
// Scalar SHA-1 with all 80 rounds fully unrolled. The a..e state rotation
// is done by permuting the round-macro arguments instead of moving
// registers. The 64-byte stack frame holds the 16-word schedule ring.
    99	TEXT ·blockAMD64(SB),NOSPLIT,$64-32
   100		MOVQ	dig+0(FP),	BP
   101		MOVQ	p_base+8(FP),	SI
   102		MOVQ	p_len+16(FP),	DX
	// Round the length down to a whole number of 64-byte blocks.
   103		SHRQ	$6,		DX
   104		SHLQ	$6,		DX
   105		
	// DI = end of input; load h0..h4 into AX, BX, CX, DX, BP.
	// Note BP (the dig pointer) is overwritten by h4 here; the pointer
	// is reloaded from the frame at 'end'.
   106		LEAQ	(SI)(DX*1),	DI
   107		MOVL	(0*4)(BP),	AX
   108		MOVL	(1*4)(BP),	BX
   109		MOVL	(2*4)(BP),	CX
   110		MOVL	(3*4)(BP),	DX
   111		MOVL	(4*4)(BP),	BP
   112	
	// Nothing to do for zero whole blocks.
   113		CMPQ	SI,		DI
   114		JEQ	end
   115	
   116	loop:
	// Save the state at the top of the block for the final additions.
   117		MOVL	AX,	R11
   118		MOVL	BX,	R12
   119		MOVL	CX,	R13
   120		MOVL	DX,	R14
   121		MOVL	BP,	R15
   122	
   123		ROUND1(AX, BX, CX, DX, BP, 0)
   124		ROUND1(BP, AX, BX, CX, DX, 1)
   125		ROUND1(DX, BP, AX, BX, CX, 2)
   126		ROUND1(CX, DX, BP, AX, BX, 3)
   127		ROUND1(BX, CX, DX, BP, AX, 4)
   128		ROUND1(AX, BX, CX, DX, BP, 5)
   129		ROUND1(BP, AX, BX, CX, DX, 6)
   130		ROUND1(DX, BP, AX, BX, CX, 7)
   131		ROUND1(CX, DX, BP, AX, BX, 8)
   132		ROUND1(BX, CX, DX, BP, AX, 9)
   133		ROUND1(AX, BX, CX, DX, BP, 10)
   134		ROUND1(BP, AX, BX, CX, DX, 11)
   135		ROUND1(DX, BP, AX, BX, CX, 12)
   136		ROUND1(CX, DX, BP, AX, BX, 13)
   137		ROUND1(BX, CX, DX, BP, AX, 14)
   138		ROUND1(AX, BX, CX, DX, BP, 15)
   139	
   140		ROUND1x(BP, AX, BX, CX, DX, 16)
   141		ROUND1x(DX, BP, AX, BX, CX, 17)
   142		ROUND1x(CX, DX, BP, AX, BX, 18)
   143		ROUND1x(BX, CX, DX, BP, AX, 19)
   144		
   145		ROUND2(AX, BX, CX, DX, BP, 20)
   146		ROUND2(BP, AX, BX, CX, DX, 21)
   147		ROUND2(DX, BP, AX, BX, CX, 22)
   148		ROUND2(CX, DX, BP, AX, BX, 23)
   149		ROUND2(BX, CX, DX, BP, AX, 24)
   150		ROUND2(AX, BX, CX, DX, BP, 25)
   151		ROUND2(BP, AX, BX, CX, DX, 26)
   152		ROUND2(DX, BP, AX, BX, CX, 27)
   153		ROUND2(CX, DX, BP, AX, BX, 28)
   154		ROUND2(BX, CX, DX, BP, AX, 29)
   155		ROUND2(AX, BX, CX, DX, BP, 30)
   156		ROUND2(BP, AX, BX, CX, DX, 31)
   157		ROUND2(DX, BP, AX, BX, CX, 32)
   158		ROUND2(CX, DX, BP, AX, BX, 33)
   159		ROUND2(BX, CX, DX, BP, AX, 34)
   160		ROUND2(AX, BX, CX, DX, BP, 35)
   161		ROUND2(BP, AX, BX, CX, DX, 36)
   162		ROUND2(DX, BP, AX, BX, CX, 37)
   163		ROUND2(CX, DX, BP, AX, BX, 38)
   164		ROUND2(BX, CX, DX, BP, AX, 39)
   165		
   166		ROUND3(AX, BX, CX, DX, BP, 40)
   167		ROUND3(BP, AX, BX, CX, DX, 41)
   168		ROUND3(DX, BP, AX, BX, CX, 42)
   169		ROUND3(CX, DX, BP, AX, BX, 43)
   170		ROUND3(BX, CX, DX, BP, AX, 44)
   171		ROUND3(AX, BX, CX, DX, BP, 45)
   172		ROUND3(BP, AX, BX, CX, DX, 46)
   173		ROUND3(DX, BP, AX, BX, CX, 47)
   174		ROUND3(CX, DX, BP, AX, BX, 48)
   175		ROUND3(BX, CX, DX, BP, AX, 49)
   176		ROUND3(AX, BX, CX, DX, BP, 50)
   177		ROUND3(BP, AX, BX, CX, DX, 51)
   178		ROUND3(DX, BP, AX, BX, CX, 52)
   179		ROUND3(CX, DX, BP, AX, BX, 53)
   180		ROUND3(BX, CX, DX, BP, AX, 54)
   181		ROUND3(AX, BX, CX, DX, BP, 55)
   182		ROUND3(BP, AX, BX, CX, DX, 56)
   183		ROUND3(DX, BP, AX, BX, CX, 57)
   184		ROUND3(CX, DX, BP, AX, BX, 58)
   185		ROUND3(BX, CX, DX, BP, AX, 59)
   186		
   187		ROUND4(AX, BX, CX, DX, BP, 60)
   188		ROUND4(BP, AX, BX, CX, DX, 61)
   189		ROUND4(DX, BP, AX, BX, CX, 62)
   190		ROUND4(CX, DX, BP, AX, BX, 63)
   191		ROUND4(BX, CX, DX, BP, AX, 64)
   192		ROUND4(AX, BX, CX, DX, BP, 65)
   193		ROUND4(BP, AX, BX, CX, DX, 66)
   194		ROUND4(DX, BP, AX, BX, CX, 67)
   195		ROUND4(CX, DX, BP, AX, BX, 68)
   196		ROUND4(BX, CX, DX, BP, AX, 69)
   197		ROUND4(AX, BX, CX, DX, BP, 70)
   198		ROUND4(BP, AX, BX, CX, DX, 71)
   199		ROUND4(DX, BP, AX, BX, CX, 72)
   200		ROUND4(CX, DX, BP, AX, BX, 73)
   201		ROUND4(BX, CX, DX, BP, AX, 74)
   202		ROUND4(AX, BX, CX, DX, BP, 75)
   203		ROUND4(BP, AX, BX, CX, DX, 76)
   204		ROUND4(DX, BP, AX, BX, CX, 77)
   205		ROUND4(CX, DX, BP, AX, BX, 78)
   206		ROUND4(BX, CX, DX, BP, AX, 79)
   207	
	// Mix the original state back in: h_i += working value.
   208		ADDL	R11, AX
   209		ADDL	R12, BX
   210		ADDL	R13, CX
   211		ADDL	R14, DX
   212		ADDL	R15, BP
   213	
	// Advance to the next 64-byte block.
   214		ADDQ	$64, SI
   215		CMPQ	SI, DI
   216		JB	loop
   217	
   218	end:
	// Reload the digest pointer (BP was repurposed for h4) and store
	// the updated state.
   219		MOVQ	dig+0(FP), DI
   220		MOVL	AX, (0*4)(DI)
   221		MOVL	BX, (1*4)(DI)
   222		MOVL	CX, (2*4)(DI)
   223		MOVL	DX, (3*4)(DI)
   224		MOVL	BP, (4*4)(DI)
   225		RET
   226	
   227	
   228	// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
   229	// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
   230	// From http://software.intel.com/en-us/articles
   231	// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
   232	// This implementation is 2x unrolled, and interleaves vector instructions,
   233	// used to precompute W, with scalar computation of current round
   234	// for optimal scheduling.
   235	
   236	// Trivial helper macros.
// UPDATE_HASH adds the five working registers into the 32-bit digest
// words at (R9), writes them back, and leaves the updated values in the
// registers for the next block.
   237	#define UPDATE_HASH(A,TB,C,D,E) \
   238		ADDL	(R9), A \
   239		MOVL	A, (R9) \
   240		ADDL	4(R9), TB \
   241		MOVL	TB, 4(R9) \
   242		ADDL	8(R9), C \
   243		MOVL	C, 8(R9) \
   244		ADDL	12(R9), D \
   245		MOVL	D, 12(R9) \
   246		ADDL	16(R9), E \
   247		MOVL	E, 16(R9)
   248	
   249	
   250	
   251	// Helper macros for PRECALC, which does precomputations
// Load 16 message bytes of the block at R10 into X0 (low lane).
   252	#define PRECALC_0(OFFSET) \
   253		VMOVDQU   OFFSET(R10),X0
   254	
// Load the matching 16 bytes of the block at R13 into the high lane of Y0,
// so both blocks are processed in one 256-bit register.
   255	#define PRECALC_1(OFFSET) \
   256		VINSERTI128 $1, OFFSET(R13), Y0, Y0
   257	
// Byte-swap both lanes to word order using the shuffle mask in Y10.
   258	#define PRECALC_2(YREG) \
   259		VPSHUFB Y10, Y0, YREG
   260	
// Add the round constants at K_OFFSET(R8) to the schedule words.
   261	#define PRECALC_4(YREG,K_OFFSET) \
   262		VPADDD K_OFFSET(R8), YREG, Y0
   263	
// Store the precomputed w+K row into the temp buffer at R14.
   264	#define PRECALC_7(OFFSET) \
   265		VMOVDQU Y0, (OFFSET*2)(R14)
   266	
   267	
   268	// Message scheduling pre-compute for rounds 0-15
   269	// R13 is a pointer to even 64-byte block
   270	// R10 is a pointer to odd 64-byte block
   271	// R14 is a pointer to temp buffer
   272	// X0 is used as temp register
   273	// YREG is clobbered as part of computation
   274	// OFFSET chooses 16 byte chunk within a block
   275	// R8 is a pointer to constants block
   276	// K_OFFSET chooses K constants relevant to this round
   277	// X10 holds swap mask
   278	#define PRECALC_00_15(OFFSET,YREG) \
   279		PRECALC_0(OFFSET) \
   280		PRECALC_1(OFFSET) \
   281		PRECALC_2(YREG) \
   282		PRECALC_4(YREG,0x0) \
   283		PRECALC_7(OFFSET)
   284	
   285	
   286	// Helper macros for PRECALC_16_31
// PRECALC_16..PRECALC_21 build w[i] = rol1(w[i-3]^w[i-8]^w[i-14]^w[i-16])
// four words at a time; the byte-shift/Y9 dance patches up the rol1 of
// the lane-crossing newest word (see PRECALC_16_31 comment below).
   287	#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   288		VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
   289		VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
   290	
// REG ^= w[i-8]; Y0 ^= w[i-16].
   291	#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   292		VPXOR  REG_SUB_8, REG, REG \
   293		VPXOR  REG_SUB_16, Y0, Y0
   294	
// Complete the 4-way xor; Y9 = REG shifted up 12 bytes for the fixup.
   295	#define PRECALC_18(REG) \
   296		VPXOR Y0, REG, REG \
   297		VPSLLDQ $12, REG, Y9
   298	
// Split the rol1 into shift-left-1 (Y0) and shift-right-31 (REG).
   299	#define PRECALC_19(REG) \
   300		VPSLLD $1, REG, Y0 \
   301		VPSRLD $31, REG, REG
   302	
// Y0 = rol1 result; REG = Y9 << 2 for the fixup word.
   303	#define PRECALC_20(REG) \
   304		VPOR REG, Y0, Y0 \
   305		VPSLLD $2, Y9,  REG
   306	
   307	#define PRECALC_21(REG) \
   308		VPSRLD $30, Y9, Y9 \
   309		VPXOR REG, Y0, Y0
   310	
// Apply the final fixup, add round constants, and store w+K at R14.
   311	#define PRECALC_23(REG,K_OFFSET,OFFSET) \
   312		VPXOR Y9, Y0, REG \
   313		VPADDD K_OFFSET(R8), REG, Y0 \
   314		VMOVDQU Y0, (OFFSET)(R14)
   315	
   316	// Message scheduling pre-compute for rounds 16-31
   317	// calculating last 32 w[i] values in 8 XMM registers
   318	// pre-calculate K+w[i] values and store to mem
   319	// for later load by ALU add instruction.
   320	// "brute force" vectorization for rounds 16-31 only
   321	// due to w[i]->w[i-3] dependency.
   322	// clobbers 5 input ymm registers REG_SUB*
   323	// uses X0 and X9 as temp registers
   324	// As always, R8 is a pointer to constants block
   325	// and R14 is a pointer to temp buffer
   326	#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
   327		PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   328		PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   329		PRECALC_18(REG) \
   330		PRECALC_19(REG) \
   331		PRECALC_20(REG) \
   332		PRECALC_21(REG) \
   333		PRECALC_23(REG,K_OFFSET,OFFSET)
   334	
   335	
   336	// Helper macros for PRECALC_32_79
// Y0 = w[i-6] (formed from the two neighboring 16-byte groups).
   337	#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
   338		VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
   339	
// REG = w[i-32] ^ w[i-28].
   340	#define PRECALC_33(REG_SUB_28,REG) \
   341		VPXOR REG_SUB_28, REG, REG
   342	
// Y0 ^= w[i-16].
   343	#define PRECALC_34(REG_SUB_16) \
   344		VPXOR REG_SUB_16, Y0, Y0
   345	
// REG = w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32].
   346	#define PRECALC_35(REG) \
   347		VPXOR Y0, REG, REG
   348	
// PRECALC_36 + PRECALC_37 perform the rol2: (REG<<2) | (REG>>30).
   349	#define PRECALC_36(REG) \
   350		VPSLLD $2, REG, Y0
   351	
   352	#define PRECALC_37(REG) \
   353		VPSRLD $30, REG, REG \
   354		VPOR REG, Y0, REG
   355	
// Add round constants and store the finished w+K row at R14.
   356	#define PRECALC_39(REG,K_OFFSET,OFFSET) \
   357		VPADDD K_OFFSET(R8), REG, Y0 \
   358		VMOVDQU Y0, (OFFSET)(R14)
   359	
   360	// Message scheduling pre-compute for rounds 32-79
   361	// In SHA-1 specification we have:
   362	// w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
   363	// Which is the same as:
   364	// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
   365	// This allows for more efficient vectorization,
   366	// since w[i]->w[i-3] dependency is broken
   367	#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
   368		PRECALC_32(REG_SUB_8,REG_SUB_4) \
   369		PRECALC_33(REG_SUB_28,REG) \
   370		PRECALC_34(REG_SUB_16) \
   371		PRECALC_35(REG) \
   372		PRECALC_36(REG) \
   373		PRECALC_37(REG) \
   374		PRECALC_39(REG,K_OFFSET,OFFSET)
   375	
// PRECALC computes the full 80-round w+K message schedule for a pair of
// 64-byte blocks, storing one 32-byte row per 4-round group at offsets
// 0x0..0x260 of the temp buffer (R14).
   376	#define PRECALC \
   377		PRECALC_00_15(0,Y15) \
   378		PRECALC_00_15(0x10,Y14) \
   379		PRECALC_00_15(0x20,Y13) \
   380		PRECALC_00_15(0x30,Y12) \
   381		PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
   382		PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
   383		PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
   384		PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
   385		PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
   386		PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
   387		PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
   388		PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
   389		PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
   390		PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
   391		PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
   392		PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
   393		PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
   394		PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
   395		PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
   396		PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
   397	
   398	// Macros calculating individual rounds have general form
   399	// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
   400	// CALC_ROUND_{PRE,POST} macros follow
   401	
// CALC_F1_PRE: e += precomputed w+K (read from OFFSET(R15)) plus the
// previous round's F (held in REG_B); stage rol5(a) in R12 and rotate b
// by 30 (RORX) for the next round. BP = ^REG_A & REG_C via ANDN, the
// (~b&d) half of the NEXT round's F1.
   402	#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
   403		ADDL OFFSET(R15),REG_E \
   404		ANDNL REG_C,REG_A,BP \
   405		LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   406		RORXL $0x1b, REG_A, R12 \
   407		RORXL $2, REG_A, REG_B         // for next round
   408	
   409	// Calculate F for the next round
   410	#define CALC_F1_POST(REG_A,REG_B,REG_E) \
   411		ANDL REG_B,REG_A \             // b&c
   412		XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
   413		LEAL (REG_E)(R12*1), REG_E     // E += A >>> 5
   414	
   415	
   416	// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
// CALC_0 .. CALC_18: rounds 0-18 (F1/Ch) of the first block, interleaved
// with PRECALC steps for the NEXT pair of blocks. Round inputs w+K are
// read from the temp buffer via R15. CALC_0 also bootstraps by computing
// round 0's F1 directly (no previous CALC staged it).
   417	#define CALC_0 \
   418		MOVL SI, BX \ // Precalculating first round
   419		RORXL $2, SI, SI \
   420		ANDNL AX, BX, BP \
   421		ANDL DI, BX \
   422		XORL BP, BX \
   423		CALC_F1_PRE(0x0,CX,BX,DI,DX) \
   424		PRECALC_0(0x80) \
   425		CALC_F1_POST(CX,SI,DX)
   426	
   427	#define CALC_1 \
   428		CALC_F1_PRE(0x4,DX,CX,SI,AX) \
   429		PRECALC_1(0x80) \
   430		CALC_F1_POST(DX,BX,AX)
   431	
   432	#define CALC_2 \
   433		CALC_F1_PRE(0x8,AX,DX,BX,DI) \
   434		PRECALC_2(Y15) \
   435		CALC_F1_POST(AX,CX,DI)
   436	
   437	#define CALC_3 \
   438		CALC_F1_PRE(0xc,DI,AX,CX,SI) \
   439		CALC_F1_POST(DI,DX,SI)
   440	
   441	#define CALC_4 \
   442		CALC_F1_PRE(0x20,SI,DI,DX,BX) \
   443		PRECALC_4(Y15,0x0) \
   444		CALC_F1_POST(SI,AX,BX)
   445	
   446	#define CALC_5 \
   447		CALC_F1_PRE(0x24,BX,SI,AX,CX) \
   448		CALC_F1_POST(BX,DI,CX)
   449	
   450	#define CALC_6 \
   451		CALC_F1_PRE(0x28,CX,BX,DI,DX) \
   452		CALC_F1_POST(CX,SI,DX)
   453	
   454	#define CALC_7 \
   455		CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
   456		PRECALC_7(0x0) \
   457		CALC_F1_POST(DX,BX,AX)
   458	
   459	#define CALC_8 \
   460		CALC_F1_PRE(0x40,AX,DX,BX,DI) \
   461		PRECALC_0(0x90) \
   462		CALC_F1_POST(AX,CX,DI)
   463	
   464	#define CALC_9 \
   465		CALC_F1_PRE(0x44,DI,AX,CX,SI) \
   466		PRECALC_1(0x90) \
   467		CALC_F1_POST(DI,DX,SI)
   468	
   469	#define CALC_10 \
   470		CALC_F1_PRE(0x48,SI,DI,DX,BX) \
   471		PRECALC_2(Y14) \
   472		CALC_F1_POST(SI,AX,BX)
   473	
   474	#define CALC_11 \
   475		CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
   476		CALC_F1_POST(BX,DI,CX)
   477	
   478	#define CALC_12 \
   479		CALC_F1_PRE(0x60,CX,BX,DI,DX) \
   480		PRECALC_4(Y14,0x0) \
   481		CALC_F1_POST(CX,SI,DX)
   482	
   483	#define CALC_13 \
   484		CALC_F1_PRE(0x64,DX,CX,SI,AX) \
   485		CALC_F1_POST(DX,BX,AX)
   486	
   487	#define CALC_14 \
   488		CALC_F1_PRE(0x68,AX,DX,BX,DI) \
   489		CALC_F1_POST(AX,CX,DI)
   490	
   491	#define CALC_15 \
   492		CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
   493		PRECALC_7(0x10) \
   494		CALC_F1_POST(DI,DX,SI)
   495	
   496	#define CALC_16 \
   497		CALC_F1_PRE(0x80,SI,DI,DX,BX) \
   498		PRECALC_0(0xa0) \
   499		CALC_F1_POST(SI,AX,BX)
   500	
   501	#define CALC_17 \
   502		CALC_F1_PRE(0x84,BX,SI,AX,CX) \
   503		PRECALC_1(0xa0) \
   504		CALC_F1_POST(BX,DI,CX)
   505	
   506	#define CALC_18 \
   507		CALC_F1_PRE(0x88,CX,BX,DI,DX) \
   508		PRECALC_2(Y13) \
   509		CALC_F1_POST(CX,SI,DX)
   510	
   511	
// CALC_F2_PRE: e += w+K from OFFSET(R15) plus previous round's F (in
// REG_B); stage rol5(a) in R12 and rotate b by 30 for the next round.
   512	#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
   513		ADDL OFFSET(R15),REG_E \
   514		LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   515		RORXL $0x1b, REG_A, R12 \
   516		RORXL $2, REG_A, REG_B         // for next round
   517	
// CALC_F2_POST: e += rol5(a); compute the next round's Parity function
// F2 = b ^ c ^ d into REG_A.
   518	#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
   519		XORL REG_B, REG_A \
   520		ADDL R12, REG_E \
   521	        XORL REG_C, REG_A
   522	
// CALC_19 .. CALC_38: Parity rounds (F2), interleaved with the last
// PRECALC_00_15 steps and the first PRECALC_16..21 schedule expansion.
   523	#define CALC_19 \
   524		CALC_F2_PRE(0x8c,DX,CX,AX) \
   525		CALC_F2_POST(DX,BX,SI,AX)
   526	
   527	#define CALC_20 \
   528		CALC_F2_PRE(0xa0,AX,DX,DI) \
   529		PRECALC_4(Y13,0x0) \
   530		CALC_F2_POST(AX,CX,BX,DI)
   531	
   532	#define CALC_21 \
   533		CALC_F2_PRE(0xa4,DI,AX,SI) \
   534		CALC_F2_POST(DI,DX,CX,SI)
   535	
   536	#define CALC_22 \
   537		CALC_F2_PRE(0xa8,SI,DI,BX) \
   538		CALC_F2_POST(SI,AX,DX,BX)
   539	
   540	#define CALC_23 \
   541		CALC_F2_PRE(0xac,BX,SI,CX) \
   542		PRECALC_7(0x20) \
   543		CALC_F2_POST(BX,DI,AX,CX)
   544	
   545	#define CALC_24 \
   546		CALC_F2_PRE(0xc0,CX,BX,DX) \
   547		PRECALC_0(0xb0) \
   548		CALC_F2_POST(CX,SI,DI,DX)
   549	
   550	#define CALC_25 \
   551		CALC_F2_PRE(0xc4,DX,CX,AX) \
   552		PRECALC_1(0xb0) \
   553		CALC_F2_POST(DX,BX,SI,AX)
   554	
   555	#define CALC_26 \
   556		CALC_F2_PRE(0xc8,AX,DX,DI) \
   557		PRECALC_2(Y12) \
   558		CALC_F2_POST(AX,CX,BX,DI)
   559	
   560	#define CALC_27 \
   561		CALC_F2_PRE(0xcc,DI,AX,SI) \
   562		CALC_F2_POST(DI,DX,CX,SI)
   563	
   564	#define CALC_28 \
   565		CALC_F2_PRE(0xe0,SI,DI,BX) \
   566		PRECALC_4(Y12,0x0) \
   567		CALC_F2_POST(SI,AX,DX,BX)
   568	
   569	#define CALC_29 \
   570		CALC_F2_PRE(0xe4,BX,SI,CX) \
   571		CALC_F2_POST(BX,DI,AX,CX)
   572	
   573	#define CALC_30 \
   574		CALC_F2_PRE(0xe8,CX,BX,DX) \
   575		CALC_F2_POST(CX,SI,DI,DX)
   576	
   577	#define CALC_31 \
   578		CALC_F2_PRE(0xec,DX,CX,AX) \
   579		PRECALC_7(0x30) \
   580		CALC_F2_POST(DX,BX,SI,AX)
   581	
   582	#define CALC_32 \
   583		CALC_F2_PRE(0x100,AX,DX,DI) \
   584		PRECALC_16(Y15,Y14,Y12,Y8) \
   585		CALC_F2_POST(AX,CX,BX,DI)
   586	
   587	#define CALC_33 \
   588		CALC_F2_PRE(0x104,DI,AX,SI) \
   589		PRECALC_17(Y15,Y13,Y8) \
   590		CALC_F2_POST(DI,DX,CX,SI)
   591	
   592	#define CALC_34 \
   593		CALC_F2_PRE(0x108,SI,DI,BX) \
   594		PRECALC_18(Y8) \
   595		CALC_F2_POST(SI,AX,DX,BX)
   596	
   597	#define CALC_35 \
   598		CALC_F2_PRE(0x10c,BX,SI,CX) \
   599		PRECALC_19(Y8) \
   600		CALC_F2_POST(BX,DI,AX,CX)
   601	
   602	#define CALC_36 \
   603		CALC_F2_PRE(0x120,CX,BX,DX) \
   604		PRECALC_20(Y8) \
   605		CALC_F2_POST(CX,SI,DI,DX)
   606	
   607	#define CALC_37 \
   608		CALC_F2_PRE(0x124,DX,CX,AX) \
   609		PRECALC_21(Y8) \
   610		CALC_F2_POST(DX,BX,SI,AX)
   611	
   612	#define CALC_38 \
   613		CALC_F2_PRE(0x128,AX,DX,DI) \
   614		CALC_F2_POST(AX,CX,BX,DI)
   615	
   616	
// CALC_F3_PRE: e += precomputed w+K from OFFSET(R15).
   617	#define CALC_F3_PRE(OFFSET,REG_E) \
   618		ADDL OFFSET(R15),REG_E
   619	
// CALC_F3_POST: add previous F (REG_TB) and rol5(a) into e, rotate b by
// 30, and compute the next round's Maj = (a&b) | (c & (a|b)) into REG_A,
// using BP as scratch.
   620	#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
   621		LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
   622		MOVL REG_B, BP \
   623		ORL  REG_A, BP \
   624		RORXL $0x1b, REG_A, R12 \
   625		RORXL $2, REG_A, REG_TB \
   626		ANDL REG_C, BP \		// Calculate F for the next round
   627		ANDL REG_B, REG_A \
   628		ORL  BP, REG_A \
   629		ADDL R12, REG_E
   630	
// CALC_39 .. CALC_79: Maj rounds (F3) then Parity rounds (F2) finishing
// the first block, interleaved with further schedule precomputation.
   631	#define CALC_39 \
   632		CALC_F3_PRE(0x12c,SI) \
   633		PRECALC_23(Y8,0x0,0x80) \
   634		CALC_F3_POST(DI,DX,CX,SI,AX)
   635	
   636	#define CALC_40 \
   637		CALC_F3_PRE(0x140,BX) \
   638		PRECALC_16(Y14,Y13,Y8,Y7) \
   639		CALC_F3_POST(SI,AX,DX,BX,DI)
   640	
   641	#define CALC_41 \
   642		CALC_F3_PRE(0x144,CX) \
   643		PRECALC_17(Y14,Y12,Y7) \
   644		CALC_F3_POST(BX,DI,AX,CX,SI)
   645	
   646	#define CALC_42 \
   647		CALC_F3_PRE(0x148,DX) \
   648		PRECALC_18(Y7) \
   649		CALC_F3_POST(CX,SI,DI,DX,BX)
   650	
   651	#define CALC_43 \
   652		CALC_F3_PRE(0x14c,AX) \
   653		PRECALC_19(Y7) \
   654		CALC_F3_POST(DX,BX,SI,AX,CX)
   655	
   656	#define CALC_44 \
   657		CALC_F3_PRE(0x160,DI) \
   658		PRECALC_20(Y7) \
   659		CALC_F3_POST(AX,CX,BX,DI,DX)
   660	
   661	#define CALC_45 \
   662		CALC_F3_PRE(0x164,SI) \
   663		PRECALC_21(Y7) \
   664		CALC_F3_POST(DI,DX,CX,SI,AX)
   665	
   666	#define CALC_46 \
   667		CALC_F3_PRE(0x168,BX) \
   668		CALC_F3_POST(SI,AX,DX,BX,DI)
   669	
// CALC_47 writes out what is equivalent to PRECALC_23(Y7,0x20,0xa0),
// spelled out inline.
   670	#define CALC_47 \
   671		CALC_F3_PRE(0x16c,CX) \
   672		VPXOR Y9, Y0, Y7 \
   673		VPADDD 0x20(R8), Y7, Y0 \
   674		VMOVDQU Y0, 0xa0(R14) \
   675		CALC_F3_POST(BX,DI,AX,CX,SI)
   676	
   677	#define CALC_48 \
   678		CALC_F3_PRE(0x180,DX) \
   679		PRECALC_16(Y13,Y12,Y7,Y5) \
   680		CALC_F3_POST(CX,SI,DI,DX,BX)
   681	
   682	#define CALC_49 \
   683		CALC_F3_PRE(0x184,AX) \
   684		PRECALC_17(Y13,Y8,Y5) \
   685		CALC_F3_POST(DX,BX,SI,AX,CX)
   686	
   687	#define CALC_50 \
   688		CALC_F3_PRE(0x188,DI) \
   689		PRECALC_18(Y5) \
   690		CALC_F3_POST(AX,CX,BX,DI,DX)
   691	
   692	#define CALC_51 \
   693		CALC_F3_PRE(0x18c,SI) \
   694		PRECALC_19(Y5) \
   695		CALC_F3_POST(DI,DX,CX,SI,AX)
   696	
   697	#define CALC_52 \
   698		CALC_F3_PRE(0x1a0,BX) \
   699		PRECALC_20(Y5) \
   700		CALC_F3_POST(SI,AX,DX,BX,DI)
   701	
   702	#define CALC_53 \
   703		CALC_F3_PRE(0x1a4,CX) \
   704		PRECALC_21(Y5) \
   705		CALC_F3_POST(BX,DI,AX,CX,SI)
   706	
   707	#define CALC_54 \
   708		CALC_F3_PRE(0x1a8,DX) \
   709		CALC_F3_POST(CX,SI,DI,DX,BX)
   710	
   711	#define CALC_55 \
   712		CALC_F3_PRE(0x1ac,AX) \
   713		PRECALC_23(Y5,0x20,0xc0) \
   714		CALC_F3_POST(DX,BX,SI,AX,CX)
   715	
   716	#define CALC_56 \
   717		CALC_F3_PRE(0x1c0,DI) \
   718		PRECALC_16(Y12,Y8,Y5,Y3) \
   719		CALC_F3_POST(AX,CX,BX,DI,DX)
   720	
   721	#define CALC_57 \
   722		CALC_F3_PRE(0x1c4,SI) \
   723		PRECALC_17(Y12,Y7,Y3) \
   724		CALC_F3_POST(DI,DX,CX,SI,AX)
   725	
   726	#define CALC_58 \
   727		CALC_F3_PRE(0x1c8,BX) \
   728		PRECALC_18(Y3) \
   729		CALC_F3_POST(SI,AX,DX,BX,DI)
   730	
// From CALC_59 on, rounds use the Parity function (F2) again.
   731	#define CALC_59 \
   732		CALC_F2_PRE(0x1cc,BX,SI,CX) \
   733		PRECALC_19(Y3) \
   734		CALC_F2_POST(BX,DI,AX,CX)
   735	
   736	#define CALC_60 \
   737		CALC_F2_PRE(0x1e0,CX,BX,DX) \
   738		PRECALC_20(Y3) \
   739		CALC_F2_POST(CX,SI,DI,DX)
   740	
   741	#define CALC_61 \
   742		CALC_F2_PRE(0x1e4,DX,CX,AX) \
   743		PRECALC_21(Y3) \
   744		CALC_F2_POST(DX,BX,SI,AX)
   745	
   746	#define CALC_62 \
   747		CALC_F2_PRE(0x1e8,AX,DX,DI) \
   748		CALC_F2_POST(AX,CX,BX,DI)
   749	
   750	#define CALC_63 \
   751		CALC_F2_PRE(0x1ec,DI,AX,SI) \
   752		PRECALC_23(Y3,0x20,0xe0) \
   753		CALC_F2_POST(DI,DX,CX,SI)
   754	
   755	#define CALC_64 \
   756		CALC_F2_PRE(0x200,SI,DI,BX) \
   757		PRECALC_32(Y5,Y3) \
   758		CALC_F2_POST(SI,AX,DX,BX)
   759	
   760	#define CALC_65 \
   761		CALC_F2_PRE(0x204,BX,SI,CX) \
   762		PRECALC_33(Y14,Y15) \
   763		CALC_F2_POST(BX,DI,AX,CX)
   764	
   765	#define CALC_66 \
   766		CALC_F2_PRE(0x208,CX,BX,DX) \
   767		PRECALC_34(Y8) \
   768		CALC_F2_POST(CX,SI,DI,DX)
   769	
   770	#define CALC_67 \
   771		CALC_F2_PRE(0x20c,DX,CX,AX) \
   772		PRECALC_35(Y15) \
   773		CALC_F2_POST(DX,BX,SI,AX)
   774	
   775	#define CALC_68 \
   776		CALC_F2_PRE(0x220,AX,DX,DI) \
   777		PRECALC_36(Y15) \
   778		CALC_F2_POST(AX,CX,BX,DI)
   779	
   780	#define CALC_69 \
   781		CALC_F2_PRE(0x224,DI,AX,SI) \
   782		PRECALC_37(Y15) \
   783		CALC_F2_POST(DI,DX,CX,SI)
   784	
   785	#define CALC_70 \
   786		CALC_F2_PRE(0x228,SI,DI,BX) \
   787		CALC_F2_POST(SI,AX,DX,BX)
   788	
   789	#define CALC_71 \
   790		CALC_F2_PRE(0x22c,BX,SI,CX) \
   791		PRECALC_39(Y15,0x20,0x100) \
   792		CALC_F2_POST(BX,DI,AX,CX)
   793	
   794	#define CALC_72 \
   795		CALC_F2_PRE(0x240,CX,BX,DX) \
   796		PRECALC_32(Y3,Y15) \
   797		CALC_F2_POST(CX,SI,DI,DX)
   798	
   799	#define CALC_73 \
   800		CALC_F2_PRE(0x244,DX,CX,AX) \
   801		PRECALC_33(Y13,Y14) \
   802		CALC_F2_POST(DX,BX,SI,AX)
   803	
   804	#define CALC_74 \
   805		CALC_F2_PRE(0x248,AX,DX,DI) \
   806		PRECALC_34(Y7) \
   807		CALC_F2_POST(AX,CX,BX,DI)
   808	
   809	#define CALC_75 \
   810		CALC_F2_PRE(0x24c,DI,AX,SI) \
   811		PRECALC_35(Y14) \
   812		CALC_F2_POST(DI,DX,CX,SI)
   813	
   814	#define CALC_76 \
   815		CALC_F2_PRE(0x260,SI,DI,BX) \
   816		PRECALC_36(Y14) \
   817		CALC_F2_POST(SI,AX,DX,BX)
   818	
   819	#define CALC_77 \
   820		CALC_F2_PRE(0x264,BX,SI,CX) \
   821		PRECALC_37(Y14) \
   822		CALC_F2_POST(BX,DI,AX,CX)
   823	
   824	#define CALC_78 \
   825		CALC_F2_PRE(0x268,CX,BX,DX) \
   826		CALC_F2_POST(CX,SI,DI,DX)
   827	
// CALC_79: last round of the block — no next-round F is staged.
   828	#define CALC_79 \
   829		ADDL 0x26c(R15), AX \
   830		LEAL (AX)(CX*1), AX \
   831		RORXL $0x1b, DX, R12 \
   832		PRECALC_39(Y14,0x20,0x120) \
   833		ADDL R12, AX
   834	
   835	// Similar to CALC_0
// CALC_80 .. CALC_98: F1/Ch rounds of the second block, interleaved with
// PRECALC_32_79 steps for the following block pair. Like CALC_0, CALC_80
// bootstraps by computing the first F1 of this block directly.
   836	#define CALC_80 \
   837		MOVL CX, DX \
   838		RORXL $2, CX, CX \
   839		ANDNL SI, DX, BP \
   840		ANDL BX, DX \
   841		XORL BP, DX \
   842		CALC_F1_PRE(0x10,AX,DX,BX,DI) \
   843		PRECALC_32(Y15,Y14) \
   844		CALC_F1_POST(AX,CX,DI)
   845	
   846	#define CALC_81 \
   847		CALC_F1_PRE(0x14,DI,AX,CX,SI) \
   848		PRECALC_33(Y12,Y13) \
   849		CALC_F1_POST(DI,DX,SI)
   850	
   851	#define CALC_82 \
   852		CALC_F1_PRE(0x18,SI,DI,DX,BX) \
   853		PRECALC_34(Y5) \
   854		CALC_F1_POST(SI,AX,BX)
   855	
   856	#define CALC_83 \
   857		CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
   858		PRECALC_35(Y13) \
   859		CALC_F1_POST(BX,DI,CX)
   860	
   861	#define CALC_84 \
   862		CALC_F1_PRE(0x30,CX,BX,DI,DX) \
   863		PRECALC_36(Y13) \
   864		CALC_F1_POST(CX,SI,DX)
   865	
   866	#define CALC_85 \
   867		CALC_F1_PRE(0x34,DX,CX,SI,AX) \
   868		PRECALC_37(Y13) \
   869		CALC_F1_POST(DX,BX,AX)
   870	
   871	#define CALC_86 \
   872		CALC_F1_PRE(0x38,AX,DX,BX,DI) \
   873		CALC_F1_POST(AX,CX,DI)
   874	
   875	#define CALC_87 \
   876		CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
   877		PRECALC_39(Y13,0x40,0x140) \
   878		CALC_F1_POST(DI,DX,SI)
   879	
   880	#define CALC_88 \
   881		CALC_F1_PRE(0x50,SI,DI,DX,BX) \
   882		PRECALC_32(Y14,Y13) \
   883		CALC_F1_POST(SI,AX,BX)
   884	
   885	#define CALC_89 \
   886		CALC_F1_PRE(0x54,BX,SI,AX,CX) \
   887		PRECALC_33(Y8,Y12) \
   888		CALC_F1_POST(BX,DI,CX)
   889	
   890	#define CALC_90 \
   891		CALC_F1_PRE(0x58,CX,BX,DI,DX) \
   892		PRECALC_34(Y3) \
   893		CALC_F1_POST(CX,SI,DX)
   894	
   895	#define CALC_91 \
   896		CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
   897		PRECALC_35(Y12) \
   898		CALC_F1_POST(DX,BX,AX)
   899	
   900	#define CALC_92 \
   901		CALC_F1_PRE(0x70,AX,DX,BX,DI) \
   902		PRECALC_36(Y12) \
   903		CALC_F1_POST(AX,CX,DI)
   904	
   905	#define CALC_93 \
   906		CALC_F1_PRE(0x74,DI,AX,CX,SI) \
   907		PRECALC_37(Y12) \
   908		CALC_F1_POST(DI,DX,SI)
   909	
   910	#define CALC_94 \
   911		CALC_F1_PRE(0x78,SI,DI,DX,BX) \
   912		CALC_F1_POST(SI,AX,BX)
   913	
   914	#define CALC_95 \
   915		CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
   916		PRECALC_39(Y12,0x40,0x160) \
   917		CALC_F1_POST(BX,DI,CX)
   918	
   919	#define CALC_96 \
   920		CALC_F1_PRE(0x90,CX,BX,DI,DX) \
   921		PRECALC_32(Y13,Y12) \
   922		CALC_F1_POST(CX,SI,DX)
   923	
   924	#define CALC_97 \
   925		CALC_F1_PRE(0x94,DX,CX,SI,AX) \
   926		PRECALC_33(Y7,Y8) \
   927		CALC_F1_POST(DX,BX,AX)
   928	
   929	#define CALC_98 \
   930		CALC_F1_PRE(0x98,AX,DX,BX,DI) \
   931		PRECALC_34(Y15) \
   932		CALC_F1_POST(AX,CX,DI)
   933	
   934	#define CALC_99 \
   935		CALC_F2_PRE(0x9c,DI,AX,SI) \
   936		PRECALC_35(Y8) \
   937		CALC_F2_POST(DI,DX,CX,SI)
   938	
   939	#define CALC_100 \
   940		CALC_F2_PRE(0xb0,SI,DI,BX) \
   941		PRECALC_36(Y8) \
   942		CALC_F2_POST(SI,AX,DX,BX)
   943	
   944	#define CALC_101 \
   945		CALC_F2_PRE(0xb4,BX,SI,CX) \
   946		PRECALC_37(Y8) \
   947		CALC_F2_POST(BX,DI,AX,CX)
   948	
   949	#define CALC_102 \
   950		CALC_F2_PRE(0xb8,CX,BX,DX) \
   951		CALC_F2_POST(CX,SI,DI,DX)
   952	
   953	#define CALC_103 \
   954		CALC_F2_PRE(0xbc,DX,CX,AX) \
   955		PRECALC_39(Y8,0x40,0x180) \
   956		CALC_F2_POST(DX,BX,SI,AX)
   957	
   958	#define CALC_104 \
   959		CALC_F2_PRE(0xd0,AX,DX,DI) \
   960		PRECALC_32(Y12,Y8) \
   961		CALC_F2_POST(AX,CX,BX,DI)
   962	
   963	#define CALC_105 \
   964		CALC_F2_PRE(0xd4,DI,AX,SI) \
   965		PRECALC_33(Y5,Y7) \
   966		CALC_F2_POST(DI,DX,CX,SI)
   967	
   968	#define CALC_106 \
   969		CALC_F2_PRE(0xd8,SI,DI,BX) \
   970		PRECALC_34(Y14) \
   971		CALC_F2_POST(SI,AX,DX,BX)
   972	
   973	#define CALC_107 \
   974		CALC_F2_PRE(0xdc,BX,SI,CX) \
   975		PRECALC_35(Y7) \
   976		CALC_F2_POST(BX,DI,AX,CX)
   977	
   978	#define CALC_108 \
   979		CALC_F2_PRE(0xf0,CX,BX,DX) \
   980		PRECALC_36(Y7) \
   981		CALC_F2_POST(CX,SI,DI,DX)
   982	
   983	#define CALC_109 \
   984		CALC_F2_PRE(0xf4,DX,CX,AX) \
   985		PRECALC_37(Y7) \
   986		CALC_F2_POST(DX,BX,SI,AX)
   987	
   988	#define CALC_110 \
   989		CALC_F2_PRE(0xf8,AX,DX,DI) \
   990		CALC_F2_POST(AX,CX,BX,DI)
   991	
   992	#define CALC_111 \
   993		CALC_F2_PRE(0xfc,DI,AX,SI) \
   994		PRECALC_39(Y7,0x40,0x1a0) \
   995		CALC_F2_POST(DI,DX,CX,SI)
   996	
   997	#define CALC_112 \
   998		CALC_F2_PRE(0x110,SI,DI,BX) \
   999		PRECALC_32(Y8,Y7) \
  1000		CALC_F2_POST(SI,AX,DX,BX)
  1001	
  1002	#define CALC_113 \
  1003		CALC_F2_PRE(0x114,BX,SI,CX) \
  1004		PRECALC_33(Y3,Y5) \
  1005		CALC_F2_POST(BX,DI,AX,CX)
  1006	
  1007	#define CALC_114 \
  1008		CALC_F2_PRE(0x118,CX,BX,DX) \
  1009		PRECALC_34(Y13) \
  1010		CALC_F2_POST(CX,SI,DI,DX)
  1011	
  1012	#define CALC_115 \
  1013		CALC_F2_PRE(0x11c,DX,CX,AX) \
  1014		PRECALC_35(Y5) \
  1015		CALC_F2_POST(DX,BX,SI,AX)
  1016	
  1017	#define CALC_116 \
  1018		CALC_F2_PRE(0x130,AX,DX,DI) \
  1019		PRECALC_36(Y5) \
  1020		CALC_F2_POST(AX,CX,BX,DI)
  1021	
  1022	#define CALC_117 \
  1023		CALC_F2_PRE(0x134,DI,AX,SI) \
  1024		PRECALC_37(Y5) \
  1025		CALC_F2_POST(DI,DX,CX,SI)
  1026	
  1027	#define CALC_118 \
  1028		CALC_F2_PRE(0x138,SI,DI,BX) \
  1029		CALC_F2_POST(SI,AX,DX,BX)
  1030	
// Second-lane rounds using the f3 majority function
// ((b AND c) OR (d AND (b OR c))): CALC_F3_PRE / CALC_F3_POST, same
// WK-offset and argument-rotation scheme as the f2 rounds.
// PRECALC_32..PRECALC_39 steps continue the message-schedule
// precomputation; PRECALC_39(Yx,k,off) appears to finalize a W vector
// with the round constant selected by byte offset k into K_XMM_AR and
// store it at buffer offset off — TODO(review): confirm against the
// PRECALC_39 definition, which is outside this view.

#define CALC_119 \
	CALC_F3_PRE(0x13c,CX) \
	PRECALC_39(Y5,0x40,0x1c0) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_120 \
	CALC_F3_PRE(0x150,DX) \
	PRECALC_32(Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_121 \
	CALC_F3_PRE(0x154,AX) \
	PRECALC_33(Y15,Y3) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_122 \
	CALC_F3_PRE(0x158,DI) \
	PRECALC_34(Y12) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_123 \
	CALC_F3_PRE(0x15c,SI) \
	PRECALC_35(Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_124 \
	CALC_F3_PRE(0x170,BX) \
	PRECALC_36(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_125 \
	CALC_F3_PRE(0x174,CX) \
	PRECALC_37(Y3) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

// CALC_126 carries no interleaved PRECALC step.
#define CALC_126 \
	CALC_F3_PRE(0x178,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_127 \
	CALC_F3_PRE(0x17c,AX) \
	PRECALC_39(Y3,0x60,0x1e0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_128 \
	CALC_F3_PRE(0x190,DI) \
	PRECALC_32(Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_129 \
	CALC_F3_PRE(0x194,SI) \
	PRECALC_33(Y14,Y15) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_130 \
	CALC_F3_PRE(0x198,BX) \
	PRECALC_34(Y8) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_131 \
	CALC_F3_PRE(0x19c,CX) \
	PRECALC_35(Y15) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_132 \
	CALC_F3_PRE(0x1b0,DX) \
	PRECALC_36(Y15) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_133 \
	CALC_F3_PRE(0x1b4,AX) \
	PRECALC_37(Y15) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

// CALC_134 carries no interleaved PRECALC step.
#define CALC_134 \
	CALC_F3_PRE(0x1b8,DI) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_135 \
	CALC_F3_PRE(0x1bc,SI) \
	PRECALC_39(Y15,0x60,0x200) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_136 \
	CALC_F3_PRE(0x1d0,BX) \
	PRECALC_32(Y3,Y15) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_137 \
	CALC_F3_PRE(0x1d4,CX) \
	PRECALC_33(Y13,Y14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_138 \
	CALC_F3_PRE(0x1d8,DX) \
	PRECALC_34(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)
// Final stretch of the second lane: back to the f2 (parity) function.
// Same WK-offset and register-rotation scheme as above.

#define CALC_139 \
	CALC_F2_PRE(0x1dc,DX,CX,AX) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_140 \
	CALC_F2_PRE(0x1f0,AX,DX,DI) \
	PRECALC_36(Y14) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_141 \
	CALC_F2_PRE(0x1f4,DI,AX,SI) \
	PRECALC_37(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

// CALC_142 carries no interleaved PRECALC step.
#define CALC_142 \
	CALC_F2_PRE(0x1f8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_143 \
	CALC_F2_PRE(0x1fc,BX,SI,CX) \
	PRECALC_39(Y14,0x60,0x220) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_144 \
	CALC_F2_PRE(0x210,CX,BX,DX) \
	PRECALC_32(Y15,Y14) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_145 \
	CALC_F2_PRE(0x214,DX,CX,AX) \
	PRECALC_33(Y12,Y13) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_146 \
	CALC_F2_PRE(0x218,AX,DX,DI) \
	PRECALC_34(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_147 \
	CALC_F2_PRE(0x21c,DI,AX,SI) \
	PRECALC_35(Y13) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_148 \
	CALC_F2_PRE(0x230,SI,DI,BX) \
	PRECALC_36(Y13) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_149 \
	CALC_F2_PRE(0x234,BX,SI,CX) \
	PRECALC_37(Y13) \
	CALC_F2_POST(BX,DI,AX,CX)

// CALC_150 carries no interleaved PRECALC step.
#define CALC_150 \
	CALC_F2_PRE(0x238,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_151 \
	CALC_F2_PRE(0x23c,DX,CX,AX) \
	PRECALC_39(Y13,0x60,0x240) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_152 \
	CALC_F2_PRE(0x250,AX,DX,DI) \
	PRECALC_32(Y14,Y13) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_153 \
	CALC_F2_PRE(0x254,DI,AX,SI) \
	PRECALC_33(Y8,Y12) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_154 \
	CALC_F2_PRE(0x258,SI,DI,BX) \
	PRECALC_34(Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_155 \
	CALC_F2_PRE(0x25c,BX,SI,CX) \
	PRECALC_35(Y12) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_156 \
	CALC_F2_PRE(0x270,CX,BX,DX) \
	PRECALC_36(Y12) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_157 \
	CALC_F2_PRE(0x274,DX,CX,AX) \
	PRECALC_37(Y12) \
	CALC_F2_POST(DX,BX,SI,AX)

// CALC_158 carries no interleaved PRECALC step.
#define CALC_158 \
	CALC_F2_PRE(0x278,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

// CALC_159 is the last round, written out inline rather than via the
// PRE/POST macros: it adds the final WK word (0x27c off R15), the value
// in AX, and DI rotated left by 5 (RORX right by 27) into SI, omitting
// the per-round work that would be dead after the final round.
#define CALC_159 \
	ADDL 0x27c(R15),SI \
	LEAL (SI)(AX*1), SI \
	RORXL $0x1b, DI, R12 \
	PRECALC_39(Y12,0x60,0x260) \
	ADDL R12, SI
  1232	
  1233	
  1234	
// CALC is the whole two-lane computation. It loads the five hash words
// from the state pointed to by R9, points R14 at the first WK buffer
// (SP) and R15 at the second (SP + 2*4*80 + 32), precalculates WK for
// the first two blocks, then runs a fully unrolled loop: 80 rounds over
// the block at R10 (CALC_0..CALC_79, even blocks), UPDATE_HASH, then 80
// rounds over the block at R13 (CALC_80..CALC_159, odd blocks), with
// the two WK buffers swapped by XCHGQ every half-iteration. R8 (the
// K_XMM_AR address, restored in blockAVX2 below) doubles as the
// last-block sentinel: the cursors are CMOVed to R8 once they pass the
// end of the input, and cursor==R8 terminates the loop.
#define CALC \
	MOVL	(R9), CX \
	MOVL	4(R9), SI \
	MOVL	8(R9), DI \
	MOVL	12(R9), AX \
	MOVL	16(R9), DX \
	MOVQ    SP, R14 \
	LEAQ    (2*4*80+32)(SP), R15 \
	PRECALC \ // Precalc WK for first 2 blocks
	XCHGQ   R15, R14 \
loop: \  // the loop below is fully unrolled
	CMPQ    R10, R8 \ // R8 (set below, in blockAVX2) is the last-block sentinel
	JNE	begin \
	VZEROUPPER \
	RET \
begin: \
	CALC_0 \
	CALC_1 \
	CALC_2 \
	CALC_3 \
	CALC_4 \
	CALC_5 \
	CALC_6 \
	CALC_7 \
	CALC_8 \
	CALC_9 \
	CALC_10 \
	CALC_11 \
	CALC_12 \
	CALC_13 \
	CALC_14 \
	CALC_15 \
	CALC_16 \
	CALC_17 \
	CALC_18 \
	CALC_19 \
	CALC_20 \
	CALC_21 \
	CALC_22 \
	CALC_23 \
	CALC_24 \
	CALC_25 \
	CALC_26 \
	CALC_27 \
	CALC_28 \
	CALC_29 \
	CALC_30 \
	CALC_31 \
	CALC_32 \
	CALC_33 \
	CALC_34 \
	CALC_35 \
	CALC_36 \
	CALC_37 \
	CALC_38 \
	CALC_39 \
	CALC_40 \
	CALC_41 \
	CALC_42 \
	CALC_43 \
	CALC_44 \
	CALC_45 \
	CALC_46 \
	CALC_47 \
	CALC_48 \
	CALC_49 \
	CALC_50 \
	CALC_51 \
	CALC_52 \
	CALC_53 \
	CALC_54 \
	CALC_55 \
	CALC_56 \
	CALC_57 \
	CALC_58 \
	CALC_59 \
	ADDQ $128, R10 \ // move to next even-64-byte block
	CMPQ R10, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \ // signal the last iteration smartly
	CALC_60 \
	CALC_61 \
	CALC_62 \
	CALC_63 \
	CALC_64 \
	CALC_65 \
	CALC_66 \
	CALC_67 \
	CALC_68 \
	CALC_69 \
	CALC_70 \
	CALC_71 \
	CALC_72 \
	CALC_73 \
	CALC_74 \
	CALC_75 \
	CALC_76 \
	CALC_77 \
	CALC_78 \
	CALC_79 \
	UPDATE_HASH(AX,DX,BX,SI,DI) \
	CMPQ R10, R8 \ // is current block the last one?
	JE loop\
	MOVL DX, CX \
	CALC_80 \
	CALC_81 \
	CALC_82 \
	CALC_83 \
	CALC_84 \
	CALC_85 \
	CALC_86 \
	CALC_87 \
	CALC_88 \
	CALC_89 \
	CALC_90 \
	CALC_91 \
	CALC_92 \
	CALC_93 \
	CALC_94 \
	CALC_95 \
	CALC_96 \
	CALC_97 \
	CALC_98 \
	CALC_99 \
	CALC_100 \
	CALC_101 \
	CALC_102 \
	CALC_103 \
	CALC_104 \
	CALC_105 \
	CALC_106 \
	CALC_107 \
	CALC_108 \
	CALC_109 \
	CALC_110 \
	CALC_111 \
	CALC_112 \
	CALC_113 \
	CALC_114 \
	CALC_115 \
	CALC_116 \
	CALC_117 \
	CALC_118 \
	CALC_119 \
	CALC_120 \
	CALC_121 \
	CALC_122 \
	CALC_123 \
	CALC_124 \
	CALC_125 \
	CALC_126 \
	CALC_127 \
	CALC_128 \
	CALC_129 \
	CALC_130 \
	CALC_131 \
	CALC_132 \
	CALC_133 \
	CALC_134 \
	CALC_135 \
	CALC_136 \
	CALC_137 \
	CALC_138 \
	CALC_139 \
	ADDQ $128, R13 \ // move to next odd-64-byte block (R13 = p+64 initially, see blockAVX2)
	CMPQ R13, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \ // signal the last iteration by parking R10 at the sentinel
	CALC_140 \
	CALC_141 \
	CALC_142 \
	CALC_143 \
	CALC_144 \
	CALC_145 \
	CALC_146 \
	CALC_147 \
	CALC_148 \
	CALC_149 \
	CALC_150 \
	CALC_151 \
	CALC_152 \
	CALC_153 \
	CALC_154 \
	CALC_155 \
	CALC_156 \
	CALC_157 \
	CALC_158 \
	CALC_159 \
	UPDATE_HASH(SI,DI,DX,CX,BX) \
	MOVL	SI, R12 \ // rotate the state registers to undo the AVX2 reg permutation
	MOVL	DI, SI \
	MOVL	DX, DI \
	MOVL	BX, DX \
	MOVL	CX, AX \
	MOVL	R12, CX \
	XCHGQ   R15, R14 \ // swap the two WK buffers for the next pair of blocks
	JMP     loop
  1430	
  1431	
  1432	
// func blockAVX2(dig *digest, p []byte)
//
// AVX2 SHA-1: hashes p (rounded down to whole 64-byte blocks) into the
// state at dig, processing blocks in two interleaved lanes (see CALC).
// Register roles on entry to CALC:
//   R9  = state pointer, R8 = K_XMM_AR (also the last-block sentinel)
//   R10 = even-block cursor (p), R13 = odd-block cursor (p+64)
//   R11 = end-of-input limit (p + rounded len + 64)
//   Y10 = byte-shuffle mask for big-endian message loads
TEXT ·blockAVX2(SB),$1408-32

	MOVQ	dig+0(FP),	DI
	MOVQ	p_base+8(FP),	SI
	MOVQ	p_len+16(FP),	DX
	SHRQ	$6,		DX // DX = len rounded down to a multiple of 64
	SHLQ	$6,		DX

	MOVQ	$K_XMM_AR<>(SB), R8

	MOVQ	DI, R9
	MOVQ	SI, R10
	LEAQ	64(SI), R13 // odd-block cursor starts at the second block

	ADDQ	SI, DX
	ADDQ	$64, DX
	MOVQ	DX, R11

	// If there is no second block at all, point R13 at the K table
	// instead — presumably so the odd lane's loads still read valid,
	// deterministic memory; confirm against CALC's use of R13.
	CMPQ	R13, R11
	CMOVQCC	R8, R13

	MOVQ    $BSWAP_SHUFB_CTL<>(SB), R8
	VMOVDQU (R8), Y10
	MOVQ	$K_XMM_AR<>(SB), R8 // restore R8 = K table / sentinel

	CALC // RET is inside macros
  1459	
// SHA-1 round constants (FIPS 180-4):
//   K0 = 0x5a827999 (rounds  0-19)
//   K1 = 0x6ed9eba1 (rounds 20-39)
//   K2 = 0x8f1bbcdc (rounds 40-59)
//   K3 = 0xca62c1d6 (rounds 60-79)
// Each constant is replicated eight times (32 bytes) so a single
// 256-bit load broadcasts it across every 32-bit lane of a ymm.
DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128
  1493	
// Shuffle-control mask (loaded into Y10 by blockAVX2) that reverses
// the bytes within each 32-bit word, converting the big-endian message
// words to native little-endian order on load. The 16-byte pattern is
// duplicated so it applies identically to both 128-bit ymm lanes.
DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32

View as plain text