Text file src/crypto/sha1/sha1block_amd64.s

Documentation: crypto/sha1

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// AVX2 version by Intel, same algorithm as code in Linux kernel:
     6// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
     7// Authors:
     8// Ilya Albrekht <ilya.albrekht@intel.com>
     9// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
    10// Ronen Zohar <ronen.zohar@intel.com>
    11// Chandramouli Narayanan <mouli@linux.intel.com>
    12
    13
    14#include "textflag.h"
    15
    16// SHA-1 block routine. See sha1block.go for Go equivalent.
    17//
    18// There are 80 rounds of 4 types:
    19//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
    20//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
    21//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
    22//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
    23//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
    24//
    25// Each round loads or shuffles the data, then computes a per-round
    26// function of b, c, d, and then mixes the result into and rotates the
    27// five registers a, b, c, d, e holding the intermediate results.
    28//
    29// The register rotation is implemented by rotating the arguments to
    30// the round macros instead of by explicit move instructions.
    31
// LOAD fetches big-endian message word `index` from the block at SI and
// stores it into the 16-word circular w buffer kept on the stack.
     32#define LOAD(index) \
     33	MOVL	(index*4)(SI), R10; \
     34	BSWAPL	R10; \
     35	MOVL	R10, (index*4)(SP)
     36
// SHUFFLE computes the message-schedule word for rounds >= 16 in place:
// w[i&0xf] = rol1(w[i-16] ^ w[i-3] ^ w[i-8] ^ w[i-14]); indices are taken
// mod 16 (&0xf), so (index)&0xf addresses the w[i-16] slot being replaced.
// Leaves the new word in R10 for MIX.
     37#define SHUFFLE(index) \
     38	MOVL	(((index)&0xf)*4)(SP), R10; \
     39	XORL	(((index-3)&0xf)*4)(SP), R10; \
     40	XORL	(((index-8)&0xf)*4)(SP), R10; \
     41	XORL	(((index-14)&0xf)*4)(SP), R10; \
     42	ROLL	$1, R10; \
     43	MOVL	R10, (((index)&0xf)*4)(SP)
     44
// FUNC1: Ch(b,c,d) = (b & (c^d)) ^ d, result in R9.
     45#define FUNC1(a, b, c, d, e) \
     46	MOVL	d, R9; \
     47	XORL	c, R9; \
     48	ANDL	b, R9; \
     49	XORL	d, R9
     50
// FUNC2: Parity(b,c,d) = b ^ c ^ d, result in R9.
     51#define FUNC2(a, b, c, d, e) \
     52	MOVL	b, R9; \
     53	XORL	c, R9; \
     54	XORL	d, R9
     55
// FUNC3: Maj(b,c,d) = (b & c) | ((b | c) & d), result in R9 (R8 scratch).
     56#define FUNC3(a, b, c, d, e) \
     57	MOVL	b, R8; \
     58	ORL	c, R8; \
     59	ANDL	d, R8; \
     60	MOVL	b, R9; \
     61	ANDL	c, R9; \
     62	ORL	R8, R9
     63
// Rounds 60-79 reuse the parity function from rounds 20-39.
     64#define FUNC4 FUNC2
     65
// MIX folds the round into the state: e += F(R9) + w(R10) + const + rol5(a),
// and rotates b by 30, matching the SHA-1 round update.
     66#define MIX(a, b, c, d, e, const) \
     67	ROLL	$30, b; \
     68	ADDL	R9, e; \
     69	MOVL	a, R8; \
     70	ROLL	$5, R8; \
     71	LEAL	const(e)(R10*1), e; \
     72	ADDL	R8, e
     73
     74#define ROUND1(a, b, c, d, e, index) \
     75	LOAD(index); \
     76	FUNC1(a, b, c, d, e); \
     77	MIX(a, b, c, d, e, 0x5A827999)
     78
     79#define ROUND1x(a, b, c, d, e, index) \
     80	SHUFFLE(index); \
     81	FUNC1(a, b, c, d, e); \
     82	MIX(a, b, c, d, e, 0x5A827999)
     83
     84#define ROUND2(a, b, c, d, e, index) \
     85	SHUFFLE(index); \
     86	FUNC2(a, b, c, d, e); \
     87	MIX(a, b, c, d, e, 0x6ED9EBA1)
     88
     89#define ROUND3(a, b, c, d, e, index) \
     90	SHUFFLE(index); \
     91	FUNC3(a, b, c, d, e); \
     92	MIX(a, b, c, d, e, 0x8F1BBCDC)
     93
     94#define ROUND4(a, b, c, d, e, index) \
     95	SHUFFLE(index); \
     96	FUNC4(a, b, c, d, e); \
     97	MIX(a, b, c, d, e, 0xCA62C1D6)
    98
     99TEXT ·blockAMD64(SB),NOSPLIT,$64-32
// Register roles: SI walks the input; DI marks the end of the input with
// its length rounded down to a whole number of 64-byte blocks (SHRQ/SHLQ $6).
// AX,BX,CX,DX,BP hold the five state words a,b,c,d,e — note BP is reused
// for e, so the dig pointer is reloaded from FP at the end.
    100	MOVQ	dig+0(FP),	BP
    101	MOVQ	p_base+8(FP),	SI
    102	MOVQ	p_len+16(FP),	DX
    103	SHRQ	$6,		DX
    104	SHLQ	$6,		DX
    105
    106	LEAQ	(SI)(DX*1),	DI
    107	MOVL	(0*4)(BP),	AX
    108	MOVL	(1*4)(BP),	BX
    109	MOVL	(2*4)(BP),	CX
    110	MOVL	(3*4)(BP),	DX
    111	MOVL	(4*4)(BP),	BP
    112
    113	CMPQ	SI,		DI
    114	JEQ	end
    115
// One iteration per 64-byte block. R11-R15 snapshot the incoming state so
// it can be added back after the 80 rounds. The round macros rotate their
// register arguments instead of moving data (see file header comment).
    116loop:
    117	MOVL	AX,	R11
    118	MOVL	BX,	R12
    119	MOVL	CX,	R13
    120	MOVL	DX,	R14
    121	MOVL	BP,	R15
    122
    123	ROUND1(AX, BX, CX, DX, BP, 0)
    124	ROUND1(BP, AX, BX, CX, DX, 1)
    125	ROUND1(DX, BP, AX, BX, CX, 2)
    126	ROUND1(CX, DX, BP, AX, BX, 3)
    127	ROUND1(BX, CX, DX, BP, AX, 4)
    128	ROUND1(AX, BX, CX, DX, BP, 5)
    129	ROUND1(BP, AX, BX, CX, DX, 6)
    130	ROUND1(DX, BP, AX, BX, CX, 7)
    131	ROUND1(CX, DX, BP, AX, BX, 8)
    132	ROUND1(BX, CX, DX, BP, AX, 9)
    133	ROUND1(AX, BX, CX, DX, BP, 10)
    134	ROUND1(BP, AX, BX, CX, DX, 11)
    135	ROUND1(DX, BP, AX, BX, CX, 12)
    136	ROUND1(CX, DX, BP, AX, BX, 13)
    137	ROUND1(BX, CX, DX, BP, AX, 14)
    138	ROUND1(AX, BX, CX, DX, BP, 15)
    139
    140	ROUND1x(BP, AX, BX, CX, DX, 16)
    141	ROUND1x(DX, BP, AX, BX, CX, 17)
    142	ROUND1x(CX, DX, BP, AX, BX, 18)
    143	ROUND1x(BX, CX, DX, BP, AX, 19)
    144
    145	ROUND2(AX, BX, CX, DX, BP, 20)
    146	ROUND2(BP, AX, BX, CX, DX, 21)
    147	ROUND2(DX, BP, AX, BX, CX, 22)
    148	ROUND2(CX, DX, BP, AX, BX, 23)
    149	ROUND2(BX, CX, DX, BP, AX, 24)
    150	ROUND2(AX, BX, CX, DX, BP, 25)
    151	ROUND2(BP, AX, BX, CX, DX, 26)
    152	ROUND2(DX, BP, AX, BX, CX, 27)
    153	ROUND2(CX, DX, BP, AX, BX, 28)
    154	ROUND2(BX, CX, DX, BP, AX, 29)
    155	ROUND2(AX, BX, CX, DX, BP, 30)
    156	ROUND2(BP, AX, BX, CX, DX, 31)
    157	ROUND2(DX, BP, AX, BX, CX, 32)
    158	ROUND2(CX, DX, BP, AX, BX, 33)
    159	ROUND2(BX, CX, DX, BP, AX, 34)
    160	ROUND2(AX, BX, CX, DX, BP, 35)
    161	ROUND2(BP, AX, BX, CX, DX, 36)
    162	ROUND2(DX, BP, AX, BX, CX, 37)
    163	ROUND2(CX, DX, BP, AX, BX, 38)
    164	ROUND2(BX, CX, DX, BP, AX, 39)
    165
    166	ROUND3(AX, BX, CX, DX, BP, 40)
    167	ROUND3(BP, AX, BX, CX, DX, 41)
    168	ROUND3(DX, BP, AX, BX, CX, 42)
    169	ROUND3(CX, DX, BP, AX, BX, 43)
    170	ROUND3(BX, CX, DX, BP, AX, 44)
    171	ROUND3(AX, BX, CX, DX, BP, 45)
    172	ROUND3(BP, AX, BX, CX, DX, 46)
    173	ROUND3(DX, BP, AX, BX, CX, 47)
    174	ROUND3(CX, DX, BP, AX, BX, 48)
    175	ROUND3(BX, CX, DX, BP, AX, 49)
    176	ROUND3(AX, BX, CX, DX, BP, 50)
    177	ROUND3(BP, AX, BX, CX, DX, 51)
    178	ROUND3(DX, BP, AX, BX, CX, 52)
    179	ROUND3(CX, DX, BP, AX, BX, 53)
    180	ROUND3(BX, CX, DX, BP, AX, 54)
    181	ROUND3(AX, BX, CX, DX, BP, 55)
    182	ROUND3(BP, AX, BX, CX, DX, 56)
    183	ROUND3(DX, BP, AX, BX, CX, 57)
    184	ROUND3(CX, DX, BP, AX, BX, 58)
    185	ROUND3(BX, CX, DX, BP, AX, 59)
    186
    187	ROUND4(AX, BX, CX, DX, BP, 60)
    188	ROUND4(BP, AX, BX, CX, DX, 61)
    189	ROUND4(DX, BP, AX, BX, CX, 62)
    190	ROUND4(CX, DX, BP, AX, BX, 63)
    191	ROUND4(BX, CX, DX, BP, AX, 64)
    192	ROUND4(AX, BX, CX, DX, BP, 65)
    193	ROUND4(BP, AX, BX, CX, DX, 66)
    194	ROUND4(DX, BP, AX, BX, CX, 67)
    195	ROUND4(CX, DX, BP, AX, BX, 68)
    196	ROUND4(BX, CX, DX, BP, AX, 69)
    197	ROUND4(AX, BX, CX, DX, BP, 70)
    198	ROUND4(BP, AX, BX, CX, DX, 71)
    199	ROUND4(DX, BP, AX, BX, CX, 72)
    200	ROUND4(CX, DX, BP, AX, BX, 73)
    201	ROUND4(BX, CX, DX, BP, AX, 74)
    202	ROUND4(AX, BX, CX, DX, BP, 75)
    203	ROUND4(BP, AX, BX, CX, DX, 76)
    204	ROUND4(DX, BP, AX, BX, CX, 77)
    205	ROUND4(CX, DX, BP, AX, BX, 78)
    206	ROUND4(BX, CX, DX, BP, AX, 79)
    207
// Fold the block's result into the running state (Davies-Meyer add).
    208	ADDL	R11, AX
    209	ADDL	R12, BX
    210	ADDL	R13, CX
    211	ADDL	R14, DX
    212	ADDL	R15, BP
    213
    214	ADDQ	$64, SI
    215	CMPQ	SI, DI
    216	JB	loop
    217
// Write the final state back to *dig; the pointer is reloaded from the
// frame because BP was clobbered to hold e.
    218end:
    219	MOVQ	dig+0(FP), DI
    220	MOVL	AX, (0*4)(DI)
    221	MOVL	BX, (1*4)(DI)
    222	MOVL	CX, (2*4)(DI)
    223	MOVL	DX, (3*4)(DI)
    224	MOVL	BP, (4*4)(DI)
    225	RET
   226
   227
   228// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
   229// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
   230// From http://software.intel.com/en-us/articles
   231// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
   232// This implementation is 2x unrolled, and interleaves vector instructions,
   233// used to precompute W, with scalar computation of current round
   234// for optimal scheduling.
   235
   236// Trivial helper macros.
// UPDATE_HASH adds the five working registers into the digest words at R9
// and stores the sums back (R9 is the digest pointer in the AVX2 path).
    237#define UPDATE_HASH(A,TB,C,D,E) \
    238	ADDL	(R9), A \
    239	MOVL	A, (R9) \
    240	ADDL	4(R9), TB \
    241	MOVL	TB, 4(R9) \
    242	ADDL	8(R9), C \
    243	MOVL	C, 8(R9) \
    244	ADDL	12(R9), D \
    245	MOVL	D, 12(R9) \
    246	ADDL	16(R9), E \
    247	MOVL	E, 16(R9)
   248
   249
   250
   251// Helper macros for PRECALC, which does precomputations
// Load 16 message bytes from the odd block (R10) into X0.
    252#define PRECALC_0(OFFSET) \
    253	VMOVDQU   OFFSET(R10),X0
    254
// Insert 16 bytes from the even block (R13) as the high lane of Y0.
    255#define PRECALC_1(OFFSET) \
    256	VINSERTI128 $1, OFFSET(R13), Y0, Y0
    257
// Byte-swap both lanes to big-endian word order using the mask in Y10.
    258#define PRECALC_2(YREG) \
    259	VPSHUFB Y10, Y0, YREG
    260
// Add the round constant K (from the constants block at R8) to the words.
    261#define PRECALC_4(YREG,K_OFFSET) \
    262	VPADDD K_OFFSET(R8), YREG, Y0
    263
// Store the precomputed K+w values to the temp buffer at R14.
    264#define PRECALC_7(OFFSET) \
    265	VMOVDQU Y0, (OFFSET*2)(R14)
    266
    267
    268// Message scheduling pre-compute for rounds 0-15
    269// R13 is a pointer to even 64-byte block
    270// R10 is a pointer to odd 64-byte block
    271// R14 is a pointer to temp buffer
    272// X0 is used as temp register
    273// YREG is clobbered as part of computation
    274// OFFSET chooses 16 byte chunk within a block
    275// R8 is a pointer to constants block
    276// K_OFFSET chooses K constants relevant to this round
    277// X10 holds swap mask
    278#define PRECALC_00_15(OFFSET,YREG) \
    279	PRECALC_0(OFFSET) \
    280	PRECALC_1(OFFSET) \
    281	PRECALC_2(YREG) \
    282	PRECALC_4(YREG,0x0) \
    283	PRECALC_7(OFFSET)
   284
   285
   286// Helper macros for PRECALC_16_31
// Steps 16-23 implement w[i] = rol1(w[i-3]^w[i-8]^w[i-14]^w[i-16]) on four
// words at a time; the rol1 is done as (w<<1)|(w>>31) across PRECALC_19/20,
// with Y9 carrying the shifted-in low words (VPSLLDQ/VPSLLD/VPSRLD pieces).
    287#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
    288	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
    289	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
    290
    291#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
    292	VPXOR  REG_SUB_8, REG, REG \
    293	VPXOR  REG_SUB_16, Y0, Y0
    294
    295#define PRECALC_18(REG) \
    296	VPXOR Y0, REG, REG \
    297	VPSLLDQ $12, REG, Y9
    298
    299#define PRECALC_19(REG) \
    300	VPSLLD $1, REG, Y0 \
    301	VPSRLD $31, REG, REG
    302
    303#define PRECALC_20(REG) \
    304	VPOR REG, Y0, Y0 \
    305	VPSLLD $2, Y9,  REG
    306
    307#define PRECALC_21(REG) \
    308	VPSRLD $30, Y9, Y9 \
    309	VPXOR REG, Y0, Y0
    310
// Finish the new schedule words in REG, then store K+w to the temp buffer.
    311#define PRECALC_23(REG,K_OFFSET,OFFSET) \
    312	VPXOR Y9, Y0, REG \
    313	VPADDD K_OFFSET(R8), REG, Y0 \
    314	VMOVDQU Y0, (OFFSET)(R14)
    315
    316// Message scheduling pre-compute for rounds 16-31
    317// calculating last 32 w[i] values in 8 XMM registers
    318// pre-calculate K+w[i] values and store to mem
    319// for later load by ALU add instruction.
    320// "brute force" vectorization for rounds 16-31 only
    321// due to w[i]->w[i-3] dependency.
    322// clobbers 5 input ymm registers REG_SUB*
    323// uses X0 and X9 as temp registers
    324// As always, R8 is a pointer to constants block
    325// and R14 is a pointer to temp buffer
    326#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
    327	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
    328	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
    329	PRECALC_18(REG) \
    330	PRECALC_19(REG) \
    331	PRECALC_20(REG) \
    332	PRECALC_21(REG) \
    333	PRECALC_23(REG,K_OFFSET,OFFSET)
   334
   335
   336// Helper macros for PRECALC_32_79
// Steps 32-39 implement the rewritten recurrence (see comment below):
// w[i] = rol2(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]); w[i-6] is built from
// the -8 and -4 vectors with VPALIGNR, and rol2 is (w<<2)|(w>>30).
    337#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
    338	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
    339
    340#define PRECALC_33(REG_SUB_28,REG) \
    341	VPXOR REG_SUB_28, REG, REG
    342
    343#define PRECALC_34(REG_SUB_16) \
    344	VPXOR REG_SUB_16, Y0, Y0
    345
    346#define PRECALC_35(REG) \
    347	VPXOR Y0, REG, REG
    348
    349#define PRECALC_36(REG) \
    350	VPSLLD $2, REG, Y0
    351
    352#define PRECALC_37(REG) \
    353	VPSRLD $30, REG, REG \
    354	VPOR REG, Y0, REG
    355
// Store K+w for this group of rounds to the temp buffer at R14.
    356#define PRECALC_39(REG,K_OFFSET,OFFSET) \
    357	VPADDD K_OFFSET(R8), REG, Y0 \
    358	VMOVDQU Y0, (OFFSET)(R14)
    359
    360// Message scheduling pre-compute for rounds 32-79
    361// In SHA-1 specification we have:
    362// w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
    363// Which is the same as:
    364// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
    365// This allows for more efficient vectorization,
    366// since w[i]->w[i-3] dependency is broken
    367#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
    368	PRECALC_32(REG_SUB_8,REG_SUB_4) \
    369	PRECALC_33(REG_SUB_28,REG) \
    370	PRECALC_34(REG_SUB_16) \
    371	PRECALC_35(REG) \
    372	PRECALC_36(REG) \
    373	PRECALC_37(REG) \
    374	PRECALC_39(REG,K_OFFSET,OFFSET)
    375
// PRECALC runs the whole 80-round schedule for a pair of blocks, writing
// K+w values at 0x20-sized strides into the temp buffer; the Y registers
// rotate through the groups in the fixed pattern below.
    376#define PRECALC \
    377	PRECALC_00_15(0,Y15) \
    378	PRECALC_00_15(0x10,Y14) \
    379	PRECALC_00_15(0x20,Y13) \
    380	PRECALC_00_15(0x30,Y12) \
    381	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
    382	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
    383	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
    384	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
    385	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
    386	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
    387	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
    388	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
    389	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
    390	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
    391	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
    392	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
    393	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
    394	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
    395	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
    396	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
   397
   398// Macros calculating individual rounds have general form
   399// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
   400// CALC_ROUND_{PRE,POST} macros follow
   401
// CALC_F1_PRE: e += K+w (loaded from the temp buffer at R15), adds the
// F value carried over from the previous round (in REG_B), and starts the
// next round's rotations: R12 = a>>>5 (rol 27), REG_B = a>>>2 (rol 30).
// ANDNL computes ~a&c for the Ch function finished in CALC_F1_POST.
    402#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
    403	ADDL OFFSET(R15),REG_E \
    404	ANDNL REG_C,REG_A,BP \
    405	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
    406	RORXL $0x1b, REG_A, R12 \
    407	RORXL $2, REG_A, REG_B \       // for next round
// BP holds ~b&d from the matching CALC_F1_PRE.
    409// Calculate F for the next round
    410#define CALC_F1_POST(REG_A,REG_B,REG_E) \
    411	ANDL REG_B,REG_A \             // b&c
    412	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
    413	LEAL (REG_E)(R12*1), REG_E     // E += A >>> 5
   414
   415
   416// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
// CALC_0: first round of a block pair. Because the F value is normally
// carried over from the previous round's macro, the very first round must
// compute its own F1 inline before entering the CALC_F1_PRE/POST pipeline.
// Each CALC_n also interleaves one PRECALC_* vector step for scheduling.
    417#define CALC_0 \
    418	MOVL SI, BX \ // Precalculating first round
    419	RORXL $2, SI, SI \
    420	ANDNL AX, BX, BP \
    421	ANDL DI, BX \
    422	XORL BP, BX \
    423	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
    424	PRECALC_0(0x80) \
    425	CALC_F1_POST(CX,SI,DX)
    426
    427#define CALC_1 \
    428	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
    429	PRECALC_1(0x80) \
    430	CALC_F1_POST(DX,BX,AX)
    431
    432#define CALC_2 \
    433	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
    434	PRECALC_2(Y15) \
    435	CALC_F1_POST(AX,CX,DI)
    436
    437#define CALC_3 \
    438	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
    439	CALC_F1_POST(DI,DX,SI)
    440
    441#define CALC_4 \
    442	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
    443	PRECALC_4(Y15,0x0) \
    444	CALC_F1_POST(SI,AX,BX)
    445
    446#define CALC_5 \
    447	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
    448	CALC_F1_POST(BX,DI,CX)
    449
    450#define CALC_6 \
    451	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
    452	CALC_F1_POST(CX,SI,DX)
    453
    454#define CALC_7 \
    455	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
    456	PRECALC_7(0x0) \
    457	CALC_F1_POST(DX,BX,AX)
    458
    459#define CALC_8 \
    460	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
    461	PRECALC_0(0x90) \
    462	CALC_F1_POST(AX,CX,DI)
    463
    464#define CALC_9 \
    465	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
    466	PRECALC_1(0x90) \
    467	CALC_F1_POST(DI,DX,SI)
    468
    469#define CALC_10 \
    470	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
    471	PRECALC_2(Y14) \
    472	CALC_F1_POST(SI,AX,BX)
    473
    474#define CALC_11 \
    475	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
    476	CALC_F1_POST(BX,DI,CX)
    477
    478#define CALC_12 \
    479	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
    480	PRECALC_4(Y14,0x0) \
    481	CALC_F1_POST(CX,SI,DX)
    482
    483#define CALC_13 \
    484	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
    485	CALC_F1_POST(DX,BX,AX)
    486
    487#define CALC_14 \
    488	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
    489	CALC_F1_POST(AX,CX,DI)
    490
    491#define CALC_15 \
    492	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
    493	PRECALC_7(0x10) \
    494	CALC_F1_POST(DI,DX,SI)
    495
    496#define CALC_16 \
    497	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
    498	PRECALC_0(0xa0) \
    499	CALC_F1_POST(SI,AX,BX)
    500
    501#define CALC_17 \
    502	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
    503	PRECALC_1(0xa0) \
    504	CALC_F1_POST(BX,DI,CX)
    505
    506#define CALC_18 \
    507	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
    508	PRECALC_2(Y13) \
    509	CALC_F1_POST(CX,SI,DX)
   510
   511
// CALC_F2_PRE: same round skeleton as CALC_F1_PRE but without the ANDN,
// since F2 (parity) needs no Ch precomputation.
    512#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
    513	ADDL OFFSET(R15),REG_E \
    514	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
    515	RORXL $0x1b, REG_A, R12 \
    516	RORXL $2, REG_A, REG_B \       // for next round
// CALC_F2_POST: F2 = b ^ c ^ d for the next round; e += a>>>5 (R12).
    518#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
    519	XORL REG_B, REG_A \
    520	ADDL R12, REG_E \
    521        XORL REG_C, REG_A
   522
// CALC_19..CALC_38: rounds 19-38 of the first block. The switch to the F2
// machinery happens at round 19 (not 20) because each macro precomputes F
// for the NEXT round; the same one-round-early pattern repeats at 39/59.
    523#define CALC_19 \
    524	CALC_F2_PRE(0x8c,DX,CX,AX) \
    525	CALC_F2_POST(DX,BX,SI,AX)
    526
    527#define CALC_20 \
    528	CALC_F2_PRE(0xa0,AX,DX,DI) \
    529	PRECALC_4(Y13,0x0) \
    530	CALC_F2_POST(AX,CX,BX,DI)
    531
    532#define CALC_21 \
    533	CALC_F2_PRE(0xa4,DI,AX,SI) \
    534	CALC_F2_POST(DI,DX,CX,SI)
    535
    536#define CALC_22 \
    537	CALC_F2_PRE(0xa8,SI,DI,BX) \
    538	CALC_F2_POST(SI,AX,DX,BX)
    539
    540#define CALC_23 \
    541	CALC_F2_PRE(0xac,BX,SI,CX) \
    542	PRECALC_7(0x20) \
    543	CALC_F2_POST(BX,DI,AX,CX)
    544
    545#define CALC_24 \
    546	CALC_F2_PRE(0xc0,CX,BX,DX) \
    547	PRECALC_0(0xb0) \
    548	CALC_F2_POST(CX,SI,DI,DX)
    549
    550#define CALC_25 \
    551	CALC_F2_PRE(0xc4,DX,CX,AX) \
    552	PRECALC_1(0xb0) \
    553	CALC_F2_POST(DX,BX,SI,AX)
    554
    555#define CALC_26 \
    556	CALC_F2_PRE(0xc8,AX,DX,DI) \
    557	PRECALC_2(Y12) \
    558	CALC_F2_POST(AX,CX,BX,DI)
    559
    560#define CALC_27 \
    561	CALC_F2_PRE(0xcc,DI,AX,SI) \
    562	CALC_F2_POST(DI,DX,CX,SI)
    563
    564#define CALC_28 \
    565	CALC_F2_PRE(0xe0,SI,DI,BX) \
    566	PRECALC_4(Y12,0x0) \
    567	CALC_F2_POST(SI,AX,DX,BX)
    568
    569#define CALC_29 \
    570	CALC_F2_PRE(0xe4,BX,SI,CX) \
    571	CALC_F2_POST(BX,DI,AX,CX)
    572
    573#define CALC_30 \
    574	CALC_F2_PRE(0xe8,CX,BX,DX) \
    575	CALC_F2_POST(CX,SI,DI,DX)
    576
    577#define CALC_31 \
    578	CALC_F2_PRE(0xec,DX,CX,AX) \
    579	PRECALC_7(0x30) \
    580	CALC_F2_POST(DX,BX,SI,AX)
    581
    582#define CALC_32 \
    583	CALC_F2_PRE(0x100,AX,DX,DI) \
    584	PRECALC_16(Y15,Y14,Y12,Y8) \
    585	CALC_F2_POST(AX,CX,BX,DI)
    586
    587#define CALC_33 \
    588	CALC_F2_PRE(0x104,DI,AX,SI) \
    589	PRECALC_17(Y15,Y13,Y8) \
    590	CALC_F2_POST(DI,DX,CX,SI)
    591
    592#define CALC_34 \
    593	CALC_F2_PRE(0x108,SI,DI,BX) \
    594	PRECALC_18(Y8) \
    595	CALC_F2_POST(SI,AX,DX,BX)
    596
    597#define CALC_35 \
    598	CALC_F2_PRE(0x10c,BX,SI,CX) \
    599	PRECALC_19(Y8) \
    600	CALC_F2_POST(BX,DI,AX,CX)
    601
    602#define CALC_36 \
    603	CALC_F2_PRE(0x120,CX,BX,DX) \
    604	PRECALC_20(Y8) \
    605	CALC_F2_POST(CX,SI,DI,DX)
    606
    607#define CALC_37 \
    608	CALC_F2_PRE(0x124,DX,CX,AX) \
    609	PRECALC_21(Y8) \
    610	CALC_F2_POST(DX,BX,SI,AX)
    611
    612#define CALC_38 \
    613	CALC_F2_PRE(0x128,AX,DX,DI) \
    614	CALC_F2_POST(AX,CX,BX,DI)
   615
   616
// CALC_F3_PRE: only the K+w load; everything else for the Maj rounds
// happens in CALC_F3_POST.
    617#define CALC_F3_PRE(OFFSET,REG_E) \
    618	ADDL OFFSET(R15),REG_E
    619
// CALC_F3_POST: e += F(prev, in REG_TB) + a>>>5; precompute the next
// round's F3 = Maj(b,c,d) = (b&c) | ((a|b)&c ... i.e. (a&b)|((a|b)&c))
// into REG_A, and REG_TB = a>>>2 for the next round. BP is scratch.
    620#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
    621	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
    622	MOVL REG_B, BP \
    623	ORL  REG_A, BP \
    624	RORXL $0x1b, REG_A, R12 \
    625	RORXL $2, REG_A, REG_TB \
    626	ANDL REG_C, BP \		// Calculate F for the next round
    627	ANDL REG_B, REG_A \
    628	ORL  BP, REG_A \
    629	ADDL R12, REG_E
   630
// CALC_39..CALC_79: rounds 39-79 of the first block. F3 (Maj) machinery is
// used from round 39 and F2 again from round 59 — one round early in each
// case because each macro precomputes F for the next round.
    631#define CALC_39 \
    632	CALC_F3_PRE(0x12c,SI) \
    633	PRECALC_23(Y8,0x0,0x80) \
    634	CALC_F3_POST(DI,DX,CX,SI,AX)
    635
    636#define CALC_40 \
    637	CALC_F3_PRE(0x140,BX) \
    638	PRECALC_16(Y14,Y13,Y8,Y7) \
    639	CALC_F3_POST(SI,AX,DX,BX,DI)
    640
    641#define CALC_41 \
    642	CALC_F3_PRE(0x144,CX) \
    643	PRECALC_17(Y14,Y12,Y7) \
    644	CALC_F3_POST(BX,DI,AX,CX,SI)
    645
    646#define CALC_42 \
    647	CALC_F3_PRE(0x148,DX) \
    648	PRECALC_18(Y7) \
    649	CALC_F3_POST(CX,SI,DI,DX,BX)
    650
    651#define CALC_43 \
    652	CALC_F3_PRE(0x14c,AX) \
    653	PRECALC_19(Y7) \
    654	CALC_F3_POST(DX,BX,SI,AX,CX)
    655
    656#define CALC_44 \
    657	CALC_F3_PRE(0x160,DI) \
    658	PRECALC_20(Y7) \
    659	CALC_F3_POST(AX,CX,BX,DI,DX)
    660
    661#define CALC_45 \
    662	CALC_F3_PRE(0x164,SI) \
    663	PRECALC_21(Y7) \
    664	CALC_F3_POST(DI,DX,CX,SI,AX)
    665
    666#define CALC_46 \
    667	CALC_F3_PRE(0x168,BX) \
    668	CALC_F3_POST(SI,AX,DX,BX,DI)
    669
// CALC_47 inlines the body of PRECALC_23(Y7,0x20,0xa0) between the scalar
// halves (same three vector instructions, spelled out).
    670#define CALC_47 \
    671	CALC_F3_PRE(0x16c,CX) \
    672	VPXOR Y9, Y0, Y7 \
    673	VPADDD 0x20(R8), Y7, Y0 \
    674	VMOVDQU Y0, 0xa0(R14) \
    675	CALC_F3_POST(BX,DI,AX,CX,SI)
    676
    677#define CALC_48 \
    678	CALC_F3_PRE(0x180,DX) \
    679	PRECALC_16(Y13,Y12,Y7,Y5) \
    680	CALC_F3_POST(CX,SI,DI,DX,BX)
    681
    682#define CALC_49 \
    683	CALC_F3_PRE(0x184,AX) \
    684	PRECALC_17(Y13,Y8,Y5) \
    685	CALC_F3_POST(DX,BX,SI,AX,CX)
    686
    687#define CALC_50 \
    688	CALC_F3_PRE(0x188,DI) \
    689	PRECALC_18(Y5) \
    690	CALC_F3_POST(AX,CX,BX,DI,DX)
    691
    692#define CALC_51 \
    693	CALC_F3_PRE(0x18c,SI) \
    694	PRECALC_19(Y5) \
    695	CALC_F3_POST(DI,DX,CX,SI,AX)
    696
    697#define CALC_52 \
    698	CALC_F3_PRE(0x1a0,BX) \
    699	PRECALC_20(Y5) \
    700	CALC_F3_POST(SI,AX,DX,BX,DI)
    701
    702#define CALC_53 \
    703	CALC_F3_PRE(0x1a4,CX) \
    704	PRECALC_21(Y5) \
    705	CALC_F3_POST(BX,DI,AX,CX,SI)
    706
    707#define CALC_54 \
    708	CALC_F3_PRE(0x1a8,DX) \
    709	CALC_F3_POST(CX,SI,DI,DX,BX)
    710
    711#define CALC_55 \
    712	CALC_F3_PRE(0x1ac,AX) \
    713	PRECALC_23(Y5,0x20,0xc0) \
    714	CALC_F3_POST(DX,BX,SI,AX,CX)
    715
    716#define CALC_56 \
    717	CALC_F3_PRE(0x1c0,DI) \
    718	PRECALC_16(Y12,Y8,Y5,Y3) \
    719	CALC_F3_POST(AX,CX,BX,DI,DX)
    720
    721#define CALC_57 \
    722	CALC_F3_PRE(0x1c4,SI) \
    723	PRECALC_17(Y12,Y7,Y3) \
    724	CALC_F3_POST(DI,DX,CX,SI,AX)
    725
    726#define CALC_58 \
    727	CALC_F3_PRE(0x1c8,BX) \
    728	PRECALC_18(Y3) \
    729	CALC_F3_POST(SI,AX,DX,BX,DI)
    730
// Round 59 already precomputes F2 for round 60 (F-type switch, as at 19/39).
    731#define CALC_59 \
    732	CALC_F2_PRE(0x1cc,BX,SI,CX) \
    733	PRECALC_19(Y3) \
    734	CALC_F2_POST(BX,DI,AX,CX)
    735
    736#define CALC_60 \
    737	CALC_F2_PRE(0x1e0,CX,BX,DX) \
    738	PRECALC_20(Y3) \
    739	CALC_F2_POST(CX,SI,DI,DX)
    740
    741#define CALC_61 \
    742	CALC_F2_PRE(0x1e4,DX,CX,AX) \
    743	PRECALC_21(Y3) \
    744	CALC_F2_POST(DX,BX,SI,AX)
    745
    746#define CALC_62 \
    747	CALC_F2_PRE(0x1e8,AX,DX,DI) \
    748	CALC_F2_POST(AX,CX,BX,DI)
    749
    750#define CALC_63 \
    751	CALC_F2_PRE(0x1ec,DI,AX,SI) \
    752	PRECALC_23(Y3,0x20,0xe0) \
    753	CALC_F2_POST(DI,DX,CX,SI)
    754
    755#define CALC_64 \
    756	CALC_F2_PRE(0x200,SI,DI,BX) \
    757	PRECALC_32(Y5,Y3) \
    758	CALC_F2_POST(SI,AX,DX,BX)
    759
    760#define CALC_65 \
    761	CALC_F2_PRE(0x204,BX,SI,CX) \
    762	PRECALC_33(Y14,Y15) \
    763	CALC_F2_POST(BX,DI,AX,CX)
    764
    765#define CALC_66 \
    766	CALC_F2_PRE(0x208,CX,BX,DX) \
    767	PRECALC_34(Y8) \
    768	CALC_F2_POST(CX,SI,DI,DX)
    769
    770#define CALC_67 \
    771	CALC_F2_PRE(0x20c,DX,CX,AX) \
    772	PRECALC_35(Y15) \
    773	CALC_F2_POST(DX,BX,SI,AX)
    774
    775#define CALC_68 \
    776	CALC_F2_PRE(0x220,AX,DX,DI) \
    777	PRECALC_36(Y15) \
    778	CALC_F2_POST(AX,CX,BX,DI)
    779
    780#define CALC_69 \
    781	CALC_F2_PRE(0x224,DI,AX,SI) \
    782	PRECALC_37(Y15) \
    783	CALC_F2_POST(DI,DX,CX,SI)
    784
    785#define CALC_70 \
    786	CALC_F2_PRE(0x228,SI,DI,BX) \
    787	CALC_F2_POST(SI,AX,DX,BX)
    788
    789#define CALC_71 \
    790	CALC_F2_PRE(0x22c,BX,SI,CX) \
    791	PRECALC_39(Y15,0x20,0x100) \
    792	CALC_F2_POST(BX,DI,AX,CX)
    793
    794#define CALC_72 \
    795	CALC_F2_PRE(0x240,CX,BX,DX) \
    796	PRECALC_32(Y3,Y15) \
    797	CALC_F2_POST(CX,SI,DI,DX)
    798
    799#define CALC_73 \
    800	CALC_F2_PRE(0x244,DX,CX,AX) \
    801	PRECALC_33(Y13,Y14) \
    802	CALC_F2_POST(DX,BX,SI,AX)
    803
    804#define CALC_74 \
    805	CALC_F2_PRE(0x248,AX,DX,DI) \
    806	PRECALC_34(Y7) \
    807	CALC_F2_POST(AX,CX,BX,DI)
    808
    809#define CALC_75 \
    810	CALC_F2_PRE(0x24c,DI,AX,SI) \
    811	PRECALC_35(Y14) \
    812	CALC_F2_POST(DI,DX,CX,SI)
    813
    814#define CALC_76 \
    815	CALC_F2_PRE(0x260,SI,DI,BX) \
    816	PRECALC_36(Y14) \
    817	CALC_F2_POST(SI,AX,DX,BX)
    818
    819#define CALC_77 \
    820	CALC_F2_PRE(0x264,BX,SI,CX) \
    821	PRECALC_37(Y14) \
    822	CALC_F2_POST(BX,DI,AX,CX)
    823
    824#define CALC_78 \
    825	CALC_F2_PRE(0x268,CX,BX,DX) \
    826	CALC_F2_POST(CX,SI,DI,DX)
    827
// CALC_79: last round of the first block — no next-round F is needed, so
// only the e update (K+w, prev F in CX, a>>>5) is done inline.
    828#define CALC_79 \
    829	ADDL 0x26c(R15), AX \
    830	LEAL (AX)(CX*1), AX \
    831	RORXL $0x1b, DX, R12 \
    832	PRECALC_39(Y14,0x20,0x120) \
    833	ADDL R12, AX
   834
   835// Similar to CALC_0
   836#define CALC_80 \
   837	MOVL CX, DX \
   838	RORXL $2, CX, CX \
   839	ANDNL SI, DX, BP \
   840	ANDL BX, DX \
   841	XORL BP, DX \
   842	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
   843	PRECALC_32(Y15,Y14) \
   844	CALC_F1_POST(AX,CX,DI)
   845
   846#define CALC_81 \
   847	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
   848	PRECALC_33(Y12,Y13) \
   849	CALC_F1_POST(DI,DX,SI)
   850
   851#define CALC_82 \
   852	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
   853	PRECALC_34(Y5) \
   854	CALC_F1_POST(SI,AX,BX)
   855
   856#define CALC_83 \
   857	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
   858	PRECALC_35(Y13) \
   859	CALC_F1_POST(BX,DI,CX)
   860
   861#define CALC_84 \
   862	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
   863	PRECALC_36(Y13) \
   864	CALC_F1_POST(CX,SI,DX)
   865
   866#define CALC_85 \
   867	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
   868	PRECALC_37(Y13) \
   869	CALC_F1_POST(DX,BX,AX)
   870
   871#define CALC_86 \
   872	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
   873	CALC_F1_POST(AX,CX,DI)
   874
   875#define CALC_87 \
   876	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
   877	PRECALC_39(Y13,0x40,0x140) \
   878	CALC_F1_POST(DI,DX,SI)
   879
   880#define CALC_88 \
   881	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
   882	PRECALC_32(Y14,Y13) \
   883	CALC_F1_POST(SI,AX,BX)
   884
   885#define CALC_89 \
   886	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
   887	PRECALC_33(Y8,Y12) \
   888	CALC_F1_POST(BX,DI,CX)
   889
   890#define CALC_90 \
   891	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
   892	PRECALC_34(Y3) \
   893	CALC_F1_POST(CX,SI,DX)
   894
   895#define CALC_91 \
   896	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
   897	PRECALC_35(Y12) \
   898	CALC_F1_POST(DX,BX,AX)
   899
   900#define CALC_92 \
   901	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
   902	PRECALC_36(Y12) \
   903	CALC_F1_POST(AX,CX,DI)
   904
   905#define CALC_93 \
   906	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
   907	PRECALC_37(Y12) \
   908	CALC_F1_POST(DI,DX,SI)
   909
   910#define CALC_94 \
   911	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
   912	CALC_F1_POST(SI,AX,BX)
   913
   914#define CALC_95 \
   915	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
   916	PRECALC_39(Y12,0x40,0x160) \
   917	CALC_F1_POST(BX,DI,CX)
   918
   919#define CALC_96 \
   920	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
   921	PRECALC_32(Y13,Y12) \
   922	CALC_F1_POST(CX,SI,DX)
   923
   924#define CALC_97 \
   925	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
   926	PRECALC_33(Y7,Y8) \
   927	CALC_F1_POST(DX,BX,AX)
   928
   929#define CALC_98 \
   930	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
   931	PRECALC_34(Y15) \
   932	CALC_F1_POST(AX,CX,DI)
   933
   934#define CALC_99 \
   935	CALC_F2_PRE(0x9c,DI,AX,SI) \
   936	PRECALC_35(Y8) \
   937	CALC_F2_POST(DI,DX,CX,SI)
   938
   939#define CALC_100 \
   940	CALC_F2_PRE(0xb0,SI,DI,BX) \
   941	PRECALC_36(Y8) \
   942	CALC_F2_POST(SI,AX,DX,BX)
   943
   944#define CALC_101 \
   945	CALC_F2_PRE(0xb4,BX,SI,CX) \
   946	PRECALC_37(Y8) \
   947	CALC_F2_POST(BX,DI,AX,CX)
   948
   949#define CALC_102 \
   950	CALC_F2_PRE(0xb8,CX,BX,DX) \
   951	CALC_F2_POST(CX,SI,DI,DX)
   952
   953#define CALC_103 \
   954	CALC_F2_PRE(0xbc,DX,CX,AX) \
   955	PRECALC_39(Y8,0x40,0x180) \
   956	CALC_F2_POST(DX,BX,SI,AX)
   957
   958#define CALC_104 \
   959	CALC_F2_PRE(0xd0,AX,DX,DI) \
   960	PRECALC_32(Y12,Y8) \
   961	CALC_F2_POST(AX,CX,BX,DI)
   962
   963#define CALC_105 \
   964	CALC_F2_PRE(0xd4,DI,AX,SI) \
   965	PRECALC_33(Y5,Y7) \
   966	CALC_F2_POST(DI,DX,CX,SI)
   967
   968#define CALC_106 \
   969	CALC_F2_PRE(0xd8,SI,DI,BX) \
   970	PRECALC_34(Y14) \
   971	CALC_F2_POST(SI,AX,DX,BX)
   972
   973#define CALC_107 \
   974	CALC_F2_PRE(0xdc,BX,SI,CX) \
   975	PRECALC_35(Y7) \
   976	CALC_F2_POST(BX,DI,AX,CX)
   977
   978#define CALC_108 \
   979	CALC_F2_PRE(0xf0,CX,BX,DX) \
   980	PRECALC_36(Y7) \
   981	CALC_F2_POST(CX,SI,DI,DX)
   982
   983#define CALC_109 \
   984	CALC_F2_PRE(0xf4,DX,CX,AX) \
   985	PRECALC_37(Y7) \
   986	CALC_F2_POST(DX,BX,SI,AX)
   987
   988#define CALC_110 \
   989	CALC_F2_PRE(0xf8,AX,DX,DI) \
   990	CALC_F2_POST(AX,CX,BX,DI)
   991
   992#define CALC_111 \
   993	CALC_F2_PRE(0xfc,DI,AX,SI) \
   994	PRECALC_39(Y7,0x40,0x1a0) \
   995	CALC_F2_POST(DI,DX,CX,SI)
   996
   997#define CALC_112 \
   998	CALC_F2_PRE(0x110,SI,DI,BX) \
   999	PRECALC_32(Y8,Y7) \
  1000	CALC_F2_POST(SI,AX,DX,BX)
  1001
  1002#define CALC_113 \
  1003	CALC_F2_PRE(0x114,BX,SI,CX) \
  1004	PRECALC_33(Y3,Y5) \
  1005	CALC_F2_POST(BX,DI,AX,CX)
  1006
  1007#define CALC_114 \
  1008	CALC_F2_PRE(0x118,CX,BX,DX) \
  1009	PRECALC_34(Y13) \
  1010	CALC_F2_POST(CX,SI,DI,DX)
  1011
  1012#define CALC_115 \
  1013	CALC_F2_PRE(0x11c,DX,CX,AX) \
  1014	PRECALC_35(Y5) \
  1015	CALC_F2_POST(DX,BX,SI,AX)
  1016
  1017#define CALC_116 \
  1018	CALC_F2_PRE(0x130,AX,DX,DI) \
  1019	PRECALC_36(Y5) \
  1020	CALC_F2_POST(AX,CX,BX,DI)
  1021
  1022#define CALC_117 \
  1023	CALC_F2_PRE(0x134,DI,AX,SI) \
  1024	PRECALC_37(Y5) \
  1025	CALC_F2_POST(DI,DX,CX,SI)
  1026
  1027#define CALC_118 \
  1028	CALC_F2_PRE(0x138,SI,DI,BX) \
  1029	CALC_F2_POST(SI,AX,DX,BX)
  1030
  1031#define CALC_119 \
  1032	CALC_F3_PRE(0x13c,CX) \
  1033	PRECALC_39(Y5,0x40,0x1c0) \
  1034	CALC_F3_POST(BX,DI,AX,CX,SI)
  1035
  1036#define CALC_120 \
  1037	CALC_F3_PRE(0x150,DX) \
  1038	PRECALC_32(Y7,Y5) \
  1039	CALC_F3_POST(CX,SI,DI,DX,BX)
  1040
  1041#define CALC_121 \
  1042	CALC_F3_PRE(0x154,AX) \
  1043	PRECALC_33(Y15,Y3) \
  1044	CALC_F3_POST(DX,BX,SI,AX,CX)
  1045
  1046#define CALC_122 \
  1047	CALC_F3_PRE(0x158,DI) \
  1048	PRECALC_34(Y12) \
  1049	CALC_F3_POST(AX,CX,BX,DI,DX)
  1050
  1051#define CALC_123 \
  1052	CALC_F3_PRE(0x15c,SI) \
  1053	PRECALC_35(Y3) \
  1054	CALC_F3_POST(DI,DX,CX,SI,AX)
  1055
  1056#define CALC_124 \
  1057	CALC_F3_PRE(0x170,BX) \
  1058	PRECALC_36(Y3) \
  1059	CALC_F3_POST(SI,AX,DX,BX,DI)
  1060
  1061#define CALC_125 \
  1062	CALC_F3_PRE(0x174,CX) \
  1063	PRECALC_37(Y3) \
  1064	CALC_F3_POST(BX,DI,AX,CX,SI)
  1065
  1066#define CALC_126 \
  1067	CALC_F3_PRE(0x178,DX) \
  1068	CALC_F3_POST(CX,SI,DI,DX,BX)
  1069
  1070#define CALC_127 \
  1071	CALC_F3_PRE(0x17c,AX) \
  1072	PRECALC_39(Y3,0x60,0x1e0) \
  1073	CALC_F3_POST(DX,BX,SI,AX,CX)
  1074
  1075#define CALC_128 \
  1076	CALC_F3_PRE(0x190,DI) \
  1077	PRECALC_32(Y5,Y3) \
  1078	CALC_F3_POST(AX,CX,BX,DI,DX)
  1079
  1080#define CALC_129 \
  1081	CALC_F3_PRE(0x194,SI) \
  1082	PRECALC_33(Y14,Y15) \
  1083	CALC_F3_POST(DI,DX,CX,SI,AX)
  1084
  1085#define CALC_130 \
  1086	CALC_F3_PRE(0x198,BX) \
  1087	PRECALC_34(Y8) \
  1088	CALC_F3_POST(SI,AX,DX,BX,DI)
  1089
  1090#define CALC_131 \
  1091	CALC_F3_PRE(0x19c,CX) \
  1092	PRECALC_35(Y15) \
  1093	CALC_F3_POST(BX,DI,AX,CX,SI)
  1094
  1095#define CALC_132 \
  1096	CALC_F3_PRE(0x1b0,DX) \
  1097	PRECALC_36(Y15) \
  1098	CALC_F3_POST(CX,SI,DI,DX,BX)
  1099
  1100#define CALC_133 \
  1101	CALC_F3_PRE(0x1b4,AX) \
  1102	PRECALC_37(Y15) \
  1103	CALC_F3_POST(DX,BX,SI,AX,CX)
  1104
  1105#define CALC_134 \
  1106	CALC_F3_PRE(0x1b8,DI) \
  1107	CALC_F3_POST(AX,CX,BX,DI,DX)
  1108
  1109#define CALC_135 \
  1110	CALC_F3_PRE(0x1bc,SI) \
  1111	PRECALC_39(Y15,0x60,0x200) \
  1112	CALC_F3_POST(DI,DX,CX,SI,AX)
  1113
// CALC_136 .. CALC_143: the round function switches from F3
// ("majority") to F2 ("parity") at CALC_139 — note CALC_F2_PRE takes
// a third register operand, so its signature differs from
// CALC_F3_PRE.  The precalc interleave continues on Y14.
#define CALC_136 \
	CALC_F3_PRE(0x1d0,BX) \
	PRECALC_32(Y3,Y15) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_137 \
	CALC_F3_PRE(0x1d4,CX) \
	PRECALC_33(Y13,Y14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

// Last F3 round of this run.
#define CALC_138 \
	CALC_F3_PRE(0x1d8,DX) \
	PRECALC_34(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

// First F2 ("parity") round of the final twenty.
#define CALC_139 \
	CALC_F2_PRE(0x1dc,DX,CX,AX) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_140 \
	CALC_F2_PRE(0x1f0,AX,DX,DI) \
	PRECALC_36(Y14) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_141 \
	CALC_F2_PRE(0x1f4,DI,AX,SI) \
	PRECALC_37(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

// No precalc step in this slot (see note at CALC_126).
#define CALC_142 \
	CALC_F2_PRE(0x1f8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_143 \
	CALC_F2_PRE(0x1fc,BX,SI,CX) \
	PRECALC_39(Y14,0x60,0x220) \
	CALC_F2_POST(BX,DI,AX,CX)
  1152
// CALC_144 .. CALC_151: F2 ("parity") rounds, interleaved with
// precalc work on the Y13 schedule vector.
#define CALC_144 \
	CALC_F2_PRE(0x210,CX,BX,DX) \
	PRECALC_32(Y15,Y14) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_145 \
	CALC_F2_PRE(0x214,DX,CX,AX) \
	PRECALC_33(Y12,Y13) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_146 \
	CALC_F2_PRE(0x218,AX,DX,DI) \
	PRECALC_34(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_147 \
	CALC_F2_PRE(0x21c,DI,AX,SI) \
	PRECALC_35(Y13) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_148 \
	CALC_F2_PRE(0x230,SI,DI,BX) \
	PRECALC_36(Y13) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_149 \
	CALC_F2_PRE(0x234,BX,SI,CX) \
	PRECALC_37(Y13) \
	CALC_F2_POST(BX,DI,AX,CX)

// No precalc step in this slot (see note at CALC_126).
#define CALC_150 \
	CALC_F2_PRE(0x238,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_151 \
	CALC_F2_PRE(0x23c,DX,CX,AX) \
	PRECALC_39(Y13,0x60,0x240) \
	CALC_F2_POST(DX,BX,SI,AX)
  1191
// CALC_152 .. CALC_159: the last eight rounds of the second lane.
#define CALC_152 \
	CALC_F2_PRE(0x250,AX,DX,DI) \
	PRECALC_32(Y14,Y13) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_153 \
	CALC_F2_PRE(0x254,DI,AX,SI) \
	PRECALC_33(Y8,Y12) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_154 \
	CALC_F2_PRE(0x258,SI,DI,BX) \
	PRECALC_34(Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_155 \
	CALC_F2_PRE(0x25c,BX,SI,CX) \
	PRECALC_35(Y12) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_156 \
	CALC_F2_PRE(0x270,CX,BX,DX) \
	PRECALC_36(Y12) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_157 \
	CALC_F2_PRE(0x274,DX,CX,AX) \
	PRECALC_37(Y12) \
	CALC_F2_POST(DX,BX,SI,AX)

// No precalc step in this slot (see note at CALC_126).
#define CALC_158 \
	CALC_F2_PRE(0x278,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

// CALC_159: the very last round, written out inline rather than via
// CALC_F2_PRE/CALC_F2_POST: no following round exists, so only the
// state update is needed — add the precomputed W+K word at scratch
// offset 0x27c, add the round-function value carried in AX from the
// previous round, and add rol(a,5) computed with RORXL (rotate right
// by 0x1b == 27, i.e. rotate left by 5; RORX is BMI2 and leaves the
// flags untouched).
// NOTE(review): the roles of AX/DI/SI here follow the register
// rotation of the preceding rounds; confirm against CALC_F2_POST.
#define CALC_159 \
	ADDL 0x27c(R15),SI \
	LEAL (SI)(AX*1), SI \
	RORXL $0x1b, DI, R12 \
	PRECALC_39(Y12,0x60,0x260) \
	ADDL R12, SI
  1232
  1233
  1234
// CALC drives the whole compression.  It loads the five hash words,
// precomputes W+K for the first two blocks, then runs a fully
// unrolled round sequence in a loop that consumes two 64-byte blocks
// per iteration: CALC_0..CALC_79 for one lane, CALC_80..CALC_159 for
// the other, each interleaved with schedule precalc for upcoming
// blocks.  R14/R15 address the two halves of the W+K scratch area
// and are exchanged each iteration so one half is consumed while the
// other is refilled.  R8 (the address of the constant table, set in
// blockAVX2 below) doubles as an out-of-band sentinel for the block
// pointers R10/R13: when a pointer would pass the end (R11), CMOVQCC
// snaps it to R8, and the loop head exits when R10 == R8.  Note the
// RET is inside this macro, not in the TEXT body.
#define CALC \
	MOVL	(R9), CX \ // load state: h0..h4 -> CX,SI,DI,AX,DX
	MOVL	4(R9), SI \
	MOVL	8(R9), DI \
	MOVL	12(R9), AX \
	MOVL	16(R9), DX \
	MOVQ    SP, R14 \ // R14/R15: the two halves of the W+K scratch buffer
	LEAQ    (2*4*80+32)(SP), R15 \ // second half starts 2 blocks * 80 rounds * 4 bytes (+32) in
	PRECALC \ // Precalc WK for first 2 blocks
	XCHGQ   R15, R14 \ // consume the half just filled; refill the other
loop: \  // this loop is unrolled
	CMPQ    R10, R8 \ // we use R8 value (set below) as a signal of a last block
	JNE	begin \
	VZEROUPPER \
	RET \
begin: \
	CALC_0 \ // first lane: 80 rounds interleaved with precalc for the next pair
	CALC_1 \
	CALC_2 \
	CALC_3 \
	CALC_4 \
	CALC_5 \
	CALC_6 \
	CALC_7 \
	CALC_8 \
	CALC_9 \
	CALC_10 \
	CALC_11 \
	CALC_12 \
	CALC_13 \
	CALC_14 \
	CALC_15 \
	CALC_16 \
	CALC_17 \
	CALC_18 \
	CALC_19 \
	CALC_20 \
	CALC_21 \
	CALC_22 \
	CALC_23 \
	CALC_24 \
	CALC_25 \
	CALC_26 \
	CALC_27 \
	CALC_28 \
	CALC_29 \
	CALC_30 \
	CALC_31 \
	CALC_32 \
	CALC_33 \
	CALC_34 \
	CALC_35 \
	CALC_36 \
	CALC_37 \
	CALC_38 \
	CALC_39 \
	CALC_40 \
	CALC_41 \
	CALC_42 \
	CALC_43 \
	CALC_44 \
	CALC_45 \
	CALC_46 \
	CALC_47 \
	CALC_48 \
	CALC_49 \
	CALC_50 \
	CALC_51 \
	CALC_52 \
	CALC_53 \
	CALC_54 \
	CALC_55 \
	CALC_56 \
	CALC_57 \
	CALC_58 \
	CALC_59 \
	ADDQ $128, R10 \ // move to next even-64-byte block
	CMPQ R10, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \ // signal the last iteration smartly
	CALC_60 \
	CALC_61 \
	CALC_62 \
	CALC_63 \
	CALC_64 \
	CALC_65 \
	CALC_66 \
	CALC_67 \
	CALC_68 \
	CALC_69 \
	CALC_70 \
	CALC_71 \
	CALC_72 \
	CALC_73 \
	CALC_74 \
	CALC_75 \
	CALC_76 \
	CALC_77 \
	CALC_78 \
	CALC_79 \
	UPDATE_HASH(AX,DX,BX,SI,DI) \ // fold working registers back into the digest at R9
	CMPQ R10, R8 \ // is current block the last one?
	JE loop\
	MOVL DX, CX \ // set up registers for the second lane (CALC_80..159)
	CALC_80 \
	CALC_81 \
	CALC_82 \
	CALC_83 \
	CALC_84 \
	CALC_85 \
	CALC_86 \
	CALC_87 \
	CALC_88 \
	CALC_89 \
	CALC_90 \
	CALC_91 \
	CALC_92 \
	CALC_93 \
	CALC_94 \
	CALC_95 \
	CALC_96 \
	CALC_97 \
	CALC_98 \
	CALC_99 \
	CALC_100 \
	CALC_101 \
	CALC_102 \
	CALC_103 \
	CALC_104 \
	CALC_105 \
	CALC_106 \
	CALC_107 \
	CALC_108 \
	CALC_109 \
	CALC_110 \
	CALC_111 \
	CALC_112 \
	CALC_113 \
	CALC_114 \
	CALC_115 \
	CALC_116 \
	CALC_117 \
	CALC_118 \
	CALC_119 \
	CALC_120 \
	CALC_121 \
	CALC_122 \
	CALC_123 \
	CALC_124 \
	CALC_125 \
	CALC_126 \
	CALC_127 \
	CALC_128 \
	CALC_129 \
	CALC_130 \
	CALC_131 \
	CALC_132 \
	CALC_133 \
	CALC_134 \
	CALC_135 \
	CALC_136 \
	CALC_137 \
	CALC_138 \
	CALC_139 \
	ADDQ $128, R13 \ // move to next odd-64-byte block (R13 = second-lane pointer)
	CMPQ R13, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \ // signal the last iteration via the R10 sentinel
	CALC_140 \
	CALC_141 \
	CALC_142 \
	CALC_143 \
	CALC_144 \
	CALC_145 \
	CALC_146 \
	CALC_147 \
	CALC_148 \
	CALC_149 \
	CALC_150 \
	CALC_151 \
	CALC_152 \
	CALC_153 \
	CALC_154 \
	CALC_155 \
	CALC_156 \
	CALC_157 \
	CALC_158 \
	CALC_159 \
	UPDATE_HASH(SI,DI,DX,CX,BX) \
	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation
	MOVL	DI, SI \
	MOVL	DX, DI \
	MOVL	BX, DX \
	MOVL	CX, AX \
	MOVL	R12, CX \
	XCHGQ   R15, R14 \ // swap W+K scratch halves for the next iteration
	JMP     loop
  1430
  1431
  1432
// func blockAVX2(dig *digest, p []byte)
// AVX2 SHA-1 block function.  Sets up the register plan consumed by
// the CALC macro above (which contains the loop and the RET), after
// rounding len(p) down to a whole number of 64-byte blocks.
TEXT ·blockAVX2(SB),$1408-32

	MOVQ	dig+0(FP),	DI // DI = pointer to the five 32-bit hash words
	MOVQ	p_base+8(FP),	SI // SI = &p[0]
	MOVQ	p_len+16(FP),	DX // DX = len(p)
	SHRQ	$6,		DX // round length down to a
	SHLQ	$6,		DX // multiple of the 64-byte block size

	MOVQ	$K_XMM_AR<>(SB), R8 // constant table; also reused as the "done" sentinel in CALC

	MOVQ	DI, R9 // R9  = hash state, read/written by CALC/UPDATE_HASH
	MOVQ	SI, R10 // R10 = even-block pointer (first lane)
	LEAQ	64(SI), R13 // R13 = odd-block pointer (second lane)

	ADDQ	SI, DX
	ADDQ	$64, DX
	MOVQ	DX, R11 // R11 = end-of-input limit compared against R10/R13

	CMPQ	R13, R11
	CMOVQCC	R8, R13 // fewer than two blocks: point the second lane at the
	                // constant table — presumably so its loads stay in
	                // bounds while the results are discarded (TODO confirm)

	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10 // Y10 = VPSHUFB mask: byte-swap each 32-bit word

	CALC // RET is inside macros
  1457
// K_XMM_AR holds the four SHA-1 round constants (FIPS 180-4), each
// repeated eight times so a single 256-bit YMM load yields the
// constant in every 32-bit lane when adding K into the schedule.
// K1 = 0x5a827999, rounds 0-19:
DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
// K2 = 0x6ed9eba1, rounds 20-39:
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
// K3 = 0x8f1bbcdc, rounds 40-59:
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
// K4 = 0xca62c1d6, rounds 60-79:
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128
  1491
// BSWAP_SHUFB_CTL is a VPSHUFB control mask that reverses the four
// bytes of every 32-bit word (byte indexes 3,2,1,0 in little-endian
// memory order), duplicated across both 128-bit lanes so it can be
// applied with a single YMM shuffle (loaded into Y10 in blockAVX2).
DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
// High 128-bit lane: identical pattern.
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32

View as plain text