Text file src/crypto/sha512/sha512block_amd64.s

Documentation: crypto/sha512

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	// SHA512 block routine. See sha512block.go for Go equivalent.
     8	//
     9	// The algorithm is detailed in FIPS 180-4:
    10	//
    11	//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12	//
    13	// Wt = Mt; for 0 <= t <= 15
    14	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    15	//
    16	// a = H0
    17	// b = H1
    18	// c = H2
    19	// d = H3
    20	// e = H4
    21	// f = H5
    22	// g = H6
    23	// h = H7
    24	//
    25	// for t = 0 to 79 {
    26	//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    27	//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    28	//    h = g
    29	//    g = f
    30	//    f = e
    31	//    e = d + T1
    32	//    d = c
    33	//    c = b
    34	//    b = a
    35	//    a = T1 + T2
    36	// }
    37	//
    38	// H0 = a + H0
    39	// H1 = b + H1
    40	// H2 = c + H2
    41	// H3 = d + H3
    42	// H4 = e + H4
    43	// H5 = f + H5
    44	// H6 = g + H6
    45	// H7 = h + H7
    46	
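// For orientation, here is a rough Go sketch of this routine (the maintained
// pure-Go version lives in sha512block.go; _K is the package's round-constant
// table, the small helpers are sketched next to the macros below, and imports
// of encoding/binary and math/bits are assumed):
//
//	func blockSketch(dig *[8]uint64, p []byte) {
//		var w [80]uint64
//		for len(p) >= 128 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint64(p[t*8:])
//			}
//			for t := 16; t < 80; t++ {
//				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//			}
//			a, b, c, d, e, f, g, h := dig[0], dig[1], dig[2], dig[3], dig[4], dig[5], dig[6], dig[7]
//			for t := 0; t < 80; t++ {
//				t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//				t2 := bigSigma0(a) + maj(a, b, c)
//				h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			dig[0] += a; dig[1] += b; dig[2] += c; dig[3] += d
//			dig[4] += e; dig[5] += f; dig[6] += g; dig[7] += h
//			p = p[128:]
//		}
//	}
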
    47	// Wt = Mt; for 0 <= t <= 15
    48	#define MSGSCHEDULE0(index) \
    49		MOVQ	(index*8)(SI), AX; \
    50		BSWAPQ	AX; \
    51		MOVQ	AX, (index*8)(BP)
    52	
    53	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    54	//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
    55	//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
    56	#define MSGSCHEDULE1(index) \
    57		MOVQ	((index-2)*8)(BP), AX; \
    58		MOVQ	AX, CX; \
    59		RORQ	$19, AX; \
    60		MOVQ	CX, DX; \
    61		RORQ	$61, CX; \
    62		SHRQ	$6, DX; \
    63		MOVQ	((index-15)*8)(BP), BX; \
    64		XORQ	CX, AX; \
    65		MOVQ	BX, CX; \
    66		XORQ	DX, AX; \
    67		RORQ	$1, BX; \
    68		MOVQ	CX, DX; \
    69		SHRQ	$7, DX; \
    70		RORQ	$8, CX; \
    71		ADDQ	((index-7)*8)(BP), AX; \
    72		XORQ	CX, BX; \
    73		XORQ	DX, BX; \
    74		ADDQ	((index-16)*8)(BP), BX; \
    75		ADDQ	BX, AX; \
    76		MOVQ	AX, ((index)*8)(BP)
    77	
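// In Go terms, the two small sigma functions computed above are roughly
// (bits.RotateLeft64 with a negative count rotates right):
//
//	func sigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
//	}
//
//	func sigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
//	}
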
    78	// Calculate T1 in AX - uses AX, CX and DX registers.
    79	// h is also used as an accumulator. Wt is passed in AX.
    80	//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    81	//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
    82	//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    83	#define SHA512T1(const, e, f, g, h) \
    84		MOVQ	$const, DX; \
    85		ADDQ	AX, h; \
    86		MOVQ	e, AX; \
    87		ADDQ	DX, h; \
    88		MOVQ	e, CX; \
    89		RORQ	$14, AX; \
    90		MOVQ	e, DX; \
    91		RORQ	$18, CX; \
    92		XORQ	CX, AX; \
    93		MOVQ	e, CX; \
    94		RORQ	$41, DX; \
    95		ANDQ	f, CX; \
    96		XORQ	AX, DX; \
    97		MOVQ	e, AX; \
    98		NOTQ	AX; \
    99		ADDQ	DX, h; \
   100		ANDQ	g, AX; \
   101		XORQ	CX, AX; \
   102		ADDQ	h, AX
   103	
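// A Go sketch of the two helpers that make up T1:
//
//	func bigSigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
//	}
//
//	func ch(x, y, z uint64) uint64 {
//		return (x & y) ^ (^x & z)
//	}
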
   104	// Calculate T2 in BX - uses BX, CX, DX and DI registers.
   105	//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   106	//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
   107	//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   108	#define SHA512T2(a, b, c) \
   109		MOVQ	a, DI; \
   110		MOVQ	c, BX; \
   111		RORQ	$28, DI; \
   112		MOVQ	a, DX; \
   113		ANDQ	b, BX; \
   114		RORQ	$34, DX; \
   115		MOVQ	a, CX; \
   116		ANDQ	c, CX; \
   117		XORQ	DX, DI; \
   118		XORQ	CX, BX; \
   119		MOVQ	a, DX; \
   120		MOVQ	b, CX; \
   121		RORQ	$39, DX; \
   122		ANDQ	a, CX; \
   123		XORQ	CX, BX; \
   124		XORQ	DX, DI; \
   125		ADDQ	DI, BX
   126	
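// A Go sketch of the two helpers that make up T2:
//
//	func bigSigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
//	}
//
//	func maj(x, y, z uint64) uint64 {
//		return (x & y) ^ (x & z) ^ (y & z)
//	}
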
   127	// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   128	// The values for e and a are stored in d and h, ready for rotation.
   129	#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
   130		SHA512T1(const, e, f, g, h); \
   131		SHA512T2(a, b, c); \
   132		MOVQ	BX, h; \
   133		ADDQ	AX, d; \
   134		ADDQ	AX, h
   135	
   136	#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
   137		MSGSCHEDULE0(index); \
   138		SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   139	
   140	#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
   141		MSGSCHEDULE1(index); \
   142		SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   143	
   144	TEXT ·blockAMD64(SB),0,$648-32
   145		MOVQ	p_base+8(FP), SI
   146		MOVQ	p_len+16(FP), DX
   147		SHRQ	$7, DX
   148		SHLQ	$7, DX
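	// DX = p_len rounded down to a whole number of 128-byte blocks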
   149	
   150		LEAQ	(SI)(DX*1), DI
   151		MOVQ	DI, 640(SP)
   152		CMPQ	SI, DI
   153		JEQ	end
   154	
   155		MOVQ	dig+0(FP), BP
   156		MOVQ	(0*8)(BP), R8		// a = H0
   157		MOVQ	(1*8)(BP), R9		// b = H1
   158		MOVQ	(2*8)(BP), R10		// c = H2
   159		MOVQ	(3*8)(BP), R11		// d = H3
   160		MOVQ	(4*8)(BP), R12		// e = H4
   161		MOVQ	(5*8)(BP), R13		// f = H5
   162		MOVQ	(6*8)(BP), R14		// g = H6
   163		MOVQ	(7*8)(BP), R15		// h = H7
   164	
   165	loop:
   166		MOVQ	SP, BP			// message schedule
   167	
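	// The eight working variables live in R8..R15. Each round invocation below
	// passes them rotated by one position (round 0 uses R8..R15 as a..h, round 1
	// uses R15, R8..R14, and so on), so the rotation in the reference algorithm
	// is done by renaming registers rather than by moving values.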
   168		SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
   169		SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
   170		SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
   171		SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
   172		SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
   173		SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
   174		SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
   175		SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
   176		SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
   177		SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
   178		SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
   179		SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
   180		SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
   181		SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
   182		SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
   183		SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
   184	
   185		SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
   186		SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
   187		SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
   188		SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
   189		SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
   190		SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
   191		SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
   192		SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
   193		SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
   194		SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
   195		SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
   196		SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
   197		SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
   198		SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
   199		SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
   200		SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
   201		SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
   202		SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
   203		SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
   204		SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
   205		SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
   206		SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
   207		SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
   208		SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
   209		SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
   210		SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
   211		SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
   212		SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
   213		SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
   214		SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
   215		SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
   216		SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
   217		SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
   218		SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
   219		SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
   220		SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
   221		SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
   222		SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
   223		SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
   224		SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
   225		SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
   226		SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
   227		SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
   228		SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
   229		SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
   230		SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
   231		SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
   232		SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
   233		SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
   234		SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
   235		SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
   236		SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
   237		SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
   238		SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
   239		SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
   240		SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
   241		SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
   242		SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
   243		SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
   244		SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
   245		SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
   246		SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
   247		SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
   248		SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
   249	
   250		MOVQ	dig+0(FP), BP
   251		ADDQ	(0*8)(BP), R8	// H0 = a + H0
   252		MOVQ	R8, (0*8)(BP)
   253		ADDQ	(1*8)(BP), R9	// H1 = b + H1
   254		MOVQ	R9, (1*8)(BP)
   255		ADDQ	(2*8)(BP), R10	// H2 = c + H2
   256		MOVQ	R10, (2*8)(BP)
   257		ADDQ	(3*8)(BP), R11	// H3 = d + H3
   258		MOVQ	R11, (3*8)(BP)
   259		ADDQ	(4*8)(BP), R12	// H4 = e + H4
   260		MOVQ	R12, (4*8)(BP)
   261		ADDQ	(5*8)(BP), R13	// H5 = f + H5
   262		MOVQ	R13, (5*8)(BP)
   263		ADDQ	(6*8)(BP), R14	// H6 = g + H6
   264		MOVQ	R14, (6*8)(BP)
   265		ADDQ	(7*8)(BP), R15	// H7 = h + H7
   266		MOVQ	R15, (7*8)(BP)
   267	
   268		ADDQ	$128, SI
   269		CMPQ	SI, 640(SP)
   270		JB	loop
   271	
   272	end:
   273		RET
   274	
   275	// The version below is based on the Intel white paper "Fast SHA512
   276	// Implementations on Intel Architecture Processors":
   277	// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
   278	// The AVX2 version is by Intel; the same algorithm is used in the Linux kernel:
   279	// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
   280	
   281	// James Guilford <james.guilford@intel.com>
   282	// Kirk Yap <kirk.s.yap@intel.com>
   283	// Tim Chen <tim.c.chen@linux.intel.com>
   284	// David Cote <david.m.cote@intel.com>
   285	// Aleksey Sidorov <aleksey.sidorov@intel.com>
   286	
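// Stack frame layout for blockAVX2 ($56): 32 bytes for the current group of
// K+W values (frame_YFER), an 8-byte pass counter (frame_SRND), the saved
// input pointer (frame_INP) and the input end pointer (frame_INPEND).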
   287	#define YFER_SIZE (4*8)
   288	#define SRND_SIZE (1*8)
   289	#define INP_SIZE (1*8)
   290	
   291	#define frame_YFER (0)
   292	#define frame_SRND (frame_YFER + YFER_SIZE)
   293	#define frame_INP (frame_SRND + SRND_SIZE)
   294	#define frame_INPEND (frame_INP + INP_SIZE)
   295	
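// addm(mem, reg): reg += mem; mem = reg. Used at the end of each block to add
// the working variables back into the digest in memory.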
   296	#define addm(p1, p2) \
   297		ADDQ p1, p2; \
   298		MOVQ p2, p1
   299	
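// COPY_YMM_AND_BSWAP(dst, src, mask): load 32 bytes of message from src and
// byte-swap each 64-bit word into host order using the given PSHUFB mask.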
   300	#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
   301		VMOVDQU p2, p1;    \
   302		VPSHUFB p3, p1, p1
   303	
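// MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL): YDST = {YSRC1, YSRC2} >> RVAL bytes,
// i.e. the low 256 bits of the 512-bit concatenation shifted right by RVAL
// bytes. With RVAL = 8 this extracts message-word windows (such as the W[t-15]
// and W[t-7] terms) that straddle two adjacent schedule registers.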
   304	#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
   305		VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
   306		VPALIGNR   $RVAL, YSRC2, YDST, YDST
   307	
   308	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
   309	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
   310	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
   311	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
   312	
   313	GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
   314	
   315	DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
   316	DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
   317	DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   318	DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   319	
   320	GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
   321	
   322	TEXT ·blockAVX2(SB), NOSPLIT, $56-32
   323		MOVQ dig+0(FP), SI
   324		MOVQ p_base+8(FP), DI
   325		MOVQ p_len+16(FP), DX
   326	
   327		SHRQ $7, DX
   328		SHLQ $7, DX
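	// DX = p_len rounded down to a whole number of 128-byte blocks;
	// a zero result makes the JZ below skip to done_hash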
   329	
   330		JZ   done_hash
   331		ADDQ DI, DX
   332		MOVQ DX, frame_INPEND(SP)
   333	
   334		MOVQ (0*8)(SI), AX
   335		MOVQ (1*8)(SI), BX
   336		MOVQ (2*8)(SI), CX
   337		MOVQ (3*8)(SI), R8
   338		MOVQ (4*8)(SI), DX
   339		MOVQ (5*8)(SI), R9
   340		MOVQ (6*8)(SI), R10
   341		MOVQ (7*8)(SI), R11
   342	
   343		MOVQ    $PSHUFFLE_BYTE_FLIP_MASK<>(SB), R12
   344		VMOVDQU (R12), Y9
   345	
   346	loop0:
   347		MOVQ ·_K+0(SB), BP
   348	
   349	// byte swap first 16 qwords
   350		COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
   351		COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
   352		COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
   353		COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
   354	
   355		MOVQ DI, frame_INP(SP)
   356	
   357	// schedule the remaining 64 message qwords, in 4 passes of loop1 (16 rounds and 16 newly scheduled qwords per pass)
   358		MOVQ $4, frame_SRND(SP)
   359	
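// Each pass of loop1 covers 16 rounds: for each of Y4..Y7 it adds the next
// four round constants from the ·_K table (BP), spills K+W to frame_YFER, and
// interleaves four compression rounds with scheduling the next four message
// qwords.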
   360	loop1:
   361		VPADDQ  (BP), Y4, Y0
   362		VMOVDQU Y0, frame_YFER(SP)
   363	
   364		MY_VPALIGNR(Y0, Y7, Y6, 8)
   365	
   366		VPADDQ Y4, Y0, Y0
   367	
   368		MY_VPALIGNR(Y1, Y5, Y4, 8)
   369	
   370		VPSRLQ $1, Y1, Y2
   371		VPSLLQ $(64-1), Y1, Y3
   372		VPOR   Y2, Y3, Y3
   373	
   374		VPSRLQ $7, Y1, Y8
   375	
   376		MOVQ  AX, DI
   377		RORXQ $41, DX, R13
   378		RORXQ $18, DX, R14
   379		ADDQ  frame_YFER(SP), R11
   380		ORQ   CX, DI
   381		MOVQ  R9, R15
   382		RORXQ $34, AX, R12
   383	
   384		XORQ  R14, R13
   385		XORQ  R10, R15
   386		RORXQ $14, DX, R14
   387	
   388		ANDQ  DX, R15
   389		XORQ  R14, R13
   390		RORXQ $39, AX, R14
   391		ADDQ  R11, R8
   392	
   393		ANDQ  BX, DI
   394		XORQ  R12, R14
   395		RORXQ $28, AX, R12
   396	
   397		XORQ R10, R15
   398		XORQ R12, R14
   399		MOVQ AX, R12
   400		ANDQ CX, R12
   401	
   402		ADDQ R13, R15
   403		ORQ  R12, DI
   404		ADDQ R14, R11
   405	
   406		ADDQ R15, R8
   407	
   408		ADDQ R15, R11
   409		ADDQ DI, R11
   410	
   411		VPSRLQ $8, Y1, Y2
   412		VPSLLQ $(64-8), Y1, Y1
   413		VPOR   Y2, Y1, Y1
   414	
   415		VPXOR Y8, Y3, Y3
   416		VPXOR Y1, Y3, Y1
   417	
   418		VPADDQ Y1, Y0, Y0
   419	
   420		VPERM2F128 $0x0, Y0, Y0, Y4
   421	
   422		MOVQ $MASK_YMM_LO<>(SB), R13
   423	
   424		VPAND (R13), Y0, Y0
   425	
   426		VPERM2F128 $0x11, Y7, Y7, Y2
   427		VPSRLQ     $6, Y2, Y8
   428	
   429		MOVQ  R11, DI
   430		RORXQ $41, R8, R13
   431		RORXQ $18, R8, R14
   432		ADDQ  1*8+frame_YFER(SP), R10
   433		ORQ   BX, DI
   434	
   435		MOVQ  DX, R15
   436		RORXQ $34, R11, R12
   437		XORQ  R14, R13
   438		XORQ  R9, R15
   439	
   440		RORXQ $14, R8, R14
   441		XORQ  R14, R13
   442		RORXQ $39, R11, R14
   443		ANDQ  R8, R15
   444		ADDQ  R10, CX
   445	
   446		ANDQ AX, DI
   447		XORQ R12, R14
   448	
   449		RORXQ $28, R11, R12
   450		XORQ  R9, R15
   451	
   452		XORQ R12, R14
   453		MOVQ R11, R12
   454		ANDQ BX, R12
   455		ADDQ R13, R15
   456	
   457		ORQ  R12, DI
   458		ADDQ R14, R10
   459	
   460		ADDQ R15, CX
   461		ADDQ R15, R10
   462		ADDQ DI, R10
   463	
   464		VPSRLQ $19, Y2, Y3
   465		VPSLLQ $(64-19), Y2, Y1
   466		VPOR   Y1, Y3, Y3
   467		VPXOR  Y3, Y8, Y8
   468		VPSRLQ $61, Y2, Y3
   469		VPSLLQ $(64-61), Y2, Y1
   470		VPOR   Y1, Y3, Y3
   471		VPXOR  Y3, Y8, Y8
   472	
   473		VPADDQ Y8, Y4, Y4
   474	
   475		VPSRLQ $6, Y4, Y8
   476	
   477		MOVQ  R10, DI
   478		RORXQ $41, CX, R13
   479		ADDQ  2*8+frame_YFER(SP), R9
   480	
   481		RORXQ $18, CX, R14
   482		ORQ   AX, DI
   483		MOVQ  R8, R15
   484		XORQ  DX, R15
   485	
   486		RORXQ $34, R10, R12
   487		XORQ  R14, R13
   488		ANDQ  CX, R15
   489	
   490		RORXQ $14, CX, R14
   491		ADDQ  R9, BX
   492		ANDQ  R11, DI
   493	
   494		XORQ  R14, R13
   495		RORXQ $39, R10, R14
   496		XORQ  DX, R15
   497	
   498		XORQ  R12, R14
   499		RORXQ $28, R10, R12
   500	
   501		XORQ R12, R14
   502		MOVQ R10, R12
   503		ANDQ AX, R12
   504		ADDQ R13, R15
   505	
   506		ORQ  R12, DI
   507		ADDQ R14, R9
   508		ADDQ R15, BX
   509		ADDQ R15, R9
   510	
   511		ADDQ DI, R9
   512	
   513		VPSRLQ $19, Y4, Y3
   514		VPSLLQ $(64-19), Y4, Y1
   515		VPOR   Y1, Y3, Y3
   516		VPXOR  Y3, Y8, Y8
   517		VPSRLQ $61, Y4, Y3
   518		VPSLLQ $(64-61), Y4, Y1
   519		VPOR   Y1, Y3, Y3
   520		VPXOR  Y3, Y8, Y8
   521	
   522		VPADDQ Y8, Y0, Y2
   523	
   524		VPBLENDD $0xF0, Y2, Y4, Y4
   525	
   526		MOVQ  R9, DI
   527		RORXQ $41, BX, R13
   528		RORXQ $18, BX, R14
   529		ADDQ  3*8+frame_YFER(SP), DX
   530		ORQ   R11, DI
   531	
   532		MOVQ  CX, R15
   533		RORXQ $34, R9, R12
   534		XORQ  R14, R13
   535		XORQ  R8, R15
   536	
   537		RORXQ $14, BX, R14
   538		ANDQ  BX, R15
   539		ADDQ  DX, AX
   540		ANDQ  R10, DI
   541	
   542		XORQ R14, R13
   543		XORQ R8, R15
   544	
   545		RORXQ $39, R9, R14
   546		ADDQ  R13, R15
   547	
   548		XORQ R12, R14
   549		ADDQ R15, AX
   550	
   551		RORXQ $28, R9, R12
   552	
   553		XORQ R12, R14
   554		MOVQ R9, R12
   555		ANDQ R11, R12
   556		ORQ  R12, DI
   557	
   558		ADDQ R14, DX
   559		ADDQ R15, DX
   560		ADDQ DI, DX
   561	
   562		VPADDQ  1*32(BP), Y5, Y0
   563		VMOVDQU Y0, frame_YFER(SP)
   564	
   565		MY_VPALIGNR(Y0, Y4, Y7, 8)
   566	
   567		VPADDQ Y5, Y0, Y0
   568	
   569		MY_VPALIGNR(Y1, Y6, Y5, 8)
   570	
   571		VPSRLQ $1, Y1, Y2
   572		VPSLLQ $(64-1), Y1, Y3
   573		VPOR   Y2, Y3, Y3
   574	
   575		VPSRLQ $7, Y1, Y8
   576	
   577		MOVQ  DX, DI
   578		RORXQ $41, AX, R13
   579		RORXQ $18, AX, R14
   580		ADDQ  frame_YFER(SP), R8
   581		ORQ   R10, DI
   582		MOVQ  BX, R15
   583		RORXQ $34, DX, R12
   584	
   585		XORQ  R14, R13
   586		XORQ  CX, R15
   587		RORXQ $14, AX, R14
   588	
   589		ANDQ  AX, R15
   590		XORQ  R14, R13
   591		RORXQ $39, DX, R14
   592		ADDQ  R8, R11
   593	
   594		ANDQ  R9, DI
   595		XORQ  R12, R14
   596		RORXQ $28, DX, R12
   597	
   598		XORQ CX, R15
   599		XORQ R12, R14
   600		MOVQ DX, R12
   601		ANDQ R10, R12
   602	
   603		ADDQ R13, R15
   604		ORQ  R12, DI
   605		ADDQ R14, R8
   606	
   607		ADDQ R15, R11
   608	
   609		ADDQ R15, R8
   610		ADDQ DI, R8
   611	
   612		VPSRLQ $8, Y1, Y2
   613		VPSLLQ $(64-8), Y1, Y1
   614		VPOR   Y2, Y1, Y1
   615	
   616		VPXOR Y8, Y3, Y3
   617		VPXOR Y1, Y3, Y1
   618	
   619		VPADDQ Y1, Y0, Y0
   620	
   621		VPERM2F128 $0x0, Y0, Y0, Y5
   622	
   623		MOVQ  $MASK_YMM_LO<>(SB), R13
   624		VPAND (R13), Y0, Y0
   625	
   626		VPERM2F128 $0x11, Y4, Y4, Y2
   627		VPSRLQ     $6, Y2, Y8
   628	
   629		MOVQ  R8, DI
   630		RORXQ $41, R11, R13
   631		RORXQ $18, R11, R14
   632		ADDQ  1*8+frame_YFER(SP), CX
   633		ORQ   R9, DI
   634	
   635		MOVQ  AX, R15
   636		RORXQ $34, R8, R12
   637		XORQ  R14, R13
   638		XORQ  BX, R15
   639	
   640		RORXQ $14, R11, R14
   641		XORQ  R14, R13
   642		RORXQ $39, R8, R14
   643		ANDQ  R11, R15
   644		ADDQ  CX, R10
   645	
   646		ANDQ DX, DI
   647		XORQ R12, R14
   648	
   649		RORXQ $28, R8, R12
   650		XORQ  BX, R15
   651	
   652		XORQ R12, R14
   653		MOVQ R8, R12
   654		ANDQ R9, R12
   655		ADDQ R13, R15
   656	
   657		ORQ  R12, DI
   658		ADDQ R14, CX
   659	
   660		ADDQ R15, R10
   661		ADDQ R15, CX
   662		ADDQ DI, CX
   663	
   664		VPSRLQ $19, Y2, Y3
   665		VPSLLQ $(64-19), Y2, Y1
   666		VPOR   Y1, Y3, Y3
   667		VPXOR  Y3, Y8, Y8
   668		VPSRLQ $61, Y2, Y3
   669		VPSLLQ $(64-61), Y2, Y1
   670		VPOR   Y1, Y3, Y3
   671		VPXOR  Y3, Y8, Y8
   672	
   673		VPADDQ Y8, Y5, Y5
   674	
   675		VPSRLQ $6, Y5, Y8
   676	
   677		MOVQ  CX, DI
   678		RORXQ $41, R10, R13
   679		ADDQ  2*8+frame_YFER(SP), BX
   680	
   681		RORXQ $18, R10, R14
   682		ORQ   DX, DI
   683		MOVQ  R11, R15
   684		XORQ  AX, R15
   685	
   686		RORXQ $34, CX, R12
   687		XORQ  R14, R13
   688		ANDQ  R10, R15
   689	
   690		RORXQ $14, R10, R14
   691		ADDQ  BX, R9
   692		ANDQ  R8, DI
   693	
   694		XORQ  R14, R13
   695		RORXQ $39, CX, R14
   696		XORQ  AX, R15
   697	
   698		XORQ  R12, R14
   699		RORXQ $28, CX, R12
   700	
   701		XORQ R12, R14
   702		MOVQ CX, R12
   703		ANDQ DX, R12
   704		ADDQ R13, R15
   705	
   706		ORQ  R12, DI
   707		ADDQ R14, BX
   708		ADDQ R15, R9
   709		ADDQ R15, BX
   710	
   711		ADDQ DI, BX
   712	
   713		VPSRLQ $19, Y5, Y3
   714		VPSLLQ $(64-19), Y5, Y1
   715		VPOR   Y1, Y3, Y3
   716		VPXOR  Y3, Y8, Y8
   717		VPSRLQ $61, Y5, Y3
   718		VPSLLQ $(64-61), Y5, Y1
   719		VPOR   Y1, Y3, Y3
   720		VPXOR  Y3, Y8, Y8
   721	
   722		VPADDQ Y8, Y0, Y2
   723	
   724		VPBLENDD $0xF0, Y2, Y5, Y5
   725	
   726		MOVQ  BX, DI
   727		RORXQ $41, R9, R13
   728		RORXQ $18, R9, R14
   729		ADDQ  3*8+frame_YFER(SP), AX
   730		ORQ   R8, DI
   731	
   732		MOVQ  R10, R15
   733		RORXQ $34, BX, R12
   734		XORQ  R14, R13
   735		XORQ  R11, R15
   736	
   737		RORXQ $14, R9, R14
   738		ANDQ  R9, R15
   739		ADDQ  AX, DX
   740		ANDQ  CX, DI
   741	
   742		XORQ R14, R13
   743		XORQ R11, R15
   744	
   745		RORXQ $39, BX, R14
   746		ADDQ  R13, R15
   747	
   748		XORQ R12, R14
   749		ADDQ R15, DX
   750	
   751		RORXQ $28, BX, R12
   752	
   753		XORQ R12, R14
   754		MOVQ BX, R12
   755		ANDQ R8, R12
   756		ORQ  R12, DI
   757	
   758		ADDQ R14, AX
   759		ADDQ R15, AX
   760		ADDQ DI, AX
   761	
   762		VPADDQ  2*32(BP), Y6, Y0
   763		VMOVDQU Y0, frame_YFER(SP)
   764	
   765		MY_VPALIGNR(Y0, Y5, Y4, 8)
   766	
   767		VPADDQ Y6, Y0, Y0
   768	
   769		MY_VPALIGNR(Y1, Y7, Y6, 8)
   770	
   771		VPSRLQ $1, Y1, Y2
   772		VPSLLQ $(64-1), Y1, Y3
   773		VPOR   Y2, Y3, Y3
   774	
   775		VPSRLQ $7, Y1, Y8
   776	
   777		MOVQ  AX, DI
   778		RORXQ $41, DX, R13
   779		RORXQ $18, DX, R14
   780		ADDQ  frame_YFER(SP), R11
   781		ORQ   CX, DI
   782		MOVQ  R9, R15
   783		RORXQ $34, AX, R12
   784	
   785		XORQ  R14, R13
   786		XORQ  R10, R15
   787		RORXQ $14, DX, R14
   788	
   789		ANDQ  DX, R15
   790		XORQ  R14, R13
   791		RORXQ $39, AX, R14
   792		ADDQ  R11, R8
   793	
   794		ANDQ  BX, DI
   795		XORQ  R12, R14
   796		RORXQ $28, AX, R12
   797	
   798		XORQ R10, R15
   799		XORQ R12, R14
   800		MOVQ AX, R12
   801		ANDQ CX, R12
   802	
   803		ADDQ R13, R15
   804		ORQ  R12, DI
   805		ADDQ R14, R11
   806	
   807		ADDQ R15, R8
   808	
   809		ADDQ R15, R11
   810		ADDQ DI, R11
   811	
   812		VPSRLQ $8, Y1, Y2
   813		VPSLLQ $(64-8), Y1, Y1
   814		VPOR   Y2, Y1, Y1
   815	
   816		VPXOR Y8, Y3, Y3
   817		VPXOR Y1, Y3, Y1
   818	
   819		VPADDQ Y1, Y0, Y0
   820	
   821		VPERM2F128 $0x0, Y0, Y0, Y6
   822	
   823		MOVQ  $MASK_YMM_LO<>(SB), R13
   824		VPAND (R13), Y0, Y0
   825	
   826		VPERM2F128 $0x11, Y5, Y5, Y2
   827		VPSRLQ     $6, Y2, Y8
   828	
   829		MOVQ  R11, DI
   830		RORXQ $41, R8, R13
   831		RORXQ $18, R8, R14
   832		ADDQ  1*8+frame_YFER(SP), R10
   833		ORQ   BX, DI
   834	
   835		MOVQ  DX, R15
   836		RORXQ $34, R11, R12
   837		XORQ  R14, R13
   838		XORQ  R9, R15
   839	
   840		RORXQ $14, R8, R14
   841		XORQ  R14, R13
   842		RORXQ $39, R11, R14
   843		ANDQ  R8, R15
   844		ADDQ  R10, CX
   845	
   846		ANDQ AX, DI
   847		XORQ R12, R14
   848	
   849		RORXQ $28, R11, R12
   850		XORQ  R9, R15
   851	
   852		XORQ R12, R14
   853		MOVQ R11, R12
   854		ANDQ BX, R12
   855		ADDQ R13, R15
   856	
   857		ORQ  R12, DI
   858		ADDQ R14, R10
   859	
   860		ADDQ R15, CX
   861		ADDQ R15, R10
   862		ADDQ DI, R10
   863	
   864		VPSRLQ $19, Y2, Y3
   865		VPSLLQ $(64-19), Y2, Y1
   866		VPOR   Y1, Y3, Y3
   867		VPXOR  Y3, Y8, Y8
   868		VPSRLQ $61, Y2, Y3
   869		VPSLLQ $(64-61), Y2, Y1
   870		VPOR   Y1, Y3, Y3
   871		VPXOR  Y3, Y8, Y8
   872	
   873		VPADDQ Y8, Y6, Y6
   874	
   875		VPSRLQ $6, Y6, Y8
   876	
   877		MOVQ  R10, DI
   878		RORXQ $41, CX, R13
   879		ADDQ  2*8+frame_YFER(SP), R9
   880	
   881		RORXQ $18, CX, R14
   882		ORQ   AX, DI
   883		MOVQ  R8, R15
   884		XORQ  DX, R15
   885	
   886		RORXQ $34, R10, R12
   887		XORQ  R14, R13
   888		ANDQ  CX, R15
   889	
   890		RORXQ $14, CX, R14
   891		ADDQ  R9, BX
   892		ANDQ  R11, DI
   893	
   894		XORQ  R14, R13
   895		RORXQ $39, R10, R14
   896		XORQ  DX, R15
   897	
   898		XORQ  R12, R14
   899		RORXQ $28, R10, R12
   900	
   901		XORQ R12, R14
   902		MOVQ R10, R12
   903		ANDQ AX, R12
   904		ADDQ R13, R15
   905	
   906		ORQ  R12, DI
   907		ADDQ R14, R9
   908		ADDQ R15, BX
   909		ADDQ R15, R9
   910	
   911		ADDQ DI, R9
   912	
   913		VPSRLQ $19, Y6, Y3
   914		VPSLLQ $(64-19), Y6, Y1
   915		VPOR   Y1, Y3, Y3
   916		VPXOR  Y3, Y8, Y8
   917		VPSRLQ $61, Y6, Y3
   918		VPSLLQ $(64-61), Y6, Y1
   919		VPOR   Y1, Y3, Y3
   920		VPXOR  Y3, Y8, Y8
   921	
   922		VPADDQ Y8, Y0, Y2
   923	
   924		VPBLENDD $0xF0, Y2, Y6, Y6
   925	
   926		MOVQ  R9, DI
   927		RORXQ $41, BX, R13
   928		RORXQ $18, BX, R14
   929		ADDQ  3*8+frame_YFER(SP), DX
   930		ORQ   R11, DI
   931	
   932		MOVQ  CX, R15
   933		RORXQ $34, R9, R12
   934		XORQ  R14, R13
   935		XORQ  R8, R15
   936	
   937		RORXQ $14, BX, R14
   938		ANDQ  BX, R15
   939		ADDQ  DX, AX
   940		ANDQ  R10, DI
   941	
   942		XORQ R14, R13
   943		XORQ R8, R15
   944	
   945		RORXQ $39, R9, R14
   946		ADDQ  R13, R15
   947	
   948		XORQ R12, R14
   949		ADDQ R15, AX
   950	
   951		RORXQ $28, R9, R12
   952	
   953		XORQ R12, R14
   954		MOVQ R9, R12
   955		ANDQ R11, R12
   956		ORQ  R12, DI
   957	
   958		ADDQ R14, DX
   959		ADDQ R15, DX
   960		ADDQ DI, DX
   961	
   962		VPADDQ  3*32(BP), Y7, Y0
   963		VMOVDQU Y0, frame_YFER(SP)
   964		ADDQ    $(4*32), BP
   965	
   966		MY_VPALIGNR(Y0, Y6, Y5, 8)
   967	
   968		VPADDQ Y7, Y0, Y0
   969	
   970		MY_VPALIGNR(Y1, Y4, Y7, 8)
   971	
   972		VPSRLQ $1, Y1, Y2
   973		VPSLLQ $(64-1), Y1, Y3
   974		VPOR   Y2, Y3, Y3
   975	
   976		VPSRLQ $7, Y1, Y8
   977	
   978		MOVQ  DX, DI
   979		RORXQ $41, AX, R13
   980		RORXQ $18, AX, R14
   981		ADDQ  frame_YFER(SP), R8
   982		ORQ   R10, DI
   983		MOVQ  BX, R15
   984		RORXQ $34, DX, R12
   985	
   986		XORQ  R14, R13
   987		XORQ  CX, R15
   988		RORXQ $14, AX, R14
   989	
   990		ANDQ  AX, R15
   991		XORQ  R14, R13
   992		RORXQ $39, DX, R14
   993		ADDQ  R8, R11
   994	
   995		ANDQ  R9, DI
   996		XORQ  R12, R14
   997		RORXQ $28, DX, R12
   998	
   999		XORQ CX, R15
  1000		XORQ R12, R14
  1001		MOVQ DX, R12
  1002		ANDQ R10, R12
  1003	
  1004		ADDQ R13, R15
  1005		ORQ  R12, DI
  1006		ADDQ R14, R8
  1007	
  1008		ADDQ R15, R11
  1009	
  1010		ADDQ R15, R8
  1011		ADDQ DI, R8
  1012	
  1013		VPSRLQ $8, Y1, Y2
  1014		VPSLLQ $(64-8), Y1, Y1
  1015		VPOR   Y2, Y1, Y1
  1016	
  1017		VPXOR Y8, Y3, Y3
  1018		VPXOR Y1, Y3, Y1
  1019	
  1020		VPADDQ Y1, Y0, Y0
  1021	
  1022		VPERM2F128 $0x0, Y0, Y0, Y7
  1023	
  1024		MOVQ  $MASK_YMM_LO<>(SB), R13
  1025		VPAND (R13), Y0, Y0
  1026	
  1027		VPERM2F128 $0x11, Y6, Y6, Y2
  1028		VPSRLQ     $6, Y2, Y8
  1029	
  1030		MOVQ  R8, DI
  1031		RORXQ $41, R11, R13
  1032		RORXQ $18, R11, R14
  1033		ADDQ  1*8+frame_YFER(SP), CX
  1034		ORQ   R9, DI
  1035	
  1036		MOVQ  AX, R15
  1037		RORXQ $34, R8, R12
  1038		XORQ  R14, R13
  1039		XORQ  BX, R15
  1040	
  1041		RORXQ $14, R11, R14
  1042		XORQ  R14, R13
  1043		RORXQ $39, R8, R14
  1044		ANDQ  R11, R15
  1045		ADDQ  CX, R10
  1046	
  1047		ANDQ DX, DI
  1048		XORQ R12, R14
  1049	
  1050		RORXQ $28, R8, R12
  1051		XORQ  BX, R15
  1052	
  1053		XORQ R12, R14
  1054		MOVQ R8, R12
  1055		ANDQ R9, R12
  1056		ADDQ R13, R15
  1057	
  1058		ORQ  R12, DI
  1059		ADDQ R14, CX
  1060	
  1061		ADDQ R15, R10
  1062		ADDQ R15, CX
  1063		ADDQ DI, CX
  1064	
  1065		VPSRLQ $19, Y2, Y3
  1066		VPSLLQ $(64-19), Y2, Y1
  1067		VPOR   Y1, Y3, Y3
  1068		VPXOR  Y3, Y8, Y8
  1069		VPSRLQ $61, Y2, Y3
  1070		VPSLLQ $(64-61), Y2, Y1
  1071		VPOR   Y1, Y3, Y3
  1072		VPXOR  Y3, Y8, Y8
  1073	
  1074		VPADDQ Y8, Y7, Y7
  1075	
  1076		VPSRLQ $6, Y7, Y8
  1077	
  1078		MOVQ  CX, DI
  1079		RORXQ $41, R10, R13
  1080		ADDQ  2*8+frame_YFER(SP), BX
  1081	
  1082		RORXQ $18, R10, R14
  1083		ORQ   DX, DI
  1084		MOVQ  R11, R15
  1085		XORQ  AX, R15
  1086	
  1087		RORXQ $34, CX, R12
  1088		XORQ  R14, R13
  1089		ANDQ  R10, R15
  1090	
  1091		RORXQ $14, R10, R14
  1092		ADDQ  BX, R9
  1093		ANDQ  R8, DI
  1094	
  1095		XORQ  R14, R13
  1096		RORXQ $39, CX, R14
  1097		XORQ  AX, R15
  1098	
  1099		XORQ  R12, R14
  1100		RORXQ $28, CX, R12
  1101	
  1102		XORQ R12, R14
  1103		MOVQ CX, R12
  1104		ANDQ DX, R12
  1105		ADDQ R13, R15
  1106	
  1107		ORQ  R12, DI
  1108		ADDQ R14, BX
  1109		ADDQ R15, R9
  1110		ADDQ R15, BX
  1111	
  1112		ADDQ DI, BX
  1113	
  1114		VPSRLQ $19, Y7, Y3
  1115		VPSLLQ $(64-19), Y7, Y1
  1116		VPOR   Y1, Y3, Y3
  1117		VPXOR  Y3, Y8, Y8
  1118		VPSRLQ $61, Y7, Y3
  1119		VPSLLQ $(64-61), Y7, Y1
  1120		VPOR   Y1, Y3, Y3
  1121		VPXOR  Y3, Y8, Y8
  1122	
  1123		VPADDQ Y8, Y0, Y2
  1124	
  1125		VPBLENDD $0xF0, Y2, Y7, Y7
  1126	
  1127		MOVQ  BX, DI
  1128		RORXQ $41, R9, R13
  1129		RORXQ $18, R9, R14
  1130		ADDQ  3*8+frame_YFER(SP), AX
  1131		ORQ   R8, DI
  1132	
  1133		MOVQ  R10, R15
  1134		RORXQ $34, BX, R12
  1135		XORQ  R14, R13
  1136		XORQ  R11, R15
  1137	
  1138		RORXQ $14, R9, R14
  1139		ANDQ  R9, R15
  1140		ADDQ  AX, DX
  1141		ANDQ  CX, DI
  1142	
  1143		XORQ R14, R13
  1144		XORQ R11, R15
  1145	
  1146		RORXQ $39, BX, R14
  1147		ADDQ  R13, R15
  1148	
  1149		XORQ R12, R14
  1150		ADDQ R15, DX
  1151	
  1152		RORXQ $28, BX, R12
  1153	
  1154		XORQ R12, R14
  1155		MOVQ BX, R12
  1156		ANDQ R8, R12
  1157		ORQ  R12, DI
  1158	
  1159		ADDQ R14, AX
  1160		ADDQ R15, AX
  1161		ADDQ DI, AX
  1162	
  1163		SUBQ $1, frame_SRND(SP)
  1164		JNE  loop1
  1165	
  1166		MOVQ $2, frame_SRND(SP)
  1167	
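// Final 16 rounds: the message words for rounds 64..79 were scheduled by the
// last pass of loop1 and already sit in Y4..Y7, so loop2 only adds the round
// constants and runs the compression rounds (two passes of 8 rounds, sliding
// Y6/Y7 down into Y4/Y5 between passes).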
  1168	loop2:
  1169		VPADDQ  (BP), Y4, Y0
  1170		VMOVDQU Y0, frame_YFER(SP)
  1171	
  1172		MOVQ  R9, R15
  1173		RORXQ $41, DX, R13
  1174		RORXQ $18, DX, R14
  1175		XORQ  R10, R15
  1176	
  1177		XORQ  R14, R13
  1178		RORXQ $14, DX, R14
  1179		ANDQ  DX, R15
  1180	
  1181		XORQ  R14, R13
  1182		RORXQ $34, AX, R12
  1183		XORQ  R10, R15
  1184		RORXQ $39, AX, R14
  1185		MOVQ  AX, DI
  1186	
  1187		XORQ  R12, R14
  1188		RORXQ $28, AX, R12
  1189		ADDQ  frame_YFER(SP), R11
  1190		ORQ   CX, DI
  1191	
  1192		XORQ R12, R14
  1193		MOVQ AX, R12
  1194		ANDQ BX, DI
  1195		ANDQ CX, R12
  1196		ADDQ R13, R15
  1197	
  1198		ADDQ R11, R8
  1199		ORQ  R12, DI
  1200		ADDQ R14, R11
  1201	
  1202		ADDQ R15, R8
  1203	
  1204		ADDQ  R15, R11
  1205		MOVQ  DX, R15
  1206		RORXQ $41, R8, R13
  1207		RORXQ $18, R8, R14
  1208		XORQ  R9, R15
  1209	
  1210		XORQ  R14, R13
  1211		RORXQ $14, R8, R14
  1212		ANDQ  R8, R15
  1213		ADDQ  DI, R11
  1214	
  1215		XORQ  R14, R13
  1216		RORXQ $34, R11, R12
  1217		XORQ  R9, R15
  1218		RORXQ $39, R11, R14
  1219		MOVQ  R11, DI
  1220	
  1221		XORQ  R12, R14
  1222		RORXQ $28, R11, R12
  1223		ADDQ  8*1+frame_YFER(SP), R10
  1224		ORQ   BX, DI
  1225	
  1226		XORQ R12, R14
  1227		MOVQ R11, R12
  1228		ANDQ AX, DI
  1229		ANDQ BX, R12
  1230		ADDQ R13, R15
  1231	
  1232		ADDQ R10, CX
  1233		ORQ  R12, DI
  1234		ADDQ R14, R10
  1235	
  1236		ADDQ R15, CX
  1237	
  1238		ADDQ  R15, R10
  1239		MOVQ  R8, R15
  1240		RORXQ $41, CX, R13
  1241		RORXQ $18, CX, R14
  1242		XORQ  DX, R15
  1243	
  1244		XORQ  R14, R13
  1245		RORXQ $14, CX, R14
  1246		ANDQ  CX, R15
  1247		ADDQ  DI, R10
  1248	
  1249		XORQ  R14, R13
  1250		RORXQ $34, R10, R12
  1251		XORQ  DX, R15
  1252		RORXQ $39, R10, R14
  1253		MOVQ  R10, DI
  1254	
  1255		XORQ  R12, R14
  1256		RORXQ $28, R10, R12
  1257		ADDQ  8*2+frame_YFER(SP), R9
  1258		ORQ   AX, DI
  1259	
  1260		XORQ R12, R14
  1261		MOVQ R10, R12
  1262		ANDQ R11, DI
  1263		ANDQ AX, R12
  1264		ADDQ R13, R15
  1265	
  1266		ADDQ R9, BX
  1267		ORQ  R12, DI
  1268		ADDQ R14, R9
  1269	
  1270		ADDQ R15, BX
  1271	
  1272		ADDQ  R15, R9
  1273		MOVQ  CX, R15
  1274		RORXQ $41, BX, R13
  1275		RORXQ $18, BX, R14
  1276		XORQ  R8, R15
  1277	
  1278		XORQ  R14, R13
  1279		RORXQ $14, BX, R14
  1280		ANDQ  BX, R15
  1281		ADDQ  DI, R9
  1282	
  1283		XORQ  R14, R13
  1284		RORXQ $34, R9, R12
  1285		XORQ  R8, R15
  1286		RORXQ $39, R9, R14
  1287		MOVQ  R9, DI
  1288	
  1289		XORQ  R12, R14
  1290		RORXQ $28, R9, R12
  1291		ADDQ  8*3+frame_YFER(SP), DX
  1292		ORQ   R11, DI
  1293	
  1294		XORQ R12, R14
  1295		MOVQ R9, R12
  1296		ANDQ R10, DI
  1297		ANDQ R11, R12
  1298		ADDQ R13, R15
  1299	
  1300		ADDQ DX, AX
  1301		ORQ  R12, DI
  1302		ADDQ R14, DX
  1303	
  1304		ADDQ R15, AX
  1305	
  1306		ADDQ R15, DX
  1307	
  1308		ADDQ DI, DX
  1309	
  1310		VPADDQ  1*32(BP), Y5, Y0
  1311		VMOVDQU Y0, frame_YFER(SP)
  1312		ADDQ    $(2*32), BP
  1313	
  1314		MOVQ  BX, R15
  1315		RORXQ $41, AX, R13
  1316		RORXQ $18, AX, R14
  1317		XORQ  CX, R15
  1318	
  1319		XORQ  R14, R13
  1320		RORXQ $14, AX, R14
  1321		ANDQ  AX, R15
  1322	
  1323		XORQ  R14, R13
  1324		RORXQ $34, DX, R12
  1325		XORQ  CX, R15
  1326		RORXQ $39, DX, R14
  1327		MOVQ  DX, DI
  1328	
  1329		XORQ  R12, R14
  1330		RORXQ $28, DX, R12
  1331		ADDQ  frame_YFER(SP), R8
  1332		ORQ   R10, DI
  1333	
  1334		XORQ R12, R14
  1335		MOVQ DX, R12
  1336		ANDQ R9, DI
  1337		ANDQ R10, R12
  1338		ADDQ R13, R15
  1339	
  1340		ADDQ R8, R11
  1341		ORQ  R12, DI
  1342		ADDQ R14, R8
  1343	
  1344		ADDQ R15, R11
  1345	
  1346		ADDQ  R15, R8
  1347		MOVQ  AX, R15
  1348		RORXQ $41, R11, R13
  1349		RORXQ $18, R11, R14
  1350		XORQ  BX, R15
  1351	
  1352		XORQ  R14, R13
  1353		RORXQ $14, R11, R14
  1354		ANDQ  R11, R15
  1355		ADDQ  DI, R8
  1356	
  1357		XORQ  R14, R13
  1358		RORXQ $34, R8, R12
  1359		XORQ  BX, R15
  1360		RORXQ $39, R8, R14
  1361		MOVQ  R8, DI
  1362	
  1363		XORQ  R12, R14
  1364		RORXQ $28, R8, R12
  1365		ADDQ  8*1+frame_YFER(SP), CX
  1366		ORQ   R9, DI
  1367	
  1368		XORQ R12, R14
  1369		MOVQ R8, R12
  1370		ANDQ DX, DI
  1371		ANDQ R9, R12
  1372		ADDQ R13, R15
  1373	
  1374		ADDQ CX, R10
  1375		ORQ  R12, DI
  1376		ADDQ R14, CX
  1377	
  1378		ADDQ R15, R10
  1379	
  1380		ADDQ  R15, CX
  1381		MOVQ  R11, R15
  1382		RORXQ $41, R10, R13
  1383		RORXQ $18, R10, R14
  1384		XORQ  AX, R15
  1385	
  1386		XORQ  R14, R13
  1387		RORXQ $14, R10, R14
  1388		ANDQ  R10, R15
  1389		ADDQ  DI, CX
  1390	
  1391		XORQ  R14, R13
  1392		RORXQ $34, CX, R12
  1393		XORQ  AX, R15
  1394		RORXQ $39, CX, R14
  1395		MOVQ  CX, DI
  1396	
  1397		XORQ  R12, R14
  1398		RORXQ $28, CX, R12
  1399		ADDQ  8*2+frame_YFER(SP), BX
  1400		ORQ   DX, DI
  1401	
  1402		XORQ R12, R14
  1403		MOVQ CX, R12
  1404		ANDQ R8, DI
  1405		ANDQ DX, R12
  1406		ADDQ R13, R15
  1407	
  1408		ADDQ BX, R9
  1409		ORQ  R12, DI
  1410		ADDQ R14, BX
  1411	
  1412		ADDQ R15, R9
  1413	
  1414		ADDQ  R15, BX
  1415		MOVQ  R10, R15
  1416		RORXQ $41, R9, R13
  1417		RORXQ $18, R9, R14
  1418		XORQ  R11, R15
  1419	
  1420		XORQ  R14, R13
  1421		RORXQ $14, R9, R14
  1422		ANDQ  R9, R15
  1423		ADDQ  DI, BX
  1424	
  1425		XORQ  R14, R13
  1426		RORXQ $34, BX, R12
  1427		XORQ  R11, R15
  1428		RORXQ $39, BX, R14
  1429		MOVQ  BX, DI
  1430	
  1431		XORQ  R12, R14
  1432		RORXQ $28, BX, R12
  1433		ADDQ  8*3+frame_YFER(SP), AX
  1434		ORQ   R8, DI
  1435	
  1436		XORQ R12, R14
  1437		MOVQ BX, R12
  1438		ANDQ CX, DI
  1439		ANDQ R8, R12
  1440		ADDQ R13, R15
  1441	
  1442		ADDQ AX, DX
  1443		ORQ  R12, DI
  1444		ADDQ R14, AX
  1445	
  1446		ADDQ R15, DX
  1447	
  1448		ADDQ R15, AX
  1449	
  1450		ADDQ DI, AX
  1451	
  1452		VMOVDQU Y6, Y4
  1453		VMOVDQU Y7, Y5
  1454	
  1455		SUBQ $1, frame_SRND(SP)
  1456		JNE  loop2
  1457	
  1458		addm(8*0(SI),AX)
  1459		addm(8*1(SI),BX)
  1460		addm(8*2(SI),CX)
  1461		addm(8*3(SI),R8)
  1462		addm(8*4(SI),DX)
  1463		addm(8*5(SI),R9)
  1464		addm(8*6(SI),R10)
  1465		addm(8*7(SI),R11)
  1466	
  1467		MOVQ frame_INP(SP), DI
  1468		ADDQ $128, DI
  1469		CMPQ DI, frame_INPEND(SP)
  1470		JNE  loop0
  1471	
  1472	done_hash:
  1473		VZEROUPPER
  1474		RET
