...
Run Format

Text file src/crypto/aes/gcm_amd64.s

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     6	// The implementation uses some optimizations as described in:
     7	// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8	//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9	// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10	//     Hardware
    11	
    12	#include "textflag.h"
    13	// B0-B7: the AES blocks worked on in parallel (B1-B7 also cache round keys in the single-block paths).
    14	#define B0 X0
    15	#define B1 X1
    16	#define B2 X2
    17	#define B3 X3
    18	#define B4 X4
    19	#define B5 X5
    20	#define B6 X6
    21	#define B7 X7
    22	// ACC0/ACC1/ACCM: accumulators for the low ($0x00), high ($0x11) and middle carry-less products.
    23	#define ACC0 X8
    24	#define ACC1 X9
    25	#define ACCM X10
    26	// T0-T2: scratch registers; POLY and BSWAP receive the gcmPoly and bswapMask constants.
    27	#define T0 X11
    28	#define T1 X12
    29	#define T2 X13
    30	#define POLY X14
    31	#define BSWAP X15
    32	// bswapMask: PSHUFB control whose byte i selects source byte 15-i, i.e. a full 16-byte reversal.
    33	DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34	DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35	// gcmPoly: constant loaded into POLY and multiplied in by the PCLMULQDQ reduction steps.
    36	DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37	DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38	// andMask: fifteen 16-byte masks; the entry at offset 16*(k-1) keeps the low k bytes (k = 1..15).
    39	DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40	DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41	DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42	DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43	DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44	DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45	DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46	DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47	DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48	DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49	DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50	DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51	DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52	DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53	DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54	DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55	DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56	DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57	DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58	DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59	DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60	DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61	DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62	DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63	DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64	DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65	DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66	DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67	DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68	DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69	// All three tables are file-local (<>), read-only and pointer-free.
    70	GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71	GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72	GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73	
    74	// func hasGCMAsm() bool
    75	// Reports whether CPUID leaf 1 advertises both AES (ECX bit 25) and PCLMULQDQ (ECX bit 1).
    76	TEXT ·hasGCMAsm(SB),NOSPLIT,$0
    77		MOVL $1, AX		// select CPUID leaf 1
    78		CPUID
    79		MOVL CX, DX		// DX = copy of the ECX feature word
    80		SHRL $1, DX		// PCLMULQDQ flag -> bit 0 of DX
    81		SHRL $25, CX		// AES flag -> bit 0 of CX
    82		ANDL DX, CX		// bit 0 survives only when both flags are set
    83		ANDL $1, CX		// discard every unrelated feature bit
    84		// CX is now exactly 0 or 1: hand it back as the bool result
    85		MOVB CX, ret+0(FP)
    86		RET
    87	
    88	// func aesEncBlock(dst, src *[16]byte, ks []uint32)
    89	TEXT ·aesEncBlock(SB),NOSPLIT,$0	// dst = AES encryption of src under the expanded key schedule ks
    90		MOVQ dst+0(FP), DI
    91		MOVQ src+8(FP), SI
    92		MOVQ ks_base+16(FP), DX
    93		MOVQ ks_len+24(FP), CX
    94		// CX = len(ks)/4 - 1; compared against 12 below to choose the final round key (10, 12 or 14)
    95		SHRQ $2, CX
    96		DECQ CX
    97		// X0 = src XOR round key 0, followed by nine unconditional AESENC rounds
    98		MOVOU (SI), X0
    99		MOVOU (16*0)(DX), X1
   100		PXOR X1, X0
   101		MOVOU (16*1)(DX), X1
   102		AESENC X1, X0
   103		MOVOU (16*2)(DX), X1
   104		AESENC X1, X0
   105		MOVOU (16*3)(DX), X1
   106		AESENC X1, X0
   107		MOVOU (16*4)(DX), X1
   108		AESENC X1, X0
   109		MOVOU (16*5)(DX), X1
   110		AESENC X1, X0
   111		MOVOU (16*6)(DX), X1
   112		AESENC X1, X0
   113		MOVOU (16*7)(DX), X1
   114		AESENC X1, X0
   115		MOVOU (16*8)(DX), X1
   116		AESENC X1, X0
   117		MOVOU (16*9)(DX), X1
   118		AESENC X1, X0
   119		MOVOU (16*10)(DX), X1
   120		CMPQ CX, $12	// these flags feed the JB here and the JE further down; nothing in between rewrites them
   121		JB encLast	// CX < 12: round key 10 is the last one
   122		AESENC X1, X0
   123		MOVOU (16*11)(DX), X1
   124		AESENC X1, X0
   125		MOVOU (16*12)(DX), X1
   126		JE encLast	// CX == 12: round key 12 is the last one
   127		AESENC X1, X0
   128		MOVOU (16*13)(DX), X1
   129		AESENC X1, X0
   130		MOVOU (16*14)(DX), X1
   131		// on every path X1 holds the final round key here
   132	encLast:
   133		AESENCLAST X1, X0
   134		MOVOU X0, (DI)
   135	
   136		RET
   137	
   138	// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
   139	TEXT ·gcmAesFinish(SB),NOSPLIT,$0	// folds the two bit lengths into T, multiplies once more, XORs with tagMask, writes T
   140	#define pTbl DI
   141	#define tMsk SI
   142	#define tPtr DX
   143	#define plen AX
   144	#define dlen CX
   145	
   146		MOVQ productTable+0(FP), pTbl
   147		MOVQ tagMask+8(FP), tMsk
   148		MOVQ T+16(FP), tPtr
   149		MOVQ pLen+24(FP), plen
   150		MOVQ dLen+32(FP), dlen
   151		// ACC0 = hash state currently stored in T; T2 = tagMask, applied at the very end
   152		MOVOU (tPtr), ACC0
   153		MOVOU (tMsk), T2
   154	
   155		MOVOU bswapMask<>(SB), BSWAP
   156		MOVOU gcmPoly<>(SB), POLY
   157		// byte counts -> bit counts
   158		SHLQ $3, plen
   159		SHLQ $3, dlen
   160		// B0 = pLen*8 in the low quadword, dLen*8 in the high quadword
   161		MOVQ plen, B0
   162		PINSRQ $1, dlen, B0
   163	
   164		PXOR ACC0, B0
   165		// multiply B0 by the table entry at 16*14, using its companion value at 16*15 for the middle product
   166		MOVOU (16*14)(pTbl), ACC0
   167		MOVOU (16*15)(pTbl), ACCM
   168		MOVOU ACC0, ACC1
   169	
   170		PCLMULQDQ $0x00, B0, ACC0
   171		PCLMULQDQ $0x11, B0, ACC1
   172		PSHUFD $78, B0, T0
   173		PXOR B0, T0
   174		PCLMULQDQ $0x00, T0, ACCM
   175		// fold the middle product into the low (ACC0) and high (ACC1) halves
   176		PXOR ACC0, ACCM
   177		PXOR ACC1, ACCM
   178		MOVOU ACCM, T0
   179		PSRLDQ $8, ACCM
   180		PSLLDQ $8, T0
   181		PXOR ACCM, ACC1
   182		PXOR T0, ACC0
   183		// two reduction steps with POLY, then merge in the high half
   184		MOVOU POLY, T0
   185		PCLMULQDQ $0x01, ACC0, T0
   186		PSHUFD $78, ACC0, ACC0
   187		PXOR T0, ACC0
   188	
   189		MOVOU POLY, T0
   190		PCLMULQDQ $0x01, ACC0, T0
   191		PSHUFD $78, ACC0, ACC0
   192		PXOR T0, ACC0
   193	
   194		PXOR ACC1, ACC0
   195		// reverse the byte order, XOR with tagMask and store the result back into T
   196		PSHUFB BSWAP, ACC0
   197		PXOR T2, ACC0
   198		MOVOU ACC0, (tPtr)
   199	
   200		RET
   201	#undef pTbl
   202	#undef tMsk
   203	#undef tPtr
   204	#undef plen
   205	#undef dlen
   206	
   207	// func gcmAesInit(productTable *[256]byte, ks []uint32)
   208	TEXT ·gcmAesInit(SB),NOSPLIT,$0	// fills productTable with eight 32-byte entries derived from the hash key
   209	#define dst DI
   210	#define KS SI
   211	#define NR DX
   212	
   213		MOVQ productTable+0(FP), dst
   214		MOVQ ks_base+8(FP), KS
   215		MOVQ ks_len+16(FP), NR
   216		// NR = len(ks)/4 - 1; compared against 12 below to choose the final round key
   217		SHRQ $2, NR
   218		DECQ NR
   219	
   220		MOVOU bswapMask<>(SB), BSWAP
   221		MOVOU gcmPoly<>(SB), POLY
   222	
   223		// Encrypt block 0, with the AES key to generate the hash key H
   224		MOVOU (16*0)(KS), B0	// zero block XOR round key 0 is just round key 0
   225		MOVOU (16*1)(KS), T0
   226		AESENC T0, B0
   227		MOVOU (16*2)(KS), T0
   228		AESENC T0, B0
   229		MOVOU (16*3)(KS), T0
   230		AESENC T0, B0
   231		MOVOU (16*4)(KS), T0
   232		AESENC T0, B0
   233		MOVOU (16*5)(KS), T0
   234		AESENC T0, B0
   235		MOVOU (16*6)(KS), T0
   236		AESENC T0, B0
   237		MOVOU (16*7)(KS), T0
   238		AESENC T0, B0
   239		MOVOU (16*8)(KS), T0
   240		AESENC T0, B0
   241		MOVOU (16*9)(KS), T0
   242		AESENC T0, B0
   243		MOVOU (16*10)(KS), T0
   244		CMPQ NR, $12	// flags are reused by the JE below
   245		JB initEncLast
   246		AESENC T0, B0
   247		MOVOU (16*11)(KS), T0
   248		AESENC T0, B0
   249		MOVOU (16*12)(KS), T0
   250		JE initEncLast
   251		AESENC T0, B0
   252		MOVOU (16*13)(KS), T0
   253		AESENC T0, B0
   254		MOVOU (16*14)(KS), T0
   255	initEncLast:
   256		AESENCLAST T0, B0
   257	
   258		PSHUFB BSWAP, B0
   259		// H * 2
   260		PSHUFD $0xff, B0, T0	// broadcast the top dword so PSRAL turns its sign bit into an all-ones/all-zero mask
   261		MOVOU B0, T1
   262		PSRAL $31, T0
   263		PAND POLY, T0	// POLY is XORed in only when the top bit of B0 was set
   264		PSRLL $31, T1	// T1 = bit shifted out of each dword ...
   265		PSLLDQ $4, T1	// ... moved up into the next higher dword
   266		PSLLL $1, B0
   267		PXOR T0, B0
   268		PXOR T1, B0
   269		// Karatsuba pre-computations
   270		MOVOU B0, (16*14)(dst)
   271		PSHUFD $78, B0, B1	// swap the two quadwords; the XOR below leaves hi^lo in both halves
   272		PXOR B0, B1
   273		MOVOU B1, (16*15)(dst)
   274	
   275		MOVOU B0, B2
   276		MOVOU B1, B3
   277		// Now prepare powers of H and pre-computations for them
   278		MOVQ $7, AX
   279		// each pass multiplies the previous entry (B2, B3) by the first one (B0, B1) and stores the result one entry lower
   280	initLoop:
   281			MOVOU B2, T0
   282			MOVOU B2, T1
   283			MOVOU B3, T2
   284			PCLMULQDQ $0x00, B0, T0
   285			PCLMULQDQ $0x11, B0, T1
   286			PCLMULQDQ $0x00, B1, T2
   287	
   288			PXOR T0, T2
   289			PXOR T1, T2
   290			MOVOU T2, B4
   291			PSLLDQ $8, B4
   292			PSRLDQ $8, T2
   293			PXOR B4, T0
   294			PXOR T2, T1
   295	
   296			MOVOU POLY, B2
   297			PCLMULQDQ $0x01, T0, B2
   298			PSHUFD $78, T0, T0
   299			PXOR B2, T0
   300			MOVOU POLY, B2
   301			PCLMULQDQ $0x01, T0, B2
   302			PSHUFD $78, T0, T0
   303			PXOR T0, B2
   304			PXOR T1, B2
   305	
   306			MOVOU B2, (16*12)(dst)
   307			PSHUFD $78, B2, B3
   308			PXOR B2, B3
   309			MOVOU B3, (16*13)(dst)
   310			// LEAQ does not modify flags, so JNE still tests the result of DECQ
   311			DECQ AX
   312			LEAQ (-16*2)(dst), dst
   313		JNE initLoop
   314	
   315		RET
   316	#undef NR
   317	#undef KS
   318	#undef dst
   319	
   320	// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
   321	TEXT ·gcmAesData(SB),NOSPLIT,$0	// hashes data starting from a zero state and stores the result in T
   322	#define pTbl DI
   323	#define aut SI
   324	#define tPtr CX
   325	#define autLen DX
   326	
   327		MOVQ productTable+0(FP), pTbl
   328		MOVQ data_base+8(FP), aut
   329		MOVQ data_len+16(FP), autLen
   330		MOVQ T+32(FP), tPtr
   331		// the hash state starts at zero; the previous contents of T are not read
   332		PXOR ACC0, ACC0
   333		MOVOU bswapMask<>(SB), BSWAP
   334		MOVOU gcmPoly<>(SB), POLY
   335		// T1/T2 = multiplier entry at 16*14 and its companion at 16*15, kept for the whole function
   336		MOVOU (16*14)(pTbl), T1
   337		MOVOU (16*15)(pTbl), T2
   338	
   339		TESTQ autLen, autLen
   340		JEQ dataBail
   341	
   342		CMPQ autLen, $13	// optimize the TLS case
   343		JNE dataSinglesLoop
   344		// exactly 13 bytes: assemble them with 8+4+1 byte loads; bytes 13-15 of B0 stay zero
   345		PXOR B0, B0
   346		MOVQ (aut), B0
   347		PINSRD $2, 8(aut), B0
   348		PINSRB $12, 12(aut), B0
   349		XORQ autLen, autLen
   350		JMP dataMul
   351	
   352	dataSinglesLoop:
   353			// one full 16-byte block per iteration
   354			CMPQ autLen, $16
   355			JB dataEnd
   356			SUBQ $16, autLen
   357	
   358			MOVOU (aut), B0
   359	dataMul:
   360			PSHUFB BSWAP, B0
   361			PXOR ACC0, B0
   362			// B0 = state XOR block; multiply it by the entry held in T1/T2
   363			MOVOU T1, ACC0
   364			MOVOU T2, ACCM
   365			MOVOU T1, ACC1
   366	
   367			PSHUFD $78, B0, T0
   368			PXOR B0, T0
   369			PCLMULQDQ $0x00, B0, ACC0
   370			PCLMULQDQ $0x11, B0, ACC1
   371			PCLMULQDQ $0x00, T0, ACCM
   372	
   373			PXOR ACC0, ACCM
   374			PXOR ACC1, ACCM
   375			MOVOU ACCM, T0
   376			PSRLDQ $8, ACCM
   377			PSLLDQ $8, T0
   378			PXOR ACCM, ACC1
   379			PXOR T0, ACC0
   380	
   381			MOVOU POLY, T0
   382			PCLMULQDQ $0x01, ACC0, T0
   383			PSHUFD $78, ACC0, ACC0
   384			PXOR T0, ACC0
   385	
   386			MOVOU POLY, T0
   387			PCLMULQDQ $0x01, ACC0, T0
   388			PSHUFD $78, ACC0, ACC0
   389			PXOR T0, ACC0
   390			PXOR ACC1, ACC0
   391	
   392			LEAQ 16(aut), aut
   393	
   394		JMP dataSinglesLoop
   395	
   396	dataEnd:
   397		// fewer than 16 bytes are left (possibly none)
   398		TESTQ autLen, autLen
   399		JEQ dataBail
   400		// build a zero-padded block: start at the last byte and shift earlier bytes in below it
   401		PXOR B0, B0
   402		LEAQ -1(aut)(autLen*1), aut
   403	
   404	dataLoadLoop:
   405	
   406			PSLLDQ $1, B0
   407			PINSRB $0, (aut), B0
   408			// LEAQ keeps the flags, so JNE tests the DECQ result
   409			LEAQ -1(aut), aut
   410			DECQ autLen
   411			JNE dataLoadLoop
   412		// autLen is 0 now, so after dataMul the loop drops through dataEnd to dataBail
   413		JMP dataMul
   414	
   415	dataBail:
   416		MOVOU ACC0, (tPtr)
   417		RET
   418	#undef pTbl
   419	#undef aut
   420	#undef tPtr
   421	#undef autLen
   422	
   423	// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   424	TEXT ·gcmAesEnc(SB),0,$256-96	// CTR-encrypts src into dst and folds the ciphertext into the hash state in T
   425	#define pTbl DI
   426	#define ctx DX
   427	#define ctrPtr CX
   428	#define ptx SI
   429	#define ks AX
   430	#define tPtr R8
   431	#define ptxLen R9
   432	#define aluCTR R10
   433	#define aluTMP R11
   434	#define aluK R12
   435	#define NR R13
   436	// Frame: slots 0-7 (16 bytes each) hold byte-swapped ciphertext awaiting hashing; slots 8-15 hold counter blocks already XORed with round key 0. increment(i) bumps the counter and patches the last dword of counter slot i.
   437	#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   438	#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   439	#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   440	#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
   441	#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   442	#define combinedRound(i) \
   443		MOVOU (16*i)(ks), T0;\
   444		AESENC T0, B0;\
   445		AESENC T0, B1;\
   446		AESENC T0, B2;\
   447		AESENC T0, B3;\
   448		 MOVOU (16*(i*2))(pTbl), T1;\
   449		 MOVOU T1, T2;\
   450		AESENC T0, B4;\
   451		AESENC T0, B5;\
   452		AESENC T0, B6;\
   453		AESENC T0, B7;\
   454		 MOVOU (16*i)(SP), T0;\
   455		 PCLMULQDQ $0x00, T0, T1;\
   456		 PXOR T1, ACC0;\
   457		 PSHUFD $78, T0, T1;\
   458		 PCLMULQDQ $0x11, T0, T2;\
   459		 PXOR T1, T0;\
   460		 PXOR T2, ACC1;\
   461		 MOVOU (16*(i*2+1))(pTbl), T2;\
   462		 PCLMULQDQ $0x00, T2, T0;\
   463		 PXOR T0, ACCM
   464	#define mulRound(i) \
   465		MOVOU (16*i)(SP), T0;\
   466		MOVOU (16*(i*2))(pTbl), T1;\
   467		MOVOU T1, T2;\
   468		PCLMULQDQ $0x00, T0, T1;\
   469		PXOR T1, ACC0;\
   470		PCLMULQDQ $0x11, T0, T2;\
   471		PXOR T2, ACC1;\
   472		PSHUFD $78, T0, T1;\
   473		PXOR T1, T0;\
   474		MOVOU (16*(i*2+1))(pTbl), T1;\
   475		PCLMULQDQ $0x00, T0, T1;\
   476		PXOR T1, ACCM
   477		// ctx = dst base pointer, ptx = src base pointer, ptxLen = len(src)
   478		MOVQ productTable+0(FP), pTbl
   479		MOVQ dst+8(FP), ctx
   480		MOVQ src_base+32(FP), ptx
   481		MOVQ src_len+40(FP), ptxLen
   482		MOVQ ctr+56(FP), ctrPtr
   483		MOVQ T+64(FP), tPtr
   484		MOVQ ks_base+72(FP), ks
   485		MOVQ ks_len+80(FP), NR
   486		// NR = len(ks)/4 - 1; compared against 12 wherever the final round key is chosen
   487		SHRQ $2, NR
   488		DECQ NR
   489	
   490		MOVOU bswapMask<>(SB), BSWAP
   491		MOVOU gcmPoly<>(SB), POLY
   492		// ACC0 = hash state from T; aluCTR / aluK = byte-swapped last dwords of the counter block and of round key 0
   493		MOVOU (tPtr), ACC0
   494		PXOR ACC1, ACC1
   495		PXOR ACCM, ACCM
   496		MOVOU (ctrPtr), B0
   497		MOVL (3*4)(ctrPtr), aluCTR
   498		MOVOU (ks), T0
   499		MOVL (3*4)(ks), aluK
   500		BSWAPL aluCTR
   501		BSWAPL aluK
   502		// counter slot 0 = ctr XOR round key 0; increment(0) then advances its counter word, so the first block uses ctr+1
   503		PXOR B0, T0
   504		MOVOU T0, (8*16 + 0*16)(SP)
   505		increment(0)
   506	
   507		CMPQ ptxLen, $128
   508		JB gcmAesEncSingles
   509		SUBQ $128, ptxLen
   510	
   511		// We have at least 8 blocks to encrypt, prepare the rest of the counters
   512		MOVOU T0, (8*16 + 1*16)(SP)
   513		increment(1)
   514		MOVOU T0, (8*16 + 2*16)(SP)
   515		increment(2)
   516		MOVOU T0, (8*16 + 3*16)(SP)
   517		increment(3)
   518		MOVOU T0, (8*16 + 4*16)(SP)
   519		increment(4)
   520		MOVOU T0, (8*16 + 5*16)(SP)
   521		increment(5)
   522		MOVOU T0, (8*16 + 6*16)(SP)
   523		increment(6)
   524		MOVOU T0, (8*16 + 7*16)(SP)
   525		increment(7)
   526		// load the eight prepared counter blocks (round key 0 is already applied)
   527		MOVOU (8*16 + 0*16)(SP), B0
   528		MOVOU (8*16 + 1*16)(SP), B1
   529		MOVOU (8*16 + 2*16)(SP), B2
   530		MOVOU (8*16 + 3*16)(SP), B3
   531		MOVOU (8*16 + 4*16)(SP), B4
   532		MOVOU (8*16 + 5*16)(SP), B5
   533		MOVOU (8*16 + 6*16)(SP), B6
   534		MOVOU (8*16 + 7*16)(SP), B7
   535		// AES rounds 1-9 on all eight blocks, interleaved with preparing the next eight counter slots
   536		aesRound(1)
   537		increment(0)
   538		aesRound(2)
   539		increment(1)
   540		aesRound(3)
   541		increment(2)
   542		aesRound(4)
   543		increment(3)
   544		aesRound(5)
   545		increment(4)
   546		aesRound(6)
   547		increment(5)
   548		aesRound(7)
   549		increment(6)
   550		aesRound(8)
   551		increment(7)
   552		aesRound(9)
   553		MOVOU (16*10)(ks), T0
   554		CMPQ NR, $12	// flags are reused by the JE below
   555		JB encLast1
   556		aesRnd(T0)
   557		aesRound(11)
   558		MOVOU (16*12)(ks), T0
   559		JE encLast1
   560		aesRnd(T0)
   561		aesRound(13)
   562		MOVOU (16*14)(ks), T0
   563	encLast1:
   564		aesRndLast(T0)
   565		// XOR the key stream with eight plaintext blocks
   566		MOVOU (16*0)(ptx), T0
   567		PXOR T0, B0
   568		MOVOU (16*1)(ptx), T0
   569		PXOR T0, B1
   570		MOVOU (16*2)(ptx), T0
   571		PXOR T0, B2
   572		MOVOU (16*3)(ptx), T0
   573		PXOR T0, B3
   574		MOVOU (16*4)(ptx), T0
   575		PXOR T0, B4
   576		MOVOU (16*5)(ptx), T0
   577		PXOR T0, B5
   578		MOVOU (16*6)(ptx), T0
   579		PXOR T0, B6
   580		MOVOU (16*7)(ptx), T0
   581		PXOR T0, B7
   582		// store the ciphertext, then byte-swap each block for hashing; the current hash state is folded into block 0
   583		MOVOU B0, (16*0)(ctx)
   584		PSHUFB BSWAP, B0
   585		PXOR ACC0, B0
   586		MOVOU B1, (16*1)(ctx)
   587		PSHUFB BSWAP, B1
   588		MOVOU B2, (16*2)(ctx)
   589		PSHUFB BSWAP, B2
   590		MOVOU B3, (16*3)(ctx)
   591		PSHUFB BSWAP, B3
   592		MOVOU B4, (16*4)(ctx)
   593		PSHUFB BSWAP, B4
   594		MOVOU B5, (16*5)(ctx)
   595		PSHUFB BSWAP, B5
   596		MOVOU B6, (16*6)(ctx)
   597		PSHUFB BSWAP, B6
   598		MOVOU B7, (16*7)(ctx)
   599		PSHUFB BSWAP, B7
   600		// park the swapped ciphertext in slots 0-7; it is hashed during the next pass (or at gcmAesEncOctetsEnd)
   601		MOVOU B0, (16*0)(SP)
   602		MOVOU B1, (16*1)(SP)
   603		MOVOU B2, (16*2)(SP)
   604		MOVOU B3, (16*3)(SP)
   605		MOVOU B4, (16*4)(SP)
   606		MOVOU B5, (16*5)(SP)
   607		MOVOU B6, (16*6)(SP)
   608		MOVOU B7, (16*7)(SP)
   609	
   610		LEAQ 128(ptx), ptx
   611		LEAQ 128(ctx), ctx
   612	
   613	gcmAesEncOctetsLoop:
   614			// each pass encrypts eight new blocks while hashing the eight produced by the previous pass
   615			CMPQ ptxLen, $128
   616			JB gcmAesEncOctetsEnd
   617			SUBQ $128, ptxLen
   618	
   619			MOVOU (8*16 + 0*16)(SP), B0
   620			MOVOU (8*16 + 1*16)(SP), B1
   621			MOVOU (8*16 + 2*16)(SP), B2
   622			MOVOU (8*16 + 3*16)(SP), B3
   623			MOVOU (8*16 + 4*16)(SP), B4
   624			MOVOU (8*16 + 5*16)(SP), B5
   625			MOVOU (8*16 + 6*16)(SP), B6
   626			MOVOU (8*16 + 7*16)(SP), B7
   627			// hashing starts with saved block 0 times table entries 0 and 1; this overwrites ACC0/ACC1/ACCM
   628			MOVOU (16*0)(SP), T0
   629			PSHUFD $78, T0, T1
   630			PXOR T0, T1
   631	
   632			MOVOU (16*0)(pTbl), ACC0
   633			MOVOU (16*1)(pTbl), ACCM
   634			MOVOU ACC0, ACC1
   635	
   636			PCLMULQDQ $0x00, T1, ACCM
   637			PCLMULQDQ $0x00, T0, ACC0
   638			PCLMULQDQ $0x11, T0, ACC1
   639			// combinedRound(i): AES round i on B0-B7 plus the multiply-accumulate of saved block i
   640			combinedRound(1)
   641			increment(0)
   642			combinedRound(2)
   643			increment(1)
   644			combinedRound(3)
   645			increment(2)
   646			combinedRound(4)
   647			increment(3)
   648			combinedRound(5)
   649			increment(4)
   650			combinedRound(6)
   651			increment(5)
   652			combinedRound(7)
   653			increment(6)
   654	
   655			aesRound(8)
   656			increment(7)
   657			// fold the middle products into ACC0/ACC1, then reduce (interleaved with AES round 9)
   658			PXOR ACC0, ACCM
   659			PXOR ACC1, ACCM
   660			MOVOU ACCM, T0
   661			PSRLDQ $8, ACCM
   662			PSLLDQ $8, T0
   663			PXOR ACCM, ACC1
   664			PXOR T0, ACC0
   665	
   666			reduceRound(ACC0)
   667			aesRound(9)
   668	
   669			reduceRound(ACC0)
   670			PXOR ACC1, ACC0
   671	
   672			MOVOU (16*10)(ks), T0
   673			CMPQ NR, $12	// flags are reused by the JE below
   674			JB encLast2
   675			aesRnd(T0)
   676			aesRound(11)
   677			MOVOU (16*12)(ks), T0
   678			JE encLast2
   679			aesRnd(T0)
   680			aesRound(13)
   681			MOVOU (16*14)(ks), T0
   682	encLast2:
   683			aesRndLast(T0)
   684	
   685			MOVOU (16*0)(ptx), T0
   686			PXOR T0, B0
   687			MOVOU (16*1)(ptx), T0
   688			PXOR T0, B1
   689			MOVOU (16*2)(ptx), T0
   690			PXOR T0, B2
   691			MOVOU (16*3)(ptx), T0
   692			PXOR T0, B3
   693			MOVOU (16*4)(ptx), T0
   694			PXOR T0, B4
   695			MOVOU (16*5)(ptx), T0
   696			PXOR T0, B5
   697			MOVOU (16*6)(ptx), T0
   698			PXOR T0, B6
   699			MOVOU (16*7)(ptx), T0
   700			PXOR T0, B7
   701	
   702			MOVOU B0, (16*0)(ctx)
   703			PSHUFB BSWAP, B0
   704			PXOR ACC0, B0
   705			MOVOU B1, (16*1)(ctx)
   706			PSHUFB BSWAP, B1
   707			MOVOU B2, (16*2)(ctx)
   708			PSHUFB BSWAP, B2
   709			MOVOU B3, (16*3)(ctx)
   710			PSHUFB BSWAP, B3
   711			MOVOU B4, (16*4)(ctx)
   712			PSHUFB BSWAP, B4
   713			MOVOU B5, (16*5)(ctx)
   714			PSHUFB BSWAP, B5
   715			MOVOU B6, (16*6)(ctx)
   716			PSHUFB BSWAP, B6
   717			MOVOU B7, (16*7)(ctx)
   718			PSHUFB BSWAP, B7
   719	
   720			MOVOU B0, (16*0)(SP)
   721			MOVOU B1, (16*1)(SP)
   722			MOVOU B2, (16*2)(SP)
   723			MOVOU B3, (16*3)(SP)
   724			MOVOU B4, (16*4)(SP)
   725			MOVOU B5, (16*5)(SP)
   726			MOVOU B6, (16*6)(SP)
   727			MOVOU B7, (16*7)(SP)
   728	
   729			LEAQ 128(ptx), ptx
   730			LEAQ 128(ctx), ctx
   731	
   732			JMP gcmAesEncOctetsLoop
   733	
   734	gcmAesEncOctetsEnd:
   735		// hash the eight ciphertext blocks still parked in slots 0-7
   736		MOVOU (16*0)(SP), T0
   737		MOVOU (16*0)(pTbl), ACC0
   738		MOVOU (16*1)(pTbl), ACCM
   739		MOVOU ACC0, ACC1
   740		PSHUFD $78, T0, T1
   741		PXOR T0, T1
   742		PCLMULQDQ $0x00, T0, ACC0
   743		PCLMULQDQ $0x11, T0, ACC1
   744		PCLMULQDQ $0x00, T1, ACCM
   745	
   746		mulRound(1)
   747		mulRound(2)
   748		mulRound(3)
   749		mulRound(4)
   750		mulRound(5)
   751		mulRound(6)
   752		mulRound(7)
   753	
   754		PXOR ACC0, ACCM
   755		PXOR ACC1, ACCM
   756		MOVOU ACCM, T0
   757		PSRLDQ $8, ACCM
   758		PSLLDQ $8, T0
   759		PXOR ACCM, ACC1
   760		PXOR T0, ACC0
   761	
   762		reduceRound(ACC0)
   763		reduceRound(ACC0)
   764		PXOR ACC1, ACC0
   765	
   766		TESTQ ptxLen, ptxLen
   767		JE gcmAesEncDone
   768		// aluCTR is 7 ahead of the counter stored in slot 8; rewind so the next increment(0) continues right after it
   769		SUBQ $7, aluCTR
   770	
   771	gcmAesEncSingles:
   772		// B1-B7 cache round keys 1-7 and T2 the table entry at 16*14 for the one-block-at-a-time paths
   773		MOVOU (16*1)(ks), B1
   774		MOVOU (16*2)(ks), B2
   775		MOVOU (16*3)(ks), B3
   776		MOVOU (16*4)(ks), B4
   777		MOVOU (16*5)(ks), B5
   778		MOVOU (16*6)(ks), B6
   779		MOVOU (16*7)(ks), B7
   780	
   781		MOVOU (16*14)(pTbl), T2
   782	
   783	gcmAesEncSinglesLoop:
   784	
   785			CMPQ ptxLen, $16
   786			JB gcmAesEncTail
   787			SUBQ $16, ptxLen
   788			// take the prepared counter block and immediately prepare the following one in the same slot
   789			MOVOU (8*16 + 0*16)(SP), B0
   790			increment(0)
   791	
   792			AESENC B1, B0
   793			AESENC B2, B0
   794			AESENC B3, B0
   795			AESENC B4, B0
   796			AESENC B5, B0
   797			AESENC B6, B0
   798			AESENC B7, B0
   799			MOVOU (16*8)(ks), T0
   800			AESENC T0, B0
   801			MOVOU (16*9)(ks), T0
   802			AESENC T0, B0
   803			MOVOU (16*10)(ks), T0
   804			CMPQ NR, $12	// flags are reused by the JE below
   805			JB encLast3
   806			AESENC T0, B0
   807			MOVOU (16*11)(ks), T0
   808			AESENC T0, B0
   809			MOVOU (16*12)(ks), T0
   810			JE encLast3
   811			AESENC T0, B0
   812			MOVOU (16*13)(ks), T0
   813			AESENC T0, B0
   814			MOVOU (16*14)(ks), T0
   815	encLast3:
   816			AESENCLAST T0, B0
   817	
   818			MOVOU (ptx), T0
   819			PXOR T0, B0
   820			MOVOU B0, (ctx)
   821	
   822			PSHUFB BSWAP, B0
   823			PXOR ACC0, B0
   824			// multiply (state XOR ciphertext block) by the entry in T2, companion value at 16*15
   825			MOVOU T2, ACC0
   826			MOVOU T2, ACC1
   827			MOVOU (16*15)(pTbl), ACCM
   828	
   829			PSHUFD $78, B0, T0
   830			PXOR B0, T0
   831			PCLMULQDQ $0x00, B0, ACC0
   832			PCLMULQDQ $0x11, B0, ACC1
   833			PCLMULQDQ $0x00, T0, ACCM
   834	
   835			PXOR ACC0, ACCM
   836			PXOR ACC1, ACCM
   837			MOVOU ACCM, T0
   838			PSRLDQ $8, ACCM
   839			PSLLDQ $8, T0
   840			PXOR ACCM, ACC1
   841			PXOR T0, ACC0
   842	
   843			reduceRound(ACC0)
   844			reduceRound(ACC0)
   845			PXOR ACC1, ACC0
   846	
   847			LEAQ (16*1)(ptx), ptx
   848			LEAQ (16*1)(ctx), ctx
   849	
   850		JMP gcmAesEncSinglesLoop
   851	
   852	gcmAesEncTail:
   853		TESTQ ptxLen, ptxLen
   854		JE gcmAesEncDone
   855		// 1-15 bytes remain: produce one more key stream block from the prepared counter slot
   856		MOVOU (8*16 + 0*16)(SP), B0
   857		AESENC B1, B0
   858		AESENC B2, B0
   859		AESENC B3, B0
   860		AESENC B4, B0
   861		AESENC B5, B0
   862		AESENC B6, B0
   863		AESENC B7, B0
   864		MOVOU (16*8)(ks), T0
   865		AESENC T0, B0
   866		MOVOU (16*9)(ks), T0
   867		AESENC T0, B0
   868		MOVOU (16*10)(ks), T0
   869		CMPQ NR, $12	// flags are reused by the JE below
   870		JB encLast4
   871		AESENC T0, B0
   872		MOVOU (16*11)(ks), T0
   873		AESENC T0, B0
   874		MOVOU (16*12)(ks), T0
   875		JE encLast4
   876		AESENC T0, B0
   877		MOVOU (16*13)(ks), T0
   878		AESENC T0, B0
   879		MOVOU (16*14)(ks), T0
   880	encLast4:
   881		AESENCLAST T0, B0
   882		MOVOU B0, T0
   883		// T0 = key stream block; point ptx at the last remaining plaintext byte
   884		LEAQ -1(ptx)(ptxLen*1), ptx
   885		// T1 = andMask entry at 16*(ptxLen-1), which keeps the low ptxLen bytes (aluCTR is reused as the table base)
   886		MOVQ ptxLen, aluTMP
   887		SHLQ $4, aluTMP
   888	
   889		LEAQ andMask<>(SB), aluCTR
   890		MOVOU -16(aluCTR)(aluTMP*1), T1
   891		// gather the remaining bytes into B0 one at a time, last byte first, without reading past the end of src
   892		PXOR B0, B0
   893	ptxLoadLoop:
   894			PSLLDQ $1, B0
   895			PINSRB $0, (ptx), B0
   896			LEAQ -1(ptx), ptx
   897			DECQ ptxLen
   898		JNE ptxLoadLoop
   899		// encrypt, then clear the bytes beyond the message so they do not enter the hash
   900		PXOR T0, B0
   901		PAND T1, B0
   902		MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   903	
   904		PSHUFB BSWAP, B0
   905		PXOR ACC0, B0
   906	
   907		MOVOU T2, ACC0
   908		MOVOU T2, ACC1
   909		MOVOU (16*15)(pTbl), ACCM
   910	
   911		PSHUFD $78, B0, T0
   912		PXOR B0, T0
   913		PCLMULQDQ $0x00, B0, ACC0
   914		PCLMULQDQ $0x11, B0, ACC1
   915		PCLMULQDQ $0x00, T0, ACCM
   916	
   917		PXOR ACC0, ACCM
   918		PXOR ACC1, ACCM
   919		MOVOU ACCM, T0
   920		PSRLDQ $8, ACCM
   921		PSLLDQ $8, T0
   922		PXOR ACCM, ACC1
   923		PXOR T0, ACC0
   924	
   925		reduceRound(ACC0)
   926		reduceRound(ACC0)
   927		PXOR ACC1, ACC0
   928	
   929	gcmAesEncDone:
   930		MOVOU ACC0, (tPtr)	// write the updated hash state back to T
   931		RET
   932	#undef increment
   933	
   934	// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   935	TEXT ·gcmAesDec(SB),0,$128-96	// CTR-decrypts src into dst while hashing the ciphertext into T; reuses the register aliases and aesRound/aesRnd/aesRndLast/reduceRound macros still defined from gcmAesEnc
   936	#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
   937	#define combinedDecRound(i) \
   938		MOVOU (16*i)(ks), T0;\
   939		AESENC T0, B0;\
   940		AESENC T0, B1;\
   941		AESENC T0, B2;\
   942		AESENC T0, B3;\
   943		MOVOU (16*(i*2))(pTbl), T1;\
   944		MOVOU T1, T2;\
   945		AESENC T0, B4;\
   946		AESENC T0, B5;\
   947		AESENC T0, B6;\
   948		AESENC T0, B7;\
   949		MOVOU (16*i)(ctx), T0;\
   950		PSHUFB BSWAP, T0;\
   951		PCLMULQDQ $0x00, T0, T1;\
   952		PXOR T1, ACC0;\
   953		PSHUFD $78, T0, T1;\
   954		PCLMULQDQ $0x11, T0, T2;\
   955		PXOR T1, T0;\
   956		PXOR T2, ACC1;\
   957		MOVOU (16*(i*2+1))(pTbl), T2;\
   958		PCLMULQDQ $0x00, T2, T0;\
   959		PXOR T0, ACCM
   960		// here ptx is the output (dst) pointer and ctx the input (src) pointer; the frame holds only the eight counter slots
   961		MOVQ productTable+0(FP), pTbl
   962		MOVQ dst+8(FP), ptx
   963		MOVQ src_base+32(FP), ctx
   964		MOVQ src_len+40(FP), ptxLen
   965		MOVQ ctr+56(FP), ctrPtr
   966		MOVQ T+64(FP), tPtr
   967		MOVQ ks_base+72(FP), ks
   968		MOVQ ks_len+80(FP), NR
   969		// NR = len(ks)/4 - 1; compared against 12 wherever the final round key is chosen
   970		SHRQ $2, NR
   971		DECQ NR
   972	
   973		MOVOU bswapMask<>(SB), BSWAP
   974		MOVOU gcmPoly<>(SB), POLY
   975		// ACC0 = hash state from T; aluCTR / aluK = byte-swapped last dwords of the counter block and of round key 0
   976		MOVOU (tPtr), ACC0
   977		PXOR ACC1, ACC1
   978		PXOR ACCM, ACCM
   979		MOVOU (ctrPtr), B0
   980		MOVL (3*4)(ctrPtr), aluCTR
   981		MOVOU (ks), T0
   982		MOVL (3*4)(ks), aluK
   983		BSWAPL aluCTR
   984		BSWAPL aluK
   985		// slot 0 = ctr XOR round key 0; increment(0) then advances its counter word, so the first block uses ctr+1
   986		PXOR B0, T0
   987		MOVOU T0, (0*16)(SP)
   988		increment(0)
   989	
   990		CMPQ ptxLen, $128
   991		JB gcmAesDecSingles
   992		// at least eight blocks: prepare counter slots 1-7 as well (ptxLen is reduced inside the loop)
   993		MOVOU T0, (1*16)(SP)
   994		increment(1)
   995		MOVOU T0, (2*16)(SP)
   996		increment(2)
   997		MOVOU T0, (3*16)(SP)
   998		increment(3)
   999		MOVOU T0, (4*16)(SP)
  1000		increment(4)
  1001		MOVOU T0, (5*16)(SP)
  1002		increment(5)
  1003		MOVOU T0, (6*16)(SP)
  1004		increment(6)
  1005		MOVOU T0, (7*16)(SP)
  1006		increment(7)
  1007	
  1008	gcmAesDecOctetsLoop:
  1009			// each pass hashes and decrypts the same eight ciphertext blocks, read straight from ctx
  1010			CMPQ ptxLen, $128
  1011			JB gcmAesDecEndOctets
  1012			SUBQ $128, ptxLen
  1013	
  1014			MOVOU (0*16)(SP), B0
  1015			MOVOU (1*16)(SP), B1
  1016			MOVOU (2*16)(SP), B2
  1017			MOVOU (3*16)(SP), B3
  1018			MOVOU (4*16)(SP), B4
  1019			MOVOU (5*16)(SP), B5
  1020			MOVOU (6*16)(SP), B6
  1021			MOVOU (7*16)(SP), B7
  1022			// ciphertext block 0, byte-swapped and XORed with the hash state, times table entries 0 and 1
  1023			MOVOU (16*0)(ctx), T0
  1024			PSHUFB BSWAP, T0
  1025			PXOR ACC0, T0
  1026			PSHUFD $78, T0, T1
  1027			PXOR T0, T1
  1028	
  1029			MOVOU (16*0)(pTbl), ACC0
  1030			MOVOU (16*1)(pTbl), ACCM
  1031			MOVOU ACC0, ACC1
  1032	
  1033			PCLMULQDQ $0x00, T1, ACCM
  1034			PCLMULQDQ $0x00, T0, ACC0
  1035			PCLMULQDQ $0x11, T0, ACC1
  1036			// combinedDecRound(i): AES round i on B0-B7 plus the multiply-accumulate of ciphertext block i
  1037			combinedDecRound(1)
  1038			increment(0)
  1039			combinedDecRound(2)
  1040			increment(1)
  1041			combinedDecRound(3)
  1042			increment(2)
  1043			combinedDecRound(4)
  1044			increment(3)
  1045			combinedDecRound(5)
  1046			increment(4)
  1047			combinedDecRound(6)
  1048			increment(5)
  1049			combinedDecRound(7)
  1050			increment(6)
  1051	
  1052			aesRound(8)
  1053			increment(7)
  1054			// fold the middle products into ACC0/ACC1, then reduce (interleaved with AES round 9)
  1055			PXOR ACC0, ACCM
  1056			PXOR ACC1, ACCM
  1057			MOVOU ACCM, T0
  1058			PSRLDQ $8, ACCM
  1059			PSLLDQ $8, T0
  1060			PXOR ACCM, ACC1
  1061			PXOR T0, ACC0
  1062	
  1063			reduceRound(ACC0)
  1064			aesRound(9)
  1065	
  1066			reduceRound(ACC0)
  1067			PXOR ACC1, ACC0
  1068	
  1069			MOVOU (16*10)(ks), T0
  1070			CMPQ NR, $12	// flags are reused by the JE below
  1071			JB decLast1
  1072			aesRnd(T0)
  1073			aesRound(11)
  1074			MOVOU (16*12)(ks), T0
  1075			JE decLast1
  1076			aesRnd(T0)
  1077			aesRound(13)
  1078			MOVOU (16*14)(ks), T0
  1079	decLast1:
  1080			aesRndLast(T0)
  1081			// XOR the key stream with the ciphertext to recover eight plaintext blocks
  1082			MOVOU (16*0)(ctx), T0
  1083			PXOR T0, B0
  1084			MOVOU (16*1)(ctx), T0
  1085			PXOR T0, B1
  1086			MOVOU (16*2)(ctx), T0
  1087			PXOR T0, B2
  1088			MOVOU (16*3)(ctx), T0
  1089			PXOR T0, B3
  1090			MOVOU (16*4)(ctx), T0
  1091			PXOR T0, B4
  1092			MOVOU (16*5)(ctx), T0
  1093			PXOR T0, B5
  1094			MOVOU (16*6)(ctx), T0
  1095			PXOR T0, B6
  1096			MOVOU (16*7)(ctx), T0
  1097			PXOR T0, B7
  1098	
  1099			MOVOU B0, (16*0)(ptx)
  1100			MOVOU B1, (16*1)(ptx)
  1101			MOVOU B2, (16*2)(ptx)
  1102			MOVOU B3, (16*3)(ptx)
  1103			MOVOU B4, (16*4)(ptx)
  1104			MOVOU B5, (16*5)(ptx)
  1105			MOVOU B6, (16*6)(ptx)
  1106			MOVOU B7, (16*7)(ptx)
  1107	
  1108			LEAQ 128(ptx), ptx
  1109			LEAQ 128(ctx), ctx
  1110	
  1111			JMP gcmAesDecOctetsLoop
  1112	
  1113	gcmAesDecEndOctets:
  1114		// aluCTR is 7 ahead of the counter stored in slot 0; rewind so the next increment(0) continues right after it
  1115		SUBQ $7, aluCTR
  1116	
  1117	gcmAesDecSingles:
  1118		// B1-B7 cache round keys 1-7 and T2 the table entry at 16*14 for the one-block-at-a-time paths
  1119		MOVOU (16*1)(ks), B1
  1120		MOVOU (16*2)(ks), B2
  1121		MOVOU (16*3)(ks), B3
  1122		MOVOU (16*4)(ks), B4
  1123		MOVOU (16*5)(ks), B5
  1124		MOVOU (16*6)(ks), B6
  1125		MOVOU (16*7)(ks), B7
  1126	
  1127		MOVOU (16*14)(pTbl), T2
  1128	
  1129	gcmAesDecSinglesLoop:
  1130	
  1131			CMPQ ptxLen, $16
  1132			JB gcmAesDecTail
  1133			SUBQ $16, ptxLen
  1134			// T1 keeps the raw ciphertext block for the final XOR; B0 is the byte-swapped copy that gets hashed
  1135			MOVOU (ctx), B0
  1136			MOVOU B0, T1
  1137			PSHUFB BSWAP, B0
  1138			PXOR ACC0, B0
  1139	
  1140			MOVOU T2, ACC0
  1141			MOVOU T2, ACC1
  1142			MOVOU (16*15)(pTbl), ACCM
  1143	
  1144			PCLMULQDQ $0x00, B0, ACC0
  1145			PCLMULQDQ $0x11, B0, ACC1
  1146			PSHUFD $78, B0, T0
  1147			PXOR B0, T0
  1148			PCLMULQDQ $0x00, T0, ACCM
  1149	
  1150			PXOR ACC0, ACCM
  1151			PXOR ACC1, ACCM
  1152			MOVOU ACCM, T0
  1153			PSRLDQ $8, ACCM
  1154			PSLLDQ $8, T0
  1155			PXOR ACCM, ACC1
  1156			PXOR T0, ACC0
  1157	
  1158			reduceRound(ACC0)
  1159			reduceRound(ACC0)
  1160			PXOR ACC1, ACC0
  1161			// take the prepared counter block and immediately prepare the following one in the same slot
  1162			MOVOU (0*16)(SP), B0
  1163			increment(0)
  1164			AESENC B1, B0
  1165			AESENC B2, B0
  1166			AESENC B3, B0
  1167			AESENC B4, B0
  1168			AESENC B5, B0
  1169			AESENC B6, B0
  1170			AESENC B7, B0
  1171			MOVOU (16*8)(ks), T0
  1172			AESENC T0, B0
  1173			MOVOU (16*9)(ks), T0
  1174			AESENC T0, B0
  1175			MOVOU (16*10)(ks), T0
  1176			CMPQ NR, $12	// flags are reused by the JE below
  1177			JB decLast2
  1178			AESENC T0, B0
  1179			MOVOU (16*11)(ks), T0
  1180			AESENC T0, B0
  1181			MOVOU (16*12)(ks), T0
  1182			JE decLast2
  1183			AESENC T0, B0
  1184			MOVOU (16*13)(ks), T0
  1185			AESENC T0, B0
  1186			MOVOU (16*14)(ks), T0
  1187	decLast2:
  1188			AESENCLAST T0, B0
  1189	
  1190			PXOR T1, B0
  1191			MOVOU B0, (ptx)
  1192	
  1193			LEAQ (16*1)(ptx), ptx
  1194			LEAQ (16*1)(ctx), ctx
  1195	
  1196		JMP gcmAesDecSinglesLoop
  1197	
  1198	gcmAesDecTail:
  1199	
  1200		TESTQ ptxLen, ptxLen
  1201		JE gcmAesDecDone
  1202		// 1-15 bytes remain: T1 = andMask entry at 16*(ptxLen-1), which keeps the low ptxLen bytes (aluCTR is reused as the table base)
  1203		MOVQ ptxLen, aluTMP
  1204		SHLQ $4, aluTMP
  1205		LEAQ andMask<>(SB), aluCTR
  1206		MOVOU -16(aluCTR)(aluTMP*1), T1
  1207	
  1208		MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1209		PAND T1, B0
  1210		// T1 now keeps the masked raw ciphertext for the final XOR; B0 is byte-swapped and hashed
  1211		MOVOU B0, T1
  1212		PSHUFB BSWAP, B0
  1213		PXOR ACC0, B0
  1214	
  1215		MOVOU (16*14)(pTbl), ACC0
  1216		MOVOU (16*15)(pTbl), ACCM
  1217		MOVOU ACC0, ACC1
  1218	
  1219		PCLMULQDQ $0x00, B0, ACC0
  1220		PCLMULQDQ $0x11, B0, ACC1
  1221		PSHUFD $78, B0, T0
  1222		PXOR B0, T0
  1223		PCLMULQDQ $0x00, T0, ACCM
  1224	
  1225		PXOR ACC0, ACCM
  1226		PXOR ACC1, ACCM
  1227		MOVOU ACCM, T0
  1228		PSRLDQ $8, ACCM
  1229		PSLLDQ $8, T0
  1230		PXOR ACCM, ACC1
  1231		PXOR T0, ACC0
  1232	
  1233		reduceRound(ACC0)
  1234		reduceRound(ACC0)
  1235		PXOR ACC1, ACC0
  1236		// key stream for the partial block; increment(0) here writes through aluCTR, which now holds the andMask address, but the slot is not read again
  1237		MOVOU (0*16)(SP), B0
  1238		increment(0)
  1239		AESENC B1, B0
  1240		AESENC B2, B0
  1241		AESENC B3, B0
  1242		AESENC B4, B0
  1243		AESENC B5, B0
  1244		AESENC B6, B0
  1245		AESENC B7, B0
  1246		MOVOU (16*8)(ks), T0
  1247		AESENC T0, B0
  1248		MOVOU (16*9)(ks), T0
  1249		AESENC T0, B0
  1250		MOVOU (16*10)(ks), T0
  1251		CMPQ NR, $12	// flags are reused by the JE below
  1252		JB decLast3
  1253		AESENC T0, B0
  1254		MOVOU (16*11)(ks), T0
  1255		AESENC T0, B0
  1256		MOVOU (16*12)(ks), T0
  1257		JE decLast3
  1258		AESENC T0, B0
  1259		MOVOU (16*13)(ks), T0
  1260		AESENC T0, B0
  1261		MOVOU (16*14)(ks), T0
  1262	decLast3:
  1263		AESENCLAST T0, B0
  1264		PXOR T1, B0
  1265		// write exactly ptxLen plaintext bytes, lowest byte first, so dst is never overrun
  1266	ptxStoreLoop:
  1267			PEXTRB $0, B0, (ptx)
  1268			PSRLDQ $1, B0
  1269			LEAQ 1(ptx), ptx
  1270			DECQ ptxLen
  1271	
  1272		JNE ptxStoreLoop
  1273	
  1274	gcmAesDecDone:
  1275	
  1276		MOVOU ACC0, (tPtr)
  1277		RET

View as plain text