...
Run Format

Text file src/crypto/aes/gcm_amd64.s

Documentation: crypto/aes

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
     6	// The implementation uses some optimization as described in:
     7	// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8	//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9	// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10	//     Hardware
    11	
    12	#include "textflag.h"
    13	
    14	#define B0 X0	// B0-B7: block registers (AES state, loaded data blocks, cached round keys)
    15	#define B1 X1
    16	#define B2 X2
    17	#define B3 X3
    18	#define B4 X4
    19	#define B5 X5
    20	#define B6 X6
    21	#define B7 X7
    22	// Multiplication accumulators: ACC0 collects the PCLMULQDQ $0x00 (low qword) products, ACC1 the $0x11 (high qword) products, ACCM the middle (Karatsuba) products.
    23	#define ACC0 X8
    24	#define ACC1 X9
    25	#define ACCM X10
    26	// T0-T2 are scratch. POLY and BSWAP are loaded from gcmPoly<> and bswapMask<> by the routines that use them.
    27	#define T0 X11
    28	#define T1 X12
    29	#define T2 X13
    30	#define POLY X14
    31	#define BSWAP X15
    32	
    33	DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f	// PSHUFB control: result byte i = source byte 15-i (full 16-byte reversal)
    34	DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35	// gcmPoly: the constant loaded into POLY; used by the reduction steps and by the H*2 computation in gcmAesInit.
    36	DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37	DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38	// andMask: fifteen 16-byte entries; the entry at offset 16*(n-1) has its low n bytes set to 0xff (n = 1..15). Used to truncate a partial final block to n bytes.
    39	DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40	DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41	DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42	DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43	DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44	DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45	DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46	DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47	DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48	DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49	DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50	DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51	DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52	DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53	DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54	DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55	DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56	DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57	DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58	DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59	DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60	DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61	DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62	DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63	DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64	DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65	DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66	DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67	DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68	DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69	// All three tables are read-only and pointer-free; sizes are 16, 16 and 15*16 = 240 bytes.
    70	GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71	GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72	GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73	
    74	// func hasGCMAsm() bool
    75	// returns whether AES-NI, CLMUL-NI, SSSE3 and SSE4.1 are all supported
    76	TEXT ·hasGCMAsm(SB),NOSPLIT,$0
    77		// CPUID leaf 1 reports the feature flags tested below in ECX.
    78		MOVL $1, AX
    79		CPUID
    80		// Required ECX bits: 1 (PCLMULQDQ), 9 (SSSE3, needed for PSHUFB),
    81		// 19 (SSE4.1, needed for PINSRB/PINSRD/PINSRQ/PEXTRB) and 25 (AES-NI).
    82		ANDL $0x02080202, CX
    83		CMPL CX, $0x02080202
    84		// True only when every required bit is set.
    85		SETEQ ret+0(FP)
    86		RET
    87	
    88	// func aesEncBlock(dst, src *[16]byte, ks []uint32)
    89	TEXT ·aesEncBlock(SB),NOSPLIT,$0
    90		MOVQ ks_len+24(FP), CX
    91		MOVQ ks_base+16(FP), DX
    92		MOVQ src+8(FP), SI
    93		MOVQ dst+0(FP), DI
    94		// CX = len(ks)/4 - 1; comparing it with 12 selects the 10-, 12- or 14-round path.
    95		SHRQ $2, CX
    96		SUBQ $1, CX
    97		CMPQ CX, $12	// the SSE/AES instructions below do not write EFLAGS, so this result is still valid at both branches
    98		MOVOU (DX), X0
    99		MOVOU (SI), X1
   100		PXOR X1, X0	// X0 = src ^ round key 0
   101		MOVOU 16(DX), X1
   102		AESENC X1, X0
   103		MOVOU 32(DX), X1
   104		AESENC X1, X0
   105		MOVOU 48(DX), X1
   106		AESENC X1, X0
   107		MOVOU 64(DX), X1
   108		AESENC X1, X0
   109		MOVOU 80(DX), X1
   110		AESENC X1, X0
   111		MOVOU 96(DX), X1
   112		AESENC X1, X0
   113		MOVOU 112(DX), X1
   114		AESENC X1, X0
   115		MOVOU 128(DX), X1
   116		AESENC X1, X0
   117		MOVOU 144(DX), X1
   118		AESENC X1, X0
   119		MOVOU 160(DX), X1
   120		// CX below 12: the key just loaded from offset 160 is the final round key.
   121		JCS finalRound
   122		AESENC X1, X0
   123		MOVOU 176(DX), X1
   124		AESENC X1, X0
   125		MOVOU 192(DX), X1
   126		JEQ finalRound
   127		AESENC X1, X0
   128		MOVOU 208(DX), X1
   129		AESENC X1, X0
   130		MOVOU 224(DX), X1
   131	
   132	finalRound:
   133		AESENCLAST X1, X0
   134		MOVOU X0, (DI)
   135	
   136		RET
   137	
   138	// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
   139	TEXT ·gcmAesFinish(SB),NOSPLIT,$0
   140	#define pTbl DI
   141	#define tMsk SI
   142	#define tPtr DX
   143	#define plen AX
   144	#define dlen CX
   145		// XORs the two bit lengths into the hash state at T, multiplies by productTable entries 14/15, reduces, byte-reverses, XORs tagMask and writes the result back to T.
   146		MOVQ productTable+0(FP), pTbl
   147		MOVQ tagMask+8(FP), tMsk
   148		MOVQ T+16(FP), tPtr
   149		MOVQ pLen+24(FP), plen
   150		MOVQ dLen+32(FP), dlen
   151	
   152		MOVOU (tPtr), ACC0	// current hash state
   153		MOVOU (tMsk), T2	// kept until the final XOR
   154	
   155		MOVOU bswapMask<>(SB), BSWAP
   156		MOVOU gcmPoly<>(SB), POLY
   157		// Multiply both lengths by 8 (presumably byte counts becoming bit counts - confirm against the caller).
   158		SHLQ $3, plen
   159		SHLQ $3, dlen
   160		// Length block: low qword = pLen*8, high qword = dLen*8.
   161		MOVQ plen, B0
   162		PINSRQ $1, dlen, B0
   163	
   164		PXOR ACC0, B0	// fold in the running hash
   165		// Entries 14/15 are the first pair written by gcmAesInit (the doubled hash key and its hi^lo term).
   166		MOVOU (16*14)(pTbl), ACC0
   167		MOVOU (16*15)(pTbl), ACCM
   168		MOVOU ACC0, ACC1
   169		// Three carry-less products: low*low, high*high, (hi^lo)*(hi^lo).
   170		PCLMULQDQ $0x00, B0, ACC0
   171		PCLMULQDQ $0x11, B0, ACC1
   172		PSHUFD $78, B0, T0	// swap the qword halves of B0
   173		PXOR B0, T0
   174		PCLMULQDQ $0x00, T0, ACCM
   175		// Recover the middle term and split it across the low (ACC0) and high (ACC1) halves of the product.
   176		PXOR ACC0, ACCM
   177		PXOR ACC1, ACCM
   178		MOVOU ACCM, T0
   179		PSRLDQ $8, ACCM
   180		PSLLDQ $8, T0
   181		PXOR ACCM, ACC1
   182		PXOR T0, ACC0
   183		// First reduction step.
   184		MOVOU POLY, T0
   185		PCLMULQDQ $0x01, ACC0, T0
   186		PSHUFD $78, ACC0, ACC0
   187		PXOR T0, ACC0
   188		// Second reduction step.
   189		MOVOU POLY, T0
   190		PCLMULQDQ $0x01, ACC0, T0
   191		PSHUFD $78, ACC0, ACC0
   192		PXOR T0, ACC0
   193	
   194		PXOR ACC1, ACC0	// add the high half: ACC0 is now the reduced product
   195	
   196		PSHUFB BSWAP, ACC0	// reverse the byte order for output
   197		PXOR T2, ACC0	// apply tagMask
   198		MOVOU ACC0, (tPtr)
   199	
   200		RET
   201	#undef pTbl
   202	#undef tMsk
   203	#undef tPtr
   204	#undef plen
   205	#undef dlen
   206	
   207	// func gcmAesInit(productTable *[256]byte, ks []uint32)
   208	TEXT ·gcmAesInit(SB),NOSPLIT,$0
   209	#define dst DI
   210	#define KS SI
   211	#define NR DX
   212		// Derives the hash key from the key schedule and fills productTable with eight 32-byte entries: a value followed by its hi^lo (Karatsuba) term, written from offset 16*14 downwards.
   213		MOVQ productTable+0(FP), dst
   214		MOVQ ks_base+8(FP), KS
   215		MOVQ ks_len+16(FP), NR
   216	
   217		SHRQ $2, NR	// NR = len(ks)/4 - 1; compared with 12 below to pick the 10/12/14-round path
   218		DECQ NR
   219	
   220		MOVOU bswapMask<>(SB), BSWAP
   221		MOVOU gcmPoly<>(SB), POLY
   222	
   223		// Encrypt block 0, with the AES key to generate the hash key H
   224		MOVOU (16*0)(KS), B0	// an all-zero block XOR round key 0 is just round key 0
   225		MOVOU (16*1)(KS), T0
   226		AESENC T0, B0
   227		MOVOU (16*2)(KS), T0
   228		AESENC T0, B0
   229		MOVOU (16*3)(KS), T0
   230		AESENC T0, B0
   231		MOVOU (16*4)(KS), T0
   232		AESENC T0, B0
   233		MOVOU (16*5)(KS), T0
   234		AESENC T0, B0
   235		MOVOU (16*6)(KS), T0
   236		AESENC T0, B0
   237		MOVOU (16*7)(KS), T0
   238		AESENC T0, B0
   239		MOVOU (16*8)(KS), T0
   240		AESENC T0, B0
   241		MOVOU (16*9)(KS), T0
   242		AESENC T0, B0
   243		MOVOU (16*10)(KS), T0
   244		CMPQ NR, $12	// these flags feed both the JB here and the JE further down
   245		JB initEncLast
   246		AESENC T0, B0
   247		MOVOU (16*11)(KS), T0
   248		AESENC T0, B0
   249		MOVOU (16*12)(KS), T0
   250		JE initEncLast
   251		AESENC T0, B0
   252		MOVOU (16*13)(KS), T0
   253		AESENC T0, B0
   254		MOVOU (16*14)(KS), T0
   255	initEncLast:
   256		AESENCLAST T0, B0
   257	
   258		PSHUFB BSWAP, B0	// reverse the byte order of H
   259		// H * 2
   260		PSHUFD $0xff, B0, T0	// broadcast the top dword
   261		MOVOU B0, T1
   262		PSRAL $31, T0	// all ones if the top bit of H was set, otherwise zero
   263		PAND POLY, T0	// POLY or zero: the term added when the top bit shifts out
   264		PSRLL $31, T1	// top bit of each dword...
   265		PSLLDQ $4, T1	// ...moved up into the next dword as its carry-in
   266		PSLLL $1, B0
   267		PXOR T0, B0
   268		PXOR T1, B0
   269		// Karatsuba pre-computations
   270		MOVOU B0, (16*14)(dst)
   271		PSHUFD $78, B0, B1	// swap qword halves
   272		PXOR B0, B1	// B1 = hi ^ lo
   273		MOVOU B1, (16*15)(dst)
   274	
   275		MOVOU B0, B2	// B2/B3: most recent table value and its hi^lo term
   276		MOVOU B1, B3
   277		// Now prepare powers of H and pre-computations for them
   278		MOVQ $7, AX	// seven more entries
   279	
   280	initLoop:
   281			MOVOU B2, T0
   282			MOVOU B2, T1
   283			MOVOU B3, T2
   284			PCLMULQDQ $0x00, B0, T0
   285			PCLMULQDQ $0x11, B0, T1
   286			PCLMULQDQ $0x00, B1, T2
   287			// Recover the middle term and fold it into the low (T0) and high (T1) halves.
   288			PXOR T0, T2
   289			PXOR T1, T2
   290			MOVOU T2, B4
   291			PSLLDQ $8, B4
   292			PSRLDQ $8, T2
   293			PXOR B4, T0
   294			PXOR T2, T1
   295			// Two reduction steps; the reduced product ends up in B2.
   296			MOVOU POLY, B2
   297			PCLMULQDQ $0x01, T0, B2
   298			PSHUFD $78, T0, T0
   299			PXOR B2, T0
   300			MOVOU POLY, B2
   301			PCLMULQDQ $0x01, T0, B2
   302			PSHUFD $78, T0, T0
   303			PXOR T0, B2
   304			PXOR T1, B2
   305			// Store the new value and its hi^lo term one entry pair below the previous one.
   306			MOVOU B2, (16*12)(dst)
   307			PSHUFD $78, B2, B3
   308			PXOR B2, B3
   309			MOVOU B3, (16*13)(dst)
   310	
   311			DECQ AX
   312			LEAQ (-16*2)(dst), dst	// LEAQ leaves the flags from DECQ intact for the JNE
   313		JNE initLoop
   314	
   315		RET
   316	#undef NR
   317	#undef KS
   318	#undef dst
   319	
   320	// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
   321	TEXT ·gcmAesData(SB),NOSPLIT,$0
   322	#define pTbl DI
   323	#define aut SI
   324	#define tPtr CX
   325	#define autLen DX
   326	// reduceRound(a): one reduction step on a using POLY (clobbers T0). mulRoundAAD(X, i): multiplies X by table entry pair i and XORs the three partial products into ACC0/ACC1/ACCM (clobbers T1, T2 and X).
   327	#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   328	#define mulRoundAAD(X ,i) \
   329		MOVOU (16*(i*2))(pTbl), T1;\
   330		MOVOU T1, T2;\
   331		PCLMULQDQ $0x00, X, T1;\
   332		PXOR T1, ACC0;\
   333		PCLMULQDQ $0x11, X, T2;\
   334		PXOR T2, ACC1;\
   335		PSHUFD $78, X, T1;\
   336		PXOR T1, X;\
   337		MOVOU (16*(i*2+1))(pTbl), T1;\
   338		PCLMULQDQ $0x00, X, T1;\
   339		PXOR T1, ACCM
   340		// Hashes data, starting from an all-zero state, and writes the resulting state to T. T is written only, never read.
   341		MOVQ productTable+0(FP), pTbl
   342		MOVQ data_base+8(FP), aut
   343		MOVQ data_len+16(FP), autLen
   344		MOVQ T+32(FP), tPtr
   345	
   346		PXOR ACC0, ACC0	// hash state starts at zero
   347		MOVOU bswapMask<>(SB), BSWAP
   348		MOVOU gcmPoly<>(SB), POLY
   349	
   350		TESTQ autLen, autLen
   351		JEQ dataBail	// empty input: store the zero state
   352	
   353		CMPQ autLen, $13	// optimize the TLS case
   354		JE dataTLS
   355		CMPQ autLen, $128
   356		JB startSinglesLoop
   357		JMP dataOctaLoop
   358	
   359	dataTLS:
   360		MOVOU (16*14)(pTbl), T1
   361		MOVOU (16*15)(pTbl), T2
   362		PXOR B0, B0
   363		MOVQ (aut), B0	// bytes 0-7
   364		PINSRD $2, 8(aut), B0	// bytes 8-11
   365		PINSRB $12, 12(aut), B0	// byte 12; bytes 13-15 stay zero
   366		XORQ autLen, autLen	// nothing remains after this block
   367		JMP dataMul
   368	
   369	dataOctaLoop:
   370			CMPQ autLen, $128
   371			JB startSinglesLoop
   372			SUBQ $128, autLen
   373			// Load eight blocks and reverse the byte order of each.
   374			MOVOU (16*0)(aut), X0
   375			MOVOU (16*1)(aut), X1
   376			MOVOU (16*2)(aut), X2
   377			MOVOU (16*3)(aut), X3
   378			MOVOU (16*4)(aut), X4
   379			MOVOU (16*5)(aut), X5
   380			MOVOU (16*6)(aut), X6
   381			MOVOU (16*7)(aut), X7
   382			LEAQ (16*8)(aut), aut
   383			PSHUFB BSWAP, X0
   384			PSHUFB BSWAP, X1
   385			PSHUFB BSWAP, X2
   386			PSHUFB BSWAP, X3
   387			PSHUFB BSWAP, X4
   388			PSHUFB BSWAP, X5
   389			PSHUFB BSWAP, X6
   390			PSHUFB BSWAP, X7
   391			PXOR ACC0, X0	// fold the running hash into the first block
   392			// First block times table entry pair 0; this initialises the three accumulators.
   393			MOVOU (16*0)(pTbl), ACC0
   394			MOVOU (16*1)(pTbl), ACCM
   395			MOVOU ACC0, ACC1
   396			PSHUFD $78, X0, T1
   397			PXOR X0, T1
   398			PCLMULQDQ $0x00, X0, ACC0
   399			PCLMULQDQ $0x11, X0, ACC1
   400			PCLMULQDQ $0x00, T1, ACCM
   401			// Remaining seven blocks, each against its own table entry pair.
   402			mulRoundAAD(X1, 1)
   403			mulRoundAAD(X2, 2)
   404			mulRoundAAD(X3, 3)
   405			mulRoundAAD(X4, 4)
   406			mulRoundAAD(X5, 5)
   407			mulRoundAAD(X6, 6)
   408			mulRoundAAD(X7, 7)
   409			// Combine the accumulated products and reduce once for all eight blocks.
   410			PXOR ACC0, ACCM
   411			PXOR ACC1, ACCM
   412			MOVOU ACCM, T0
   413			PSRLDQ $8, ACCM
   414			PSLLDQ $8, T0
   415			PXOR ACCM, ACC1
   416			PXOR T0, ACC0
   417			reduceRound(ACC0)
   418			reduceRound(ACC0)
   419			PXOR ACC1, ACC0
   420		JMP dataOctaLoop
   421	
   422	startSinglesLoop:
   423		MOVOU (16*14)(pTbl), T1	// the single-block path multiplies by table entries 14/15, kept in T1/T2
   424		MOVOU (16*15)(pTbl), T2
   425	
   426	dataSinglesLoop:
   427	
   428			CMPQ autLen, $16
   429			JB dataEnd
   430			SUBQ $16, autLen
   431	
   432			MOVOU (aut), B0
   433	dataMul:
   434			PSHUFB BSWAP, B0	// also entered from dataTLS and dataLoadLoop with B0 already filled
   435			PXOR ACC0, B0
   436	
   437			MOVOU T1, ACC0
   438			MOVOU T2, ACCM
   439			MOVOU T1, ACC1
   440	
   441			PSHUFD $78, B0, T0
   442			PXOR B0, T0
   443			PCLMULQDQ $0x00, B0, ACC0
   444			PCLMULQDQ $0x11, B0, ACC1
   445			PCLMULQDQ $0x00, T0, ACCM
   446	
   447			PXOR ACC0, ACCM
   448			PXOR ACC1, ACCM
   449			MOVOU ACCM, T0
   450			PSRLDQ $8, ACCM
   451			PSLLDQ $8, T0
   452			PXOR ACCM, ACC1
   453			PXOR T0, ACC0
   454	
   455			MOVOU POLY, T0
   456			PCLMULQDQ $0x01, ACC0, T0
   457			PSHUFD $78, ACC0, ACC0
   458			PXOR T0, ACC0
   459	
   460			MOVOU POLY, T0
   461			PCLMULQDQ $0x01, ACC0, T0
   462			PSHUFD $78, ACC0, ACC0
   463			PXOR T0, ACC0
   464			PXOR ACC1, ACC0
   465	
   466			LEAQ 16(aut), aut
   467	
   468		JMP dataSinglesLoop
   469	
   470	dataEnd:
   471	
   472		TESTQ autLen, autLen
   473		JEQ dataBail
   474		// 1-15 bytes remain: build a zero-padded block in B0 by reading them from last to first.
   475		PXOR B0, B0
   476		LEAQ -1(aut)(autLen*1), aut	// address of the last remaining byte
   477	
   478	dataLoadLoop:
   479	
   480			PSLLDQ $1, B0
   481			PINSRB $0, (aut), B0
   482	
   483			LEAQ -1(aut), aut
   484			DECQ autLen
   485			JNE dataLoadLoop
   486	
   487		JMP dataMul	// autLen is zero now, so the loop falls through to dataBail after this block
   488	
   489	dataBail:
   490		MOVOU ACC0, (tPtr)
   491		RET
   492	#undef pTbl
   493	#undef aut
   494	#undef tPtr
   495	#undef autLen
   496	
   497	// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   498	TEXT ·gcmAesEnc(SB),0,$256-96
   499	#define pTbl DI
   500	#define ctx DX
   501	#define ctrPtr CX
   502	#define ptx SI
   503	#define ks AX
   504	#define tPtr R8
   505	#define ptxLen R9
   506	#define aluCTR R10
   507	#define aluTMP R11
   508	#define aluK R12
   509	#define NR R13
   510	// Frame layout: (0..127)(SP) holds eight byte-reversed ciphertext blocks waiting to be hashed; (128..255)(SP) holds eight counter blocks already XORed with round key 0. increment(i) bumps aluCTR and rewrites the last dword of counter slot i.
   511	#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   512	#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   513	#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   514	#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
   515	#define combinedRound(i) \
   516		MOVOU (16*i)(ks), T0;\
   517		AESENC T0, B0;\
   518		AESENC T0, B1;\
   519		AESENC T0, B2;\
   520		AESENC T0, B3;\
   521		 MOVOU (16*(i*2))(pTbl), T1;\
   522		 MOVOU T1, T2;\
   523		AESENC T0, B4;\
   524		AESENC T0, B5;\
   525		AESENC T0, B6;\
   526		AESENC T0, B7;\
   527		 MOVOU (16*i)(SP), T0;\
   528		 PCLMULQDQ $0x00, T0, T1;\
   529		 PXOR T1, ACC0;\
   530		 PSHUFD $78, T0, T1;\
   531		 PCLMULQDQ $0x11, T0, T2;\
   532		 PXOR T1, T0;\
   533		 PXOR T2, ACC1;\
   534		 MOVOU (16*(i*2+1))(pTbl), T2;\
   535		 PCLMULQDQ $0x00, T2, T0;\
   536		 PXOR T0, ACCM
   537	#define mulRound(i) \
   538		MOVOU (16*i)(SP), T0;\
   539		MOVOU (16*(i*2))(pTbl), T1;\
   540		MOVOU T1, T2;\
   541		PCLMULQDQ $0x00, T0, T1;\
   542		PXOR T1, ACC0;\
   543		PCLMULQDQ $0x11, T0, T2;\
   544		PXOR T2, ACC1;\
   545		PSHUFD $78, T0, T1;\
   546		PXOR T1, T0;\
   547		MOVOU (16*(i*2+1))(pTbl), T1;\
   548		PCLMULQDQ $0x00, T0, T1;\
   549		PXOR T1, ACCM
   550		// Encrypts src into dst in counter mode and hashes the ciphertext into the state at T. Only the base pointer of dst is read; its length is not checked here.
   551		MOVQ productTable+0(FP), pTbl
   552		MOVQ dst+8(FP), ctx
   553		MOVQ src_base+32(FP), ptx
   554		MOVQ src_len+40(FP), ptxLen
   555		MOVQ ctr+56(FP), ctrPtr
   556		MOVQ T+64(FP), tPtr
   557		MOVQ ks_base+72(FP), ks
   558		MOVQ ks_len+80(FP), NR
   559	
   560		SHRQ $2, NR	// NR = len(ks)/4 - 1; compared with 12 to pick the 10/12/14-round path
   561		DECQ NR
   562	
   563		MOVOU bswapMask<>(SB), BSWAP
   564		MOVOU gcmPoly<>(SB), POLY
   565	
   566		MOVOU (tPtr), ACC0	// resume from the hash state stored in T
   567		PXOR ACC1, ACC1
   568		PXOR ACCM, ACCM
   569		MOVOU (ctrPtr), B0
   570		MOVL (3*4)(ctrPtr), aluCTR	// last 32-bit word of the counter block
   571		MOVOU (ks), T0
   572		MOVL (3*4)(ks), aluK	// matching word of round key 0
   573		BSWAPL aluCTR
   574		BSWAPL aluK
   575	
   576		PXOR B0, T0	// T0 = counter block ^ round key 0
   577		MOVOU T0, (8*16 + 0*16)(SP)
   578		increment(0)	// slot 0 now holds counter+1, the first value actually encrypted
   579	
   580		CMPQ ptxLen, $128
   581		JB gcmAesEncSingles
   582		SUBQ $128, ptxLen
   583	
   584		// We have at least 8 blocks to encrypt, prepare the rest of the counters
   585		MOVOU T0, (8*16 + 1*16)(SP)
   586		increment(1)
   587		MOVOU T0, (8*16 + 2*16)(SP)
   588		increment(2)
   589		MOVOU T0, (8*16 + 3*16)(SP)
   590		increment(3)
   591		MOVOU T0, (8*16 + 4*16)(SP)
   592		increment(4)
   593		MOVOU T0, (8*16 + 5*16)(SP)
   594		increment(5)
   595		MOVOU T0, (8*16 + 6*16)(SP)
   596		increment(6)
   597		MOVOU T0, (8*16 + 7*16)(SP)
   598		increment(7)
   599	
   600		MOVOU (8*16 + 0*16)(SP), B0
   601		MOVOU (8*16 + 1*16)(SP), B1
   602		MOVOU (8*16 + 2*16)(SP), B2
   603		MOVOU (8*16 + 3*16)(SP), B3
   604		MOVOU (8*16 + 4*16)(SP), B4
   605		MOVOU (8*16 + 5*16)(SP), B5
   606		MOVOU (8*16 + 6*16)(SP), B6
   607		MOVOU (8*16 + 7*16)(SP), B7
   608		// First batch: AES only, since no ciphertext exists yet to hash. The counter slots are refreshed for the next batch between rounds.
   609		aesRound(1)
   610		increment(0)
   611		aesRound(2)
   612		increment(1)
   613		aesRound(3)
   614		increment(2)
   615		aesRound(4)
   616		increment(3)
   617		aesRound(5)
   618		increment(4)
   619		aesRound(6)
   620		increment(5)
   621		aesRound(7)
   622		increment(6)
   623		aesRound(8)
   624		increment(7)
   625		aesRound(9)
   626		MOVOU (16*10)(ks), T0
   627		CMPQ NR, $12	// these flags feed both the JB here and the JE below
   628		JB encLast1
   629		aesRnd(T0)
   630		aesRound(11)
   631		MOVOU (16*12)(ks), T0
   632		JE encLast1
   633		aesRnd(T0)
   634		aesRound(13)
   635		MOVOU (16*14)(ks), T0
   636	encLast1:
   637		aesRndLast(T0)
   638		// XOR the keystream with eight plaintext blocks.
   639		MOVOU (16*0)(ptx), T0
   640		PXOR T0, B0
   641		MOVOU (16*1)(ptx), T0
   642		PXOR T0, B1
   643		MOVOU (16*2)(ptx), T0
   644		PXOR T0, B2
   645		MOVOU (16*3)(ptx), T0
   646		PXOR T0, B3
   647		MOVOU (16*4)(ptx), T0
   648		PXOR T0, B4
   649		MOVOU (16*5)(ptx), T0
   650		PXOR T0, B5
   651		MOVOU (16*6)(ptx), T0
   652		PXOR T0, B6
   653		MOVOU (16*7)(ptx), T0
   654		PXOR T0, B7
   655		// Store each ciphertext block, then byte-reverse it for hashing. The running hash is folded into block 0.
   656		MOVOU B0, (16*0)(ctx)
   657		PSHUFB BSWAP, B0
   658		PXOR ACC0, B0
   659		MOVOU B1, (16*1)(ctx)
   660		PSHUFB BSWAP, B1
   661		MOVOU B2, (16*2)(ctx)
   662		PSHUFB BSWAP, B2
   663		MOVOU B3, (16*3)(ctx)
   664		PSHUFB BSWAP, B3
   665		MOVOU B4, (16*4)(ctx)
   666		PSHUFB BSWAP, B4
   667		MOVOU B5, (16*5)(ctx)
   668		PSHUFB BSWAP, B5
   669		MOVOU B6, (16*6)(ctx)
   670		PSHUFB BSWAP, B6
   671		MOVOU B7, (16*7)(ctx)
   672		PSHUFB BSWAP, B7
   673		// Park the blocks on the stack; they are hashed while the next batch is being encrypted.
   674		MOVOU B0, (16*0)(SP)
   675		MOVOU B1, (16*1)(SP)
   676		MOVOU B2, (16*2)(SP)
   677		MOVOU B3, (16*3)(SP)
   678		MOVOU B4, (16*4)(SP)
   679		MOVOU B5, (16*5)(SP)
   680		MOVOU B6, (16*6)(SP)
   681		MOVOU B7, (16*7)(SP)
   682	
   683		LEAQ 128(ptx), ptx
   684		LEAQ 128(ctx), ctx
   685	
   686	gcmAesEncOctetsLoop:
   687	
   688			CMPQ ptxLen, $128
   689			JB gcmAesEncOctetsEnd
   690			SUBQ $128, ptxLen
   691			// Load the eight prepared counter blocks.
   692			MOVOU (8*16 + 0*16)(SP), B0
   693			MOVOU (8*16 + 1*16)(SP), B1
   694			MOVOU (8*16 + 2*16)(SP), B2
   695			MOVOU (8*16 + 3*16)(SP), B3
   696			MOVOU (8*16 + 4*16)(SP), B4
   697			MOVOU (8*16 + 5*16)(SP), B5
   698			MOVOU (8*16 + 6*16)(SP), B6
   699			MOVOU (8*16 + 7*16)(SP), B7
   700			// Start hashing the previous batch: parked block 0 (running hash already folded in) times table pair 0.
   701			MOVOU (16*0)(SP), T0
   702			PSHUFD $78, T0, T1
   703			PXOR T0, T1
   704	
   705			MOVOU (16*0)(pTbl), ACC0
   706			MOVOU (16*1)(pTbl), ACCM
   707			MOVOU ACC0, ACC1
   708	
   709			PCLMULQDQ $0x00, T1, ACCM
   710			PCLMULQDQ $0x00, T0, ACC0
   711			PCLMULQDQ $0x11, T0, ACC1
   712			// Each combinedRound(i) does AES round i on all eight blocks and multiplies parked block i by table pair i.
   713			combinedRound(1)
   714			increment(0)
   715			combinedRound(2)
   716			increment(1)
   717			combinedRound(3)
   718			increment(2)
   719			combinedRound(4)
   720			increment(3)
   721			combinedRound(5)
   722			increment(4)
   723			combinedRound(6)
   724			increment(5)
   725			combinedRound(7)
   726			increment(6)
   727	
   728			aesRound(8)
   729			increment(7)
   730			// Combine the accumulated products, then reduce; the reduction is interleaved with AES round 9.
   731			PXOR ACC0, ACCM
   732			PXOR ACC1, ACCM
   733			MOVOU ACCM, T0
   734			PSRLDQ $8, ACCM
   735			PSLLDQ $8, T0
   736			PXOR ACCM, ACC1
   737			PXOR T0, ACC0
   738	
   739			reduceRound(ACC0)
   740			aesRound(9)
   741	
   742			reduceRound(ACC0)
   743			PXOR ACC1, ACC0
   744	
   745			MOVOU (16*10)(ks), T0
   746			CMPQ NR, $12
   747			JB encLast2
   748			aesRnd(T0)
   749			aesRound(11)
   750			MOVOU (16*12)(ks), T0
   751			JE encLast2
   752			aesRnd(T0)
   753			aesRound(13)
   754			MOVOU (16*14)(ks), T0
   755	encLast2:
   756			aesRndLast(T0)
   757	
   758			MOVOU (16*0)(ptx), T0
   759			PXOR T0, B0
   760			MOVOU (16*1)(ptx), T0
   761			PXOR T0, B1
   762			MOVOU (16*2)(ptx), T0
   763			PXOR T0, B2
   764			MOVOU (16*3)(ptx), T0
   765			PXOR T0, B3
   766			MOVOU (16*4)(ptx), T0
   767			PXOR T0, B4
   768			MOVOU (16*5)(ptx), T0
   769			PXOR T0, B5
   770			MOVOU (16*6)(ptx), T0
   771			PXOR T0, B6
   772			MOVOU (16*7)(ptx), T0
   773			PXOR T0, B7
   774	
   775			MOVOU B0, (16*0)(ctx)
   776			PSHUFB BSWAP, B0
   777			PXOR ACC0, B0
   778			MOVOU B1, (16*1)(ctx)
   779			PSHUFB BSWAP, B1
   780			MOVOU B2, (16*2)(ctx)
   781			PSHUFB BSWAP, B2
   782			MOVOU B3, (16*3)(ctx)
   783			PSHUFB BSWAP, B3
   784			MOVOU B4, (16*4)(ctx)
   785			PSHUFB BSWAP, B4
   786			MOVOU B5, (16*5)(ctx)
   787			PSHUFB BSWAP, B5
   788			MOVOU B6, (16*6)(ctx)
   789			PSHUFB BSWAP, B6
   790			MOVOU B7, (16*7)(ctx)
   791			PSHUFB BSWAP, B7
   792	
   793			MOVOU B0, (16*0)(SP)
   794			MOVOU B1, (16*1)(SP)
   795			MOVOU B2, (16*2)(SP)
   796			MOVOU B3, (16*3)(SP)
   797			MOVOU B4, (16*4)(SP)
   798			MOVOU B5, (16*5)(SP)
   799			MOVOU B6, (16*6)(SP)
   800			MOVOU B7, (16*7)(SP)
   801	
   802			LEAQ 128(ptx), ptx
   803			LEAQ 128(ctx), ctx
   804	
   805			JMP gcmAesEncOctetsLoop
   806	
   807	gcmAesEncOctetsEnd:
   808		// Hash the last batch of eight parked ciphertext blocks.
   809		MOVOU (16*0)(SP), T0
   810		MOVOU (16*0)(pTbl), ACC0
   811		MOVOU (16*1)(pTbl), ACCM
   812		MOVOU ACC0, ACC1
   813		PSHUFD $78, T0, T1
   814		PXOR T0, T1
   815		PCLMULQDQ $0x00, T0, ACC0
   816		PCLMULQDQ $0x11, T0, ACC1
   817		PCLMULQDQ $0x00, T1, ACCM
   818	
   819		mulRound(1)
   820		mulRound(2)
   821		mulRound(3)
   822		mulRound(4)
   823		mulRound(5)
   824		mulRound(6)
   825		mulRound(7)
   826	
   827		PXOR ACC0, ACCM
   828		PXOR ACC1, ACCM
   829		MOVOU ACCM, T0
   830		PSRLDQ $8, ACCM
   831		PSLLDQ $8, T0
   832		PXOR ACCM, ACC1
   833		PXOR T0, ACC0
   834	
   835		reduceRound(ACC0)
   836		reduceRound(ACC0)
   837		PXOR ACC1, ACC0
   838	
   839		TESTQ ptxLen, ptxLen
   840		JE gcmAesEncDone
   841	
   842		SUBQ $7, aluCTR	// only counter slot 0 is used from here on; rewind so the next increment(0) yields slot 0's value plus one
   843	
   844	gcmAesEncSingles:
   845		// Keep round keys 1-7 resident in B1-B7 for the one-block-at-a-time path.
   846		MOVOU (16*1)(ks), B1
   847		MOVOU (16*2)(ks), B2
   848		MOVOU (16*3)(ks), B3
   849		MOVOU (16*4)(ks), B4
   850		MOVOU (16*5)(ks), B5
   851		MOVOU (16*6)(ks), B6
   852		MOVOU (16*7)(ks), B7
   853	
   854		MOVOU (16*14)(pTbl), T2	// table entry 14, used for every single-block multiply
   855	
   856	gcmAesEncSinglesLoop:
   857	
   858			CMPQ ptxLen, $16
   859			JB gcmAesEncTail
   860			SUBQ $16, ptxLen
   861	
   862			MOVOU (8*16 + 0*16)(SP), B0
   863			increment(0)
   864	
   865			AESENC B1, B0
   866			AESENC B2, B0
   867			AESENC B3, B0
   868			AESENC B4, B0
   869			AESENC B5, B0
   870			AESENC B6, B0
   871			AESENC B7, B0
   872			MOVOU (16*8)(ks), T0
   873			AESENC T0, B0
   874			MOVOU (16*9)(ks), T0
   875			AESENC T0, B0
   876			MOVOU (16*10)(ks), T0
   877			CMPQ NR, $12
   878			JB encLast3
   879			AESENC T0, B0
   880			MOVOU (16*11)(ks), T0
   881			AESENC T0, B0
   882			MOVOU (16*12)(ks), T0
   883			JE encLast3
   884			AESENC T0, B0
   885			MOVOU (16*13)(ks), T0
   886			AESENC T0, B0
   887			MOVOU (16*14)(ks), T0
   888	encLast3:
   889			AESENCLAST T0, B0
   890	
   891			MOVOU (ptx), T0
   892			PXOR T0, B0
   893			MOVOU B0, (ctx)
   894			// Hash the ciphertext block just written.
   895			PSHUFB BSWAP, B0
   896			PXOR ACC0, B0
   897	
   898			MOVOU T2, ACC0
   899			MOVOU T2, ACC1
   900			MOVOU (16*15)(pTbl), ACCM
   901	
   902			PSHUFD $78, B0, T0
   903			PXOR B0, T0
   904			PCLMULQDQ $0x00, B0, ACC0
   905			PCLMULQDQ $0x11, B0, ACC1
   906			PCLMULQDQ $0x00, T0, ACCM
   907	
   908			PXOR ACC0, ACCM
   909			PXOR ACC1, ACCM
   910			MOVOU ACCM, T0
   911			PSRLDQ $8, ACCM
   912			PSLLDQ $8, T0
   913			PXOR ACCM, ACC1
   914			PXOR T0, ACC0
   915	
   916			reduceRound(ACC0)
   917			reduceRound(ACC0)
   918			PXOR ACC1, ACC0
   919	
   920			LEAQ (16*1)(ptx), ptx
   921			LEAQ (16*1)(ctx), ctx
   922	
   923		JMP gcmAesEncSinglesLoop
   924	
   925	gcmAesEncTail:
   926		TESTQ ptxLen, ptxLen
   927		JE gcmAesEncDone
   928		// 1-15 bytes remain: produce one more keystream block from counter slot 0.
   929		MOVOU (8*16 + 0*16)(SP), B0
   930		AESENC B1, B0
   931		AESENC B2, B0
   932		AESENC B3, B0
   933		AESENC B4, B0
   934		AESENC B5, B0
   935		AESENC B6, B0
   936		AESENC B7, B0
   937		MOVOU (16*8)(ks), T0
   938		AESENC T0, B0
   939		MOVOU (16*9)(ks), T0
   940		AESENC T0, B0
   941		MOVOU (16*10)(ks), T0
   942		CMPQ NR, $12
   943		JB encLast4
   944		AESENC T0, B0
   945		MOVOU (16*11)(ks), T0
   946		AESENC T0, B0
   947		MOVOU (16*12)(ks), T0
   948		JE encLast4
   949		AESENC T0, B0
   950		MOVOU (16*13)(ks), T0
   951		AESENC T0, B0
   952		MOVOU (16*14)(ks), T0
   953	encLast4:
   954		AESENCLAST T0, B0
   955		MOVOU B0, T0	// T0 = keystream block
   956	
   957		LEAQ -1(ptx)(ptxLen*1), ptx	// address of the last plaintext byte
   958	
   959		MOVQ ptxLen, aluTMP
   960		SHLQ $4, aluTMP	// ptxLen*16: byte offset just past the wanted andMask entry
   961	
   962		LEAQ andMask<>(SB), aluCTR	// aluCTR is reused as a pointer; the counter value is not needed again
   963		MOVOU -16(aluCTR)(aluTMP*1), T1	// mask with the low ptxLen bytes set
   964	
   965		PXOR B0, B0
   966	ptxLoadLoop:
   967			PSLLDQ $1, B0
   968			PINSRB $0, (ptx), B0
   969			LEAQ -1(ptx), ptx
   970			DECQ ptxLen
   971		JNE ptxLoadLoop
   972	
   973		PXOR T0, B0
   974		PAND T1, B0	// clear the keystream bytes beyond the message
   975		MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   976	
   977		PSHUFB BSWAP, B0
   978		PXOR ACC0, B0
   979	
   980		MOVOU T2, ACC0
   981		MOVOU T2, ACC1
   982		MOVOU (16*15)(pTbl), ACCM
   983	
   984		PSHUFD $78, B0, T0
   985		PXOR B0, T0
   986		PCLMULQDQ $0x00, B0, ACC0
   987		PCLMULQDQ $0x11, B0, ACC1
   988		PCLMULQDQ $0x00, T0, ACCM
   989	
   990		PXOR ACC0, ACCM
   991		PXOR ACC1, ACCM
   992		MOVOU ACCM, T0
   993		PSRLDQ $8, ACCM
   994		PSLLDQ $8, T0
   995		PXOR ACCM, ACC1
   996		PXOR T0, ACC0
   997	
   998		reduceRound(ACC0)
   999		reduceRound(ACC0)
  1000		PXOR ACC1, ACC0
  1001	
  1002	gcmAesEncDone:
  1003		MOVOU ACC0, (tPtr)	// write the updated hash state back to T
  1004		RET
  1005	#undef increment
  1006	
  1007	// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
  1008	TEXT ·gcmAesDec(SB),0,$128-96
  1009	#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
  1010	#define combinedDecRound(i) \
  1011		MOVOU (16*i)(ks), T0;\
  1012		AESENC T0, B0;\
  1013		AESENC T0, B1;\
  1014		AESENC T0, B2;\
  1015		AESENC T0, B3;\
  1016		MOVOU (16*(i*2))(pTbl), T1;\
  1017		MOVOU T1, T2;\
  1018		AESENC T0, B4;\
  1019		AESENC T0, B5;\
  1020		AESENC T0, B6;\
  1021		AESENC T0, B7;\
  1022		MOVOU (16*i)(ctx), T0;\
  1023		PSHUFB BSWAP, T0;\
  1024		PCLMULQDQ $0x00, T0, T1;\
  1025		PXOR T1, ACC0;\
  1026		PSHUFD $78, T0, T1;\
  1027		PCLMULQDQ $0x11, T0, T2;\
  1028		PXOR T1, T0;\
  1029		PXOR T2, ACC1;\
  1030		MOVOU (16*(i*2+1))(pTbl), T2;\
  1031		PCLMULQDQ $0x00, T2, T0;\
  1032		PXOR T0, ACCM
  1033		// Decrypts src into dst and hashes the ciphertext (src) into the state at T. Register names and the aesRound/aesRnd/aesRndLast/reduceRound macros are the ones defined earlier in this file; here ptx points at dst and ctx at src. The 128-byte frame holds eight counter blocks already XORed with round key 0.
  1034		MOVQ productTable+0(FP), pTbl
  1035		MOVQ dst+8(FP), ptx
  1036		MOVQ src_base+32(FP), ctx
  1037		MOVQ src_len+40(FP), ptxLen
  1038		MOVQ ctr+56(FP), ctrPtr
  1039		MOVQ T+64(FP), tPtr
  1040		MOVQ ks_base+72(FP), ks
  1041		MOVQ ks_len+80(FP), NR
  1042	
  1043		SHRQ $2, NR	// NR = len(ks)/4 - 1; compared with 12 to pick the 10/12/14-round path
  1044		DECQ NR
  1045	
  1046		MOVOU bswapMask<>(SB), BSWAP
  1047		MOVOU gcmPoly<>(SB), POLY
  1048	
  1049		MOVOU (tPtr), ACC0	// resume from the hash state stored in T
  1050		PXOR ACC1, ACC1
  1051		PXOR ACCM, ACCM
  1052		MOVOU (ctrPtr), B0
  1053		MOVL (3*4)(ctrPtr), aluCTR	// last 32-bit word of the counter block
  1054		MOVOU (ks), T0
  1055		MOVL (3*4)(ks), aluK	// matching word of round key 0
  1056		BSWAPL aluCTR
  1057		BSWAPL aluK
  1058	
  1059		PXOR B0, T0	// T0 = counter block ^ round key 0
  1060		MOVOU T0, (0*16)(SP)
  1061		increment(0)	// slot 0 now holds counter+1, the first value actually used
  1062	
  1063		CMPQ ptxLen, $128
  1064		JB gcmAesDecSingles
  1065		// At least eight blocks: prepare counter slots 1-7.
  1066		MOVOU T0, (1*16)(SP)
  1067		increment(1)
  1068		MOVOU T0, (2*16)(SP)
  1069		increment(2)
  1070		MOVOU T0, (3*16)(SP)
  1071		increment(3)
  1072		MOVOU T0, (4*16)(SP)
  1073		increment(4)
  1074		MOVOU T0, (5*16)(SP)
  1075		increment(5)
  1076		MOVOU T0, (6*16)(SP)
  1077		increment(6)
  1078		MOVOU T0, (7*16)(SP)
  1079		increment(7)
  1080	
  1081	gcmAesDecOctetsLoop:
  1082	
  1083			CMPQ ptxLen, $128
  1084			JB gcmAesDecEndOctets
  1085			SUBQ $128, ptxLen
  1086	
  1087			MOVOU (0*16)(SP), B0
  1088			MOVOU (1*16)(SP), B1
  1089			MOVOU (2*16)(SP), B2
  1090			MOVOU (3*16)(SP), B3
  1091			MOVOU (4*16)(SP), B4
  1092			MOVOU (5*16)(SP), B5
  1093			MOVOU (6*16)(SP), B6
  1094			MOVOU (7*16)(SP), B7
  1095			// Hash ciphertext block 0 of this batch: byte-reverse it, fold in the running hash, multiply by table pair 0.
  1096			MOVOU (16*0)(ctx), T0
  1097			PSHUFB BSWAP, T0
  1098			PXOR ACC0, T0
  1099			PSHUFD $78, T0, T1
  1100			PXOR T0, T1
  1101	
  1102			MOVOU (16*0)(pTbl), ACC0
  1103			MOVOU (16*1)(pTbl), ACCM
  1104			MOVOU ACC0, ACC1
  1105	
  1106			PCLMULQDQ $0x00, T1, ACCM
  1107			PCLMULQDQ $0x00, T0, ACC0
  1108			PCLMULQDQ $0x11, T0, ACC1
  1109			// Each combinedDecRound(i) does AES round i on all eight counter blocks and multiplies ciphertext block i by table pair i.
  1110			combinedDecRound(1)
  1111			increment(0)
  1112			combinedDecRound(2)
  1113			increment(1)
  1114			combinedDecRound(3)
  1115			increment(2)
  1116			combinedDecRound(4)
  1117			increment(3)
  1118			combinedDecRound(5)
  1119			increment(4)
  1120			combinedDecRound(6)
  1121			increment(5)
  1122			combinedDecRound(7)
  1123			increment(6)
  1124	
  1125			aesRound(8)
  1126			increment(7)
  1127			// Combine the accumulated products, then reduce; the reduction is interleaved with AES round 9.
  1128			PXOR ACC0, ACCM
  1129			PXOR ACC1, ACCM
  1130			MOVOU ACCM, T0
  1131			PSRLDQ $8, ACCM
  1132			PSLLDQ $8, T0
  1133			PXOR ACCM, ACC1
  1134			PXOR T0, ACC0
  1135	
  1136			reduceRound(ACC0)
  1137			aesRound(9)
  1138	
  1139			reduceRound(ACC0)
  1140			PXOR ACC1, ACC0
  1141	
  1142			MOVOU (16*10)(ks), T0
  1143			CMPQ NR, $12	// these flags feed both the JB here and the JE below
  1144			JB decLast1
  1145			aesRnd(T0)
  1146			aesRound(11)
  1147			MOVOU (16*12)(ks), T0
  1148			JE decLast1
  1149			aesRnd(T0)
  1150			aesRound(13)
  1151			MOVOU (16*14)(ks), T0
  1152	decLast1:
  1153			aesRndLast(T0)
  1154			// XOR the keystream with the eight ciphertext blocks.
  1155			MOVOU (16*0)(ctx), T0
  1156			PXOR T0, B0
  1157			MOVOU (16*1)(ctx), T0
  1158			PXOR T0, B1
  1159			MOVOU (16*2)(ctx), T0
  1160			PXOR T0, B2
  1161			MOVOU (16*3)(ctx), T0
  1162			PXOR T0, B3
  1163			MOVOU (16*4)(ctx), T0
  1164			PXOR T0, B4
  1165			MOVOU (16*5)(ctx), T0
  1166			PXOR T0, B5
  1167			MOVOU (16*6)(ctx), T0
  1168			PXOR T0, B6
  1169			MOVOU (16*7)(ctx), T0
  1170			PXOR T0, B7
  1171			// Store the eight plaintext blocks.
  1172			MOVOU B0, (16*0)(ptx)
  1173			MOVOU B1, (16*1)(ptx)
  1174			MOVOU B2, (16*2)(ptx)
  1175			MOVOU B3, (16*3)(ptx)
  1176			MOVOU B4, (16*4)(ptx)
  1177			MOVOU B5, (16*5)(ptx)
  1178			MOVOU B6, (16*6)(ptx)
  1179			MOVOU B7, (16*7)(ptx)
  1180	
  1181			LEAQ 128(ptx), ptx
  1182			LEAQ 128(ctx), ctx
  1183	
  1184			JMP gcmAesDecOctetsLoop
  1185	
  1186	gcmAesDecEndOctets:
  1187	
  1188		SUBQ $7, aluCTR	// only counter slot 0 is used from here on; rewind so the next increment(0) yields slot 0's value plus one
  1189	
  1190	gcmAesDecSingles:
  1191		// Keep round keys 1-7 resident in B1-B7 for the one-block-at-a-time path.
  1192		MOVOU (16*1)(ks), B1
  1193		MOVOU (16*2)(ks), B2
  1194		MOVOU (16*3)(ks), B3
  1195		MOVOU (16*4)(ks), B4
  1196		MOVOU (16*5)(ks), B5
  1197		MOVOU (16*6)(ks), B6
  1198		MOVOU (16*7)(ks), B7
  1199	
  1200		MOVOU (16*14)(pTbl), T2	// table entry 14, used for every single-block multiply in the loop
  1201	
  1202	gcmAesDecSinglesLoop:
  1203	
  1204			CMPQ ptxLen, $16
  1205			JB gcmAesDecTail
  1206			SUBQ $16, ptxLen
  1207	
  1208			MOVOU (ctx), B0
  1209			MOVOU B0, T1	// keep the raw ciphertext for the XOR after the AES rounds
  1210			PSHUFB BSWAP, B0
  1211			PXOR ACC0, B0
  1212	
  1213			MOVOU T2, ACC0
  1214			MOVOU T2, ACC1
  1215			MOVOU (16*15)(pTbl), ACCM
  1216	
  1217			PCLMULQDQ $0x00, B0, ACC0
  1218			PCLMULQDQ $0x11, B0, ACC1
  1219			PSHUFD $78, B0, T0
  1220			PXOR B0, T0
  1221			PCLMULQDQ $0x00, T0, ACCM
  1222	
  1223			PXOR ACC0, ACCM
  1224			PXOR ACC1, ACCM
  1225			MOVOU ACCM, T0
  1226			PSRLDQ $8, ACCM
  1227			PSLLDQ $8, T0
  1228			PXOR ACCM, ACC1
  1229			PXOR T0, ACC0
  1230	
  1231			reduceRound(ACC0)
  1232			reduceRound(ACC0)
  1233			PXOR ACC1, ACC0
  1234			// Generate the keystream block from counter slot 0.
  1235			MOVOU (0*16)(SP), B0
  1236			increment(0)
  1237			AESENC B1, B0
  1238			AESENC B2, B0
  1239			AESENC B3, B0
  1240			AESENC B4, B0
  1241			AESENC B5, B0
  1242			AESENC B6, B0
  1243			AESENC B7, B0
  1244			MOVOU (16*8)(ks), T0
  1245			AESENC T0, B0
  1246			MOVOU (16*9)(ks), T0
  1247			AESENC T0, B0
  1248			MOVOU (16*10)(ks), T0
  1249			CMPQ NR, $12
  1250			JB decLast2
  1251			AESENC T0, B0
  1252			MOVOU (16*11)(ks), T0
  1253			AESENC T0, B0
  1254			MOVOU (16*12)(ks), T0
  1255			JE decLast2
  1256			AESENC T0, B0
  1257			MOVOU (16*13)(ks), T0
  1258			AESENC T0, B0
  1259			MOVOU (16*14)(ks), T0
  1260	decLast2:
  1261			AESENCLAST T0, B0
  1262	
  1263			PXOR T1, B0	// plaintext = keystream ^ ciphertext
  1264			MOVOU B0, (ptx)
  1265	
  1266			LEAQ (16*1)(ptx), ptx
  1267			LEAQ (16*1)(ctx), ctx
  1268	
  1269		JMP gcmAesDecSinglesLoop
  1270	
  1271	gcmAesDecTail:
  1272	
  1273		TESTQ ptxLen, ptxLen
  1274		JE gcmAesDecDone
  1275		// 1-15 bytes remain: fetch the andMask entry whose low ptxLen bytes are set.
  1276		MOVQ ptxLen, aluTMP
  1277		SHLQ $4, aluTMP
  1278		LEAQ andMask<>(SB), aluCTR	// aluCTR is reused as a pointer; increment(0) below still runs but its result is not read again
  1279		MOVOU -16(aluCTR)(aluTMP*1), T1
  1280	
  1281		MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1282		PAND T1, B0	// drop the bytes beyond the message
  1283	
  1284		MOVOU B0, T1	// keep the masked ciphertext for the final XOR
  1285		PSHUFB BSWAP, B0
  1286		PXOR ACC0, B0
  1287	
  1288		MOVOU (16*14)(pTbl), ACC0
  1289		MOVOU (16*15)(pTbl), ACCM
  1290		MOVOU ACC0, ACC1
  1291	
  1292		PCLMULQDQ $0x00, B0, ACC0
  1293		PCLMULQDQ $0x11, B0, ACC1
  1294		PSHUFD $78, B0, T0
  1295		PXOR B0, T0
  1296		PCLMULQDQ $0x00, T0, ACCM
  1297	
  1298		PXOR ACC0, ACCM
  1299		PXOR ACC1, ACCM
  1300		MOVOU ACCM, T0
  1301		PSRLDQ $8, ACCM
  1302		PSLLDQ $8, T0
  1303		PXOR ACCM, ACC1
  1304		PXOR T0, ACC0
  1305	
  1306		reduceRound(ACC0)
  1307		reduceRound(ACC0)
  1308		PXOR ACC1, ACC0
  1309	
  1310		MOVOU (0*16)(SP), B0
  1311		increment(0)
  1312		AESENC B1, B0
  1313		AESENC B2, B0
  1314		AESENC B3, B0
  1315		AESENC B4, B0
  1316		AESENC B5, B0
  1317		AESENC B6, B0
  1318		AESENC B7, B0
  1319		MOVOU (16*8)(ks), T0
  1320		AESENC T0, B0
  1321		MOVOU (16*9)(ks), T0
  1322		AESENC T0, B0
  1323		MOVOU (16*10)(ks), T0
  1324		CMPQ NR, $12
  1325		JB decLast3
  1326		AESENC T0, B0
  1327		MOVOU (16*11)(ks), T0
  1328		AESENC T0, B0
  1329		MOVOU (16*12)(ks), T0
  1330		JE decLast3
  1331		AESENC T0, B0
  1332		MOVOU (16*13)(ks), T0
  1333		AESENC T0, B0
  1334		MOVOU (16*14)(ks), T0
  1335	decLast3:
  1336		AESENCLAST T0, B0
  1337		PXOR T1, B0
  1338		// Write exactly ptxLen plaintext bytes, one at a time, so nothing past the end of dst is touched.
  1339	ptxStoreLoop:
  1340			PEXTRB $0, B0, (ptx)
  1341			PSRLDQ $1, B0
  1342			LEAQ 1(ptx), ptx
  1343			DECQ ptxLen
  1344	
  1345		JNE ptxStoreLoop
  1346	
  1347	gcmAesDecDone:
  1348	
  1349		MOVOU ACC0, (tPtr)	// write the updated hash state back to T
  1350		RET

View as plain text