...
Run Format

Text file src/crypto/aes/gcm_amd64.s

Documentation: crypto/aes

     1// Copyright 2015 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     6// The implementation uses some of the optimizations described in:
     7// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10//     Hardware
    11
    12#include "textflag.h"
    13
    // Symbolic names for the XMM registers used by the routines in this file.
      // B0-B7 hold data blocks: AES state while encrypting, or input blocks
      // that are being folded into the hash.
    14#define B0 X0
    15#define B1 X1
    16#define B2 X2
    17#define B3 X3
    18#define B4 X4
    19#define B5 X5
    20#define B6 X6
    21#define B7 X7
    22
      // Accumulators for the three partial products of the carry-less multiply:
      // ACC0 collects the low-half products (PCLMULQDQ $0x00), ACC1 the
      // high-half products (PCLMULQDQ $0x11) and ACCM the middle products.
    23#define ACC0 X8
    24#define ACC1 X9
    25#define ACCM X10
    26
      // T0-T2 are temporaries. POLY and BSWAP are loaded from gcmPoly and
      // bswapMask by each routine before use.
    27#define T0 X11
    28#define T1 X12
    29#define T2 X13
    30#define POLY X14
    31#define BSWAP X15
    32
    // bswapMask is a PSHUFB control mask whose selector byte i equals 15-i,
      // so shuffling with it reverses the byte order of a 16-byte register.
    33DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35
      // gcmPoly is the 128-bit constant (low qword 1, high qword 0xc2 << 56)
      // used by the reduction steps and by the "H * 2" step in gcmAesInit.
    36DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38
      // andMask holds 15 entries of 16 bytes. The entry at offset 16*(n-1) has
      // its low n bytes set to 0xff and the rest zero (n = 1..15). The tail
      // code indexes it as andMask - 16 + 16*n to keep only the n valid bytes
      // of a partial block.
    39DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69
      // All three tables are file-local, read-only and contain no pointers.
      // Sizes: 16, 16 and 15*16 = 240 bytes.
    70GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73
    74// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
      //
      // gcmAesFinish folds a final length block into the hash state at *T,
      // byte-swaps the result, XORs it with *tagMask and writes it back to *T.
      //   productTable: only entries 14 and 15 (byte offsets 224 and 240) are read.
      //   tagMask:      16 bytes XORed into the final value.
      //   T:            16-byte hash state, read on entry and overwritten on exit.
      //   pLen, dLen:   byte counts (presumably plaintext and additional-data
      //                 lengths - confirm against the caller); both are
      //                 multiplied by 8 before being hashed.
    75TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    76#define pTbl DI
    77#define tMsk SI
    78#define tPtr DX
    79#define plen AX
    80#define dlen CX
    81
    82	MOVQ productTable+0(FP), pTbl
    83	MOVQ tagMask+8(FP), tMsk
    84	MOVQ T+16(FP), tPtr
    85	MOVQ pLen+24(FP), plen
    86	MOVQ dLen+32(FP), dlen
    87
      	// ACC0 = current hash state, T2 = tag mask (kept until the end).
    88	MOVOU (tPtr), ACC0
    89	MOVOU (tMsk), T2
    90
    91	MOVOU bswapMask<>(SB), BSWAP
    92	MOVOU gcmPoly<>(SB), POLY
    93
      	// Convert both byte counts to bit counts.
    94	SHLQ $3, plen
    95	SHLQ $3, dlen
    96
      	// Build the length block: plen in the low qword, dlen in the high qword.
    97	MOVQ plen, B0
    98	PINSRQ $1, dlen, B0
    99
      	// Fold the current hash state into the length block.
   100	PXOR ACC0, B0
   101
      	// Multiply B0 by table entry 14, using entry 15 (the pre-computed XOR
      	// of its halves) for the middle product.
   102	MOVOU (16*14)(pTbl), ACC0
   103	MOVOU (16*15)(pTbl), ACCM
   104	MOVOU ACC0, ACC1
   105
   106	PCLMULQDQ $0x00, B0, ACC0
   107	PCLMULQDQ $0x11, B0, ACC1
   108	PSHUFD $78, B0, T0
   109	PXOR B0, T0
   110	PCLMULQDQ $0x00, T0, ACCM
   111
      	// Combine the middle product into the low (ACC0) and high (ACC1)
      	// halves of the 256-bit result.
   112	PXOR ACC0, ACCM
   113	PXOR ACC1, ACCM
   114	MOVOU ACCM, T0
   115	PSRLDQ $8, ACCM
   116	PSLLDQ $8, T0
   117	PXOR ACCM, ACC1
   118	PXOR T0, ACC0
   119
      	// First reduction step with POLY.
   120	MOVOU POLY, T0
   121	PCLMULQDQ $0x01, ACC0, T0
   122	PSHUFD $78, ACC0, ACC0
   123	PXOR T0, ACC0
   124
      	// Second reduction step with POLY.
   125	MOVOU POLY, T0
   126	PCLMULQDQ $0x01, ACC0, T0
   127	PSHUFD $78, ACC0, ACC0
   128	PXOR T0, ACC0
   129
   130	PXOR ACC1, ACC0
   131
      	// Reverse the byte order, apply the tag mask and store the result.
   132	PSHUFB BSWAP, ACC0
   133	PXOR T2, ACC0
   134	MOVOU ACC0, (tPtr)
   135
   136	RET
   137#undef pTbl
   138#undef tMsk
   139#undef tPtr
   140#undef plen
   141#undef dlen
   142
   143// func gcmAesInit(productTable *[256]byte, ks []uint32)
      //
      // gcmAesInit derives the hash key H by running the AES rounds from ks
      // over a block that starts as round key 0 (i.e. encrypting an all-zero
      // block), then fills productTable with eight 32-byte pairs. Each pair is
      // a value followed by the XOR of that value's two 64-bit halves (the
      // Karatsuba pre-computation). Pair 14/15 holds the doubled, byte-swapped
      // H; every loop iteration multiplies the previous value by it and stores
      // the product one pair lower, finishing at pair 0/1.
      //   productTable: 256-byte output table.
      //   ks:           expanded key as uint32 words; len(ks)/4 - 1 is used
      //                 as the round count.
   144TEXT ·gcmAesInit(SB),NOSPLIT,$0
   145#define dst DI
   146#define KS SI
   147#define NR DX
   148
   149	MOVQ productTable+0(FP), dst
   150	MOVQ ks_base+8(FP), KS
   151	MOVQ ks_len+16(FP), NR
   152
      	// NR = len(ks)/4 - 1: number of 16-byte round keys minus one.
   153	SHRQ $2, NR
   154	DECQ NR
   155
   156	MOVOU bswapMask<>(SB), BSWAP
   157	MOVOU gcmPoly<>(SB), POLY
   158
   159	// Encrypt block 0, with the AES key to generate the hash key H
   160	MOVOU (16*0)(KS), B0
   161	MOVOU (16*1)(KS), T0
   162	AESENC T0, B0
   163	MOVOU (16*2)(KS), T0
   164	AESENC T0, B0
   165	MOVOU (16*3)(KS), T0
   166	AESENC T0, B0
   167	MOVOU (16*4)(KS), T0
   168	AESENC T0, B0
   169	MOVOU (16*5)(KS), T0
   170	AESENC T0, B0
   171	MOVOU (16*6)(KS), T0
   172	AESENC T0, B0
   173	MOVOU (16*7)(KS), T0
   174	AESENC T0, B0
   175	MOVOU (16*8)(KS), T0
   176	AESENC T0, B0
   177	MOVOU (16*9)(KS), T0
   178	AESENC T0, B0
   179	MOVOU (16*10)(KS), T0
      	// NR < 12: round key 10 is the last one.
   180	CMPQ NR, $12
   181	JB initEncLast
   182	AESENC T0, B0
   183	MOVOU (16*11)(KS), T0
   184	AESENC T0, B0
   185	MOVOU (16*12)(KS), T0
      	// NR == 12: round key 12 is the last one. This JE relies on the flags
      	// from the CMPQ above still being intact.
   186	JE initEncLast
   187	AESENC T0, B0
   188	MOVOU (16*13)(KS), T0
   189	AESENC T0, B0
   190	MOVOU (16*14)(KS), T0
   191initEncLast:
   192	AESENCLAST T0, B0
   193
   194	PSHUFB BSWAP, B0
   195	// H * 2
      	// T0 = POLY if the top bit of B0 is set, else zero (conditional
      	// reduction); T1 carries each dword's top bit into the next dword.
   196	PSHUFD $0xff, B0, T0
   197	MOVOU B0, T1
   198	PSRAL $31, T0
   199	PAND POLY, T0
   200	PSRLL $31, T1
   201	PSLLDQ $4, T1
   202	PSLLL $1, B0
   203	PXOR T0, B0
   204	PXOR T1, B0
   205	// Karatsuba pre-computations
      	// Entry 14 = B0, entry 15 = B0 XOR (B0 with its halves swapped).
   206	MOVOU B0, (16*14)(dst)
   207	PSHUFD $78, B0, B1
   208	PXOR B0, B1
   209	MOVOU B1, (16*15)(dst)
   210
      	// B2/B3 track the most recently computed power and its pre-computation.
   211	MOVOU B0, B2
   212	MOVOU B1, B3
   213	// Now prepare powers of H and pre-computations for them
   214	MOVQ $7, AX
   215
   216initLoop:
      		// T0/T1/T2 = low/high/middle products of B2 by B0.
   217		MOVOU B2, T0
   218		MOVOU B2, T1
   219		MOVOU B3, T2
   220		PCLMULQDQ $0x00, B0, T0
   221		PCLMULQDQ $0x11, B0, T1
   222		PCLMULQDQ $0x00, B1, T2
   223
      		// Fold the middle product into the low and high halves.
   224		PXOR T0, T2
   225		PXOR T1, T2
   226		MOVOU T2, B4
   227		PSLLDQ $8, B4
   228		PSRLDQ $8, T2
   229		PXOR B4, T0
   230		PXOR T2, T1
   231
      		// Two reduction steps with POLY; the reduced product ends up in B2.
   232		MOVOU POLY, B2
   233		PCLMULQDQ $0x01, T0, B2
   234		PSHUFD $78, T0, T0
   235		PXOR B2, T0
   236		MOVOU POLY, B2
   237		PCLMULQDQ $0x01, T0, B2
   238		PSHUFD $78, T0, T0
   239		PXOR T0, B2
   240		PXOR T1, B2
   241
      		// Store the new power and its pre-computation one pair below the
      		// previous one (dst moves down by 32 bytes per iteration).
   242		MOVOU B2, (16*12)(dst)
   243		PSHUFD $78, B2, B3
   244		PXOR B2, B3
   245		MOVOU B3, (16*13)(dst)
   246
      		// LEAQ does not touch the flags, so JNE tests the result of DECQ.
   247		DECQ AX
   248		LEAQ (-16*2)(dst), dst
   249	JNE initLoop
   250
   251	RET
   252#undef NR
   253#undef KS
   254#undef dst
   255
   256// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
      //
      // gcmAesData hashes data with the key powers in productTable and stores
      // the 16-byte hash state to *T. The state starts from zero; *T is not
      // read on entry. Input is consumed in three stages: 128 bytes at a time
      // while at least 128 remain, then 16 bytes at a time, then a final
      // partial block that is zero-padded. A length of exactly 13 bytes takes
      // a dedicated fast path.
      //   productTable: table written by gcmAesInit.
      //   data:         bytes to hash (may be empty, in which case zero is stored).
      //   T:            16-byte output.
   257TEXT ·gcmAesData(SB),NOSPLIT,$0
   258#define pTbl DI
   259#define aut SI
   260#define tPtr CX
   261#define autLen DX
   262
      // reduceRound(a): one reduction step of register a with POLY; clobbers T0.
      // mulRoundAAD(X, i): multiply block X by table pair i and XOR the low,
      // high and middle products into ACC0, ACC1 and ACCM; clobbers T1, T2
      // and X. Neither macro is #undef'd at the end of this routine.
   263#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   264#define mulRoundAAD(X ,i) \
   265	MOVOU (16*(i*2))(pTbl), T1;\
   266	MOVOU T1, T2;\
   267	PCLMULQDQ $0x00, X, T1;\
   268	PXOR T1, ACC0;\
   269	PCLMULQDQ $0x11, X, T2;\
   270	PXOR T2, ACC1;\
   271	PSHUFD $78, X, T1;\
   272	PXOR T1, X;\
   273	MOVOU (16*(i*2+1))(pTbl), T1;\
   274	PCLMULQDQ $0x00, X, T1;\
   275	PXOR T1, ACCM
   276
      	// data is a slice (base, len, cap), so T sits at offset 32.
   277	MOVQ productTable+0(FP), pTbl
   278	MOVQ data_base+8(FP), aut
   279	MOVQ data_len+16(FP), autLen
   280	MOVQ T+32(FP), tPtr
   281
      	// Hash state starts at zero.
   282	PXOR ACC0, ACC0
   283	MOVOU bswapMask<>(SB), BSWAP
   284	MOVOU gcmPoly<>(SB), POLY
   285
      	// Empty input: store the zero state and return.
   286	TESTQ autLen, autLen
   287	JEQ dataBail
   288
   289	CMPQ autLen, $13	// optimize the TLS case
   290	JE dataTLS
   291	CMPQ autLen, $128
   292	JB startSinglesLoop
   293	JMP dataOctaLoop
   294
   295dataTLS:
      	// Exactly 13 bytes: load 8 + 4 + 1 bytes into a zeroed register and
      	// go straight to the single-block multiply with nothing left over.
   296	MOVOU (16*14)(pTbl), T1
   297	MOVOU (16*15)(pTbl), T2
   298	PXOR B0, B0
   299	MOVQ (aut), B0
   300	PINSRD $2, 8(aut), B0
   301	PINSRB $12, 12(aut), B0
   302	XORQ autLen, autLen
   303	JMP dataMul
   304
   305dataOctaLoop:
      		// Process eight blocks per iteration while >= 128 bytes remain.
   306		CMPQ autLen, $128
   307		JB startSinglesLoop
   308		SUBQ $128, autLen
   309
   310		MOVOU (16*0)(aut), X0
   311		MOVOU (16*1)(aut), X1
   312		MOVOU (16*2)(aut), X2
   313		MOVOU (16*3)(aut), X3
   314		MOVOU (16*4)(aut), X4
   315		MOVOU (16*5)(aut), X5
   316		MOVOU (16*6)(aut), X6
   317		MOVOU (16*7)(aut), X7
   318		LEAQ (16*8)(aut), aut
   319		PSHUFB BSWAP, X0
   320		PSHUFB BSWAP, X1
   321		PSHUFB BSWAP, X2
   322		PSHUFB BSWAP, X3
   323		PSHUFB BSWAP, X4
   324		PSHUFB BSWAP, X5
   325		PSHUFB BSWAP, X6
   326		PSHUFB BSWAP, X7
      		// The running hash is folded into the first block only.
   327		PXOR ACC0, X0
   328
      		// Block 0 is multiplied by table pair 0; this also initialises the
      		// three accumulators for the mulRoundAAD calls below.
   329		MOVOU (16*0)(pTbl), ACC0
   330		MOVOU (16*1)(pTbl), ACCM
   331		MOVOU ACC0, ACC1
   332		PSHUFD $78, X0, T1
   333		PXOR X0, T1
   334		PCLMULQDQ $0x00, X0, ACC0
   335		PCLMULQDQ $0x11, X0, ACC1
   336		PCLMULQDQ $0x00, T1, ACCM
   337
   338		mulRoundAAD(X1, 1)
   339		mulRoundAAD(X2, 2)
   340		mulRoundAAD(X3, 3)
   341		mulRoundAAD(X4, 4)
   342		mulRoundAAD(X5, 5)
   343		mulRoundAAD(X6, 6)
   344		mulRoundAAD(X7, 7)
   345
      		// Combine the accumulators and reduce once for all eight blocks.
   346		PXOR ACC0, ACCM
   347		PXOR ACC1, ACCM
   348		MOVOU ACCM, T0
   349		PSRLDQ $8, ACCM
   350		PSLLDQ $8, T0
   351		PXOR ACCM, ACC1
   352		PXOR T0, ACC0
   353		reduceRound(ACC0)
   354		reduceRound(ACC0)
   355		PXOR ACC1, ACC0
   356	JMP dataOctaLoop
   357
   358startSinglesLoop:
      	// T1/T2 hold table pair 14/15 for the whole single-block phase.
   359	MOVOU (16*14)(pTbl), T1
   360	MOVOU (16*15)(pTbl), T2
   361
   362dataSinglesLoop:
   363
   364		CMPQ autLen, $16
   365		JB dataEnd
   366		SUBQ $16, autLen
   367
   368		MOVOU (aut), B0
      // dataMul is also entered from dataTLS and from the partial-block path,
      // both with B0 already loaded and autLen == 0.
   369dataMul:
   370		PSHUFB BSWAP, B0
   371		PXOR ACC0, B0
   372
   373		MOVOU T1, ACC0
   374		MOVOU T2, ACCM
   375		MOVOU T1, ACC1
   376
   377		PSHUFD $78, B0, T0
   378		PXOR B0, T0
   379		PCLMULQDQ $0x00, B0, ACC0
   380		PCLMULQDQ $0x11, B0, ACC1
   381		PCLMULQDQ $0x00, T0, ACCM
   382
   383		PXOR ACC0, ACCM
   384		PXOR ACC1, ACCM
   385		MOVOU ACCM, T0
   386		PSRLDQ $8, ACCM
   387		PSLLDQ $8, T0
   388		PXOR ACCM, ACC1
   389		PXOR T0, ACC0
   390
   391		MOVOU POLY, T0
   392		PCLMULQDQ $0x01, ACC0, T0
   393		PSHUFD $78, ACC0, ACC0
   394		PXOR T0, ACC0
   395
   396		MOVOU POLY, T0
   397		PCLMULQDQ $0x01, ACC0, T0
   398		PSHUFD $78, ACC0, ACC0
   399		PXOR T0, ACC0
   400		PXOR ACC1, ACC0
   401
   402		LEAQ 16(aut), aut
   403
   404	JMP dataSinglesLoop
   405
   406dataEnd:
   407
   408	TESTQ autLen, autLen
   409	JEQ dataBail
   410
      	// 1..15 bytes remain: point at the last byte and build a zero-padded
      	// block by shifting B0 up one byte and inserting at byte 0, walking
      	// backwards through the input.
   411	PXOR B0, B0
   412	LEAQ -1(aut)(autLen*1), aut
   413
   414dataLoadLoop:
   415
   416		PSLLDQ $1, B0
   417		PINSRB $0, (aut), B0
   418
   419		LEAQ -1(aut), aut
   420		DECQ autLen
   421		JNE dataLoadLoop
   422
      	// autLen is now zero, so after the multiply the loop exits via dataEnd.
   423	JMP dataMul
   424
   425dataBail:
   426	MOVOU ACC0, (tPtr)
   427	RET
   428#undef pTbl
   429#undef aut
   430#undef tPtr
   431#undef autLen
   432
   433// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
      //
      // gcmAesEnc encrypts src into dst in counter mode and folds the produced
      // ciphertext into the hash state at *T.
      //   productTable: table written by gcmAesInit.
      //   dst:          ciphertext output. The final partial block is stored
      //                 as a full 16 bytes, so dst needs room beyond
      //                 len(src) (see the comment at that store).
      //   src:          plaintext input.
      //   ctr:          16-byte counter block; only read. The first block is
      //                 encrypted with the last 32-bit word (big-endian)
      //                 incremented by one.
      //   T:            hash state, read on entry and written on exit.
      //   ks:           expanded key; len(ks)/4 - 1 is used as the round count.
      //
      // Stack frame (256 bytes):
      //   (0..7)*16(SP)           byte-swapped ciphertext of the previous
      //                           eight blocks, waiting to be hashed
      //   (8*16 + (0..7)*16)(SP)  counter blocks already XORed with round key 0
   434TEXT ·gcmAesEnc(SB),0,$256-96
   435#define pTbl DI
   436#define ctx DX
   437#define ctrPtr CX
   438#define ptx SI
   439#define ks AX
   440#define tPtr R8
   441#define ptxLen R9
   442#define aluCTR R10
   443#define aluTMP R11
   444#define aluK R12
   445#define NR R13
   446
      // increment(i):     bump the 32-bit counter in aluCTR and write it, XORed
      //                   with the last word of round key 0 and byte-swapped,
      //                   into the last 4 bytes of counter slot i.
      // aesRnd(k):        one AES round on B0-B7 with the round key in register k.
      // aesRound(i):      same, loading round key i from ks into T0 first.
      // aesRndLast(k):    final AES round on B0-B7.
      // combinedRound(i): AES round i on B0-B7 interleaved with multiplying the
      //                   saved block i at (16*i)(SP) by table pair i.
      // mulRound(i):      the multiply half of combinedRound only.
   447#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   448#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   449#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   450#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
   451#define combinedRound(i) \
   452	MOVOU (16*i)(ks), T0;\
   453	AESENC T0, B0;\
   454	AESENC T0, B1;\
   455	AESENC T0, B2;\
   456	AESENC T0, B3;\
   457	 MOVOU (16*(i*2))(pTbl), T1;\
   458	 MOVOU T1, T2;\
   459	AESENC T0, B4;\
   460	AESENC T0, B5;\
   461	AESENC T0, B6;\
   462	AESENC T0, B7;\
   463	 MOVOU (16*i)(SP), T0;\
   464	 PCLMULQDQ $0x00, T0, T1;\
   465	 PXOR T1, ACC0;\
   466	 PSHUFD $78, T0, T1;\
   467	 PCLMULQDQ $0x11, T0, T2;\
   468	 PXOR T1, T0;\
   469	 PXOR T2, ACC1;\
   470	 MOVOU (16*(i*2+1))(pTbl), T2;\
   471	 PCLMULQDQ $0x00, T2, T0;\
   472	 PXOR T0, ACCM
   473#define mulRound(i) \
   474	MOVOU (16*i)(SP), T0;\
   475	MOVOU (16*(i*2))(pTbl), T1;\
   476	MOVOU T1, T2;\
   477	PCLMULQDQ $0x00, T0, T1;\
   478	PXOR T1, ACC0;\
   479	PCLMULQDQ $0x11, T0, T2;\
   480	PXOR T2, ACC1;\
   481	PSHUFD $78, T0, T1;\
   482	PXOR T1, T0;\
   483	MOVOU (16*(i*2+1))(pTbl), T1;\
   484	PCLMULQDQ $0x00, T0, T1;\
   485	PXOR T1, ACCM
   486
   487	MOVQ productTable+0(FP), pTbl
   488	MOVQ dst+8(FP), ctx
   489	MOVQ src_base+32(FP), ptx
   490	MOVQ src_len+40(FP), ptxLen
   491	MOVQ ctr+56(FP), ctrPtr
   492	MOVQ T+64(FP), tPtr
   493	MOVQ ks_base+72(FP), ks
   494	MOVQ ks_len+80(FP), NR
   495
      	// NR = len(ks)/4 - 1; compared with 12 below to pick the last round key.
   496	SHRQ $2, NR
   497	DECQ NR
   498
   499	MOVOU bswapMask<>(SB), BSWAP
   500	MOVOU gcmPoly<>(SB), POLY
   501
      	// ACC0 = incoming hash state. aluCTR = last counter word and
      	// aluK = last word of round key 0, both byte-swapped.
   502	MOVOU (tPtr), ACC0
   503	PXOR ACC1, ACC1
   504	PXOR ACCM, ACCM
   505	MOVOU (ctrPtr), B0
   506	MOVL (3*4)(ctrPtr), aluCTR
   507	MOVOU (ks), T0
   508	MOVL (3*4)(ks), aluK
   509	BSWAPL aluCTR
   510	BSWAPL aluK
   511
      	// T0 = counter block XOR round key 0. Slot 0 receives it and
      	// increment(0) then replaces its last word with counter+1.
   512	PXOR B0, T0
   513	MOVOU T0, (8*16 + 0*16)(SP)
   514	increment(0)
   515
   516	CMPQ ptxLen, $128
   517	JB gcmAesEncSingles
   518	SUBQ $128, ptxLen
   519
   520	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   521	MOVOU T0, (8*16 + 1*16)(SP)
   522	increment(1)
   523	MOVOU T0, (8*16 + 2*16)(SP)
   524	increment(2)
   525	MOVOU T0, (8*16 + 3*16)(SP)
   526	increment(3)
   527	MOVOU T0, (8*16 + 4*16)(SP)
   528	increment(4)
   529	MOVOU T0, (8*16 + 5*16)(SP)
   530	increment(5)
   531	MOVOU T0, (8*16 + 6*16)(SP)
   532	increment(6)
   533	MOVOU T0, (8*16 + 7*16)(SP)
   534	increment(7)
   535
   536	MOVOU (8*16 + 0*16)(SP), B0
   537	MOVOU (8*16 + 1*16)(SP), B1
   538	MOVOU (8*16 + 2*16)(SP), B2
   539	MOVOU (8*16 + 3*16)(SP), B3
   540	MOVOU (8*16 + 4*16)(SP), B4
   541	MOVOU (8*16 + 5*16)(SP), B5
   542	MOVOU (8*16 + 6*16)(SP), B6
   543	MOVOU (8*16 + 7*16)(SP), B7
   544
      	// First batch: nothing to hash yet, so the AES rounds are interleaved
      	// only with preparing the counters for the next batch.
   545	aesRound(1)
   546	increment(0)
   547	aesRound(2)
   548	increment(1)
   549	aesRound(3)
   550	increment(2)
   551	aesRound(4)
   552	increment(3)
   553	aesRound(5)
   554	increment(4)
   555	aesRound(6)
   556	increment(5)
   557	aesRound(7)
   558	increment(6)
   559	aesRound(8)
   560	increment(7)
   561	aesRound(9)
   562	MOVOU (16*10)(ks), T0
      	// NR < 12 ends at round key 10, NR == 12 at round key 12, otherwise 14.
      	// The JE below reuses the flags from this CMPQ.
   563	CMPQ NR, $12
   564	JB encLast1
   565	aesRnd(T0)
   566	aesRound(11)
   567	MOVOU (16*12)(ks), T0
   568	JE encLast1
   569	aesRnd(T0)
   570	aesRound(13)
   571	MOVOU (16*14)(ks), T0
   572encLast1:
   573	aesRndLast(T0)
   574
      	// XOR the keystream with the plaintext.
   575	MOVOU (16*0)(ptx), T0
   576	PXOR T0, B0
   577	MOVOU (16*1)(ptx), T0
   578	PXOR T0, B1
   579	MOVOU (16*2)(ptx), T0
   580	PXOR T0, B2
   581	MOVOU (16*3)(ptx), T0
   582	PXOR T0, B3
   583	MOVOU (16*4)(ptx), T0
   584	PXOR T0, B4
   585	MOVOU (16*5)(ptx), T0
   586	PXOR T0, B5
   587	MOVOU (16*6)(ptx), T0
   588	PXOR T0, B6
   589	MOVOU (16*7)(ptx), T0
   590	PXOR T0, B7
   591
      	// Store the ciphertext, then byte-swap each block for hashing. The
      	// running hash is folded into the first block only.
   592	MOVOU B0, (16*0)(ctx)
   593	PSHUFB BSWAP, B0
   594	PXOR ACC0, B0
   595	MOVOU B1, (16*1)(ctx)
   596	PSHUFB BSWAP, B1
   597	MOVOU B2, (16*2)(ctx)
   598	PSHUFB BSWAP, B2
   599	MOVOU B3, (16*3)(ctx)
   600	PSHUFB BSWAP, B3
   601	MOVOU B4, (16*4)(ctx)
   602	PSHUFB BSWAP, B4
   603	MOVOU B5, (16*5)(ctx)
   604	PSHUFB BSWAP, B5
   605	MOVOU B6, (16*6)(ctx)
   606	PSHUFB BSWAP, B6
   607	MOVOU B7, (16*7)(ctx)
   608	PSHUFB BSWAP, B7
   609
      	// Save the swapped blocks; they are hashed while the next batch is
      	// being encrypted (or in gcmAesEncOctetsEnd if this was the last batch).
   610	MOVOU B0, (16*0)(SP)
   611	MOVOU B1, (16*1)(SP)
   612	MOVOU B2, (16*2)(SP)
   613	MOVOU B3, (16*3)(SP)
   614	MOVOU B4, (16*4)(SP)
   615	MOVOU B5, (16*5)(SP)
   616	MOVOU B6, (16*6)(SP)
   617	MOVOU B7, (16*7)(SP)
   618
   619	LEAQ 128(ptx), ptx
   620	LEAQ 128(ctx), ctx
   621
   622gcmAesEncOctetsLoop:
   623
   624		CMPQ ptxLen, $128
   625		JB gcmAesEncOctetsEnd
   626		SUBQ $128, ptxLen
   627
      		// Load the eight prepared counter blocks.
   628		MOVOU (8*16 + 0*16)(SP), B0
   629		MOVOU (8*16 + 1*16)(SP), B1
   630		MOVOU (8*16 + 2*16)(SP), B2
   631		MOVOU (8*16 + 3*16)(SP), B3
   632		MOVOU (8*16 + 4*16)(SP), B4
   633		MOVOU (8*16 + 5*16)(SP), B5
   634		MOVOU (8*16 + 6*16)(SP), B6
   635		MOVOU (8*16 + 7*16)(SP), B7
   636
      		// Start hashing the previous batch: saved block 0 times table pair 0
      		// initialises the three accumulators.
   637		MOVOU (16*0)(SP), T0
   638		PSHUFD $78, T0, T1
   639		PXOR T0, T1
   640
   641		MOVOU (16*0)(pTbl), ACC0
   642		MOVOU (16*1)(pTbl), ACCM
   643		MOVOU ACC0, ACC1
   644
   645		PCLMULQDQ $0x00, T1, ACCM
   646		PCLMULQDQ $0x00, T0, ACC0
   647		PCLMULQDQ $0x11, T0, ACC1
   648
      		// AES rounds 1-7 interleaved with hashing saved blocks 1-7 and with
      		// preparing the counters for the following batch.
   649		combinedRound(1)
   650		increment(0)
   651		combinedRound(2)
   652		increment(1)
   653		combinedRound(3)
   654		increment(2)
   655		combinedRound(4)
   656		increment(3)
   657		combinedRound(5)
   658		increment(4)
   659		combinedRound(6)
   660		increment(5)
   661		combinedRound(7)
   662		increment(6)
   663
   664		aesRound(8)
   665		increment(7)
   666
      		// Combine the accumulators, then reduce; the two reduction steps are
      		// separated by AES round 9.
   667		PXOR ACC0, ACCM
   668		PXOR ACC1, ACCM
   669		MOVOU ACCM, T0
   670		PSRLDQ $8, ACCM
   671		PSLLDQ $8, T0
   672		PXOR ACCM, ACC1
   673		PXOR T0, ACC0
   674
   675		reduceRound(ACC0)
   676		aesRound(9)
   677
   678		reduceRound(ACC0)
   679		PXOR ACC1, ACC0
   680
   681		MOVOU (16*10)(ks), T0
   682		CMPQ NR, $12
   683		JB encLast2
   684		aesRnd(T0)
   685		aesRound(11)
   686		MOVOU (16*12)(ks), T0
   687		JE encLast2
   688		aesRnd(T0)
   689		aesRound(13)
   690		MOVOU (16*14)(ks), T0
   691encLast2:
   692		aesRndLast(T0)
   693
   694		MOVOU (16*0)(ptx), T0
   695		PXOR T0, B0
   696		MOVOU (16*1)(ptx), T0
   697		PXOR T0, B1
   698		MOVOU (16*2)(ptx), T0
   699		PXOR T0, B2
   700		MOVOU (16*3)(ptx), T0
   701		PXOR T0, B3
   702		MOVOU (16*4)(ptx), T0
   703		PXOR T0, B4
   704		MOVOU (16*5)(ptx), T0
   705		PXOR T0, B5
   706		MOVOU (16*6)(ptx), T0
   707		PXOR T0, B6
   708		MOVOU (16*7)(ptx), T0
   709		PXOR T0, B7
   710
   711		MOVOU B0, (16*0)(ctx)
   712		PSHUFB BSWAP, B0
   713		PXOR ACC0, B0
   714		MOVOU B1, (16*1)(ctx)
   715		PSHUFB BSWAP, B1
   716		MOVOU B2, (16*2)(ctx)
   717		PSHUFB BSWAP, B2
   718		MOVOU B3, (16*3)(ctx)
   719		PSHUFB BSWAP, B3
   720		MOVOU B4, (16*4)(ctx)
   721		PSHUFB BSWAP, B4
   722		MOVOU B5, (16*5)(ctx)
   723		PSHUFB BSWAP, B5
   724		MOVOU B6, (16*6)(ctx)
   725		PSHUFB BSWAP, B6
   726		MOVOU B7, (16*7)(ctx)
   727		PSHUFB BSWAP, B7
   728
   729		MOVOU B0, (16*0)(SP)
   730		MOVOU B1, (16*1)(SP)
   731		MOVOU B2, (16*2)(SP)
   732		MOVOU B3, (16*3)(SP)
   733		MOVOU B4, (16*4)(SP)
   734		MOVOU B5, (16*5)(SP)
   735		MOVOU B6, (16*6)(SP)
   736		MOVOU B7, (16*7)(SP)
   737
   738		LEAQ 128(ptx), ptx
   739		LEAQ 128(ctx), ctx
   740
   741		JMP gcmAesEncOctetsLoop
   742
   743gcmAesEncOctetsEnd:
   744
      	// Hash the last saved batch of eight ciphertext blocks.
   745	MOVOU (16*0)(SP), T0
   746	MOVOU (16*0)(pTbl), ACC0
   747	MOVOU (16*1)(pTbl), ACCM
   748	MOVOU ACC0, ACC1
   749	PSHUFD $78, T0, T1
   750	PXOR T0, T1
   751	PCLMULQDQ $0x00, T0, ACC0
   752	PCLMULQDQ $0x11, T0, ACC1
   753	PCLMULQDQ $0x00, T1, ACCM
   754
   755	mulRound(1)
   756	mulRound(2)
   757	mulRound(3)
   758	mulRound(4)
   759	mulRound(5)
   760	mulRound(6)
   761	mulRound(7)
   762
   763	PXOR ACC0, ACCM
   764	PXOR ACC1, ACCM
   765	MOVOU ACCM, T0
   766	PSRLDQ $8, ACCM
   767	PSLLDQ $8, T0
   768	PXOR ACCM, ACC1
   769	PXOR T0, ACC0
   770
   771	reduceRound(ACC0)
   772	reduceRound(ACC0)
   773	PXOR ACC1, ACC0
   774
   775	TESTQ ptxLen, ptxLen
   776	JE gcmAesEncDone
   777
      	// Eight counters were prepared ahead, but the single-block path only
      	// uses slot 0: rewind aluCTR so it matches the counter stored there.
   778	SUBQ $7, aluCTR
   779
   780gcmAesEncSingles:
   781
      	// Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the
      	// single-block loop and the tail.
   782	MOVOU (16*1)(ks), B1
   783	MOVOU (16*2)(ks), B2
   784	MOVOU (16*3)(ks), B3
   785	MOVOU (16*4)(ks), B4
   786	MOVOU (16*5)(ks), B5
   787	MOVOU (16*6)(ks), B6
   788	MOVOU (16*7)(ks), B7
   789
   790	MOVOU (16*14)(pTbl), T2
   791
   792gcmAesEncSinglesLoop:
   793
   794		CMPQ ptxLen, $16
   795		JB gcmAesEncTail
   796		SUBQ $16, ptxLen
   797
      		// Take the counter block from slot 0 and prepare the next one.
   798		MOVOU (8*16 + 0*16)(SP), B0
   799		increment(0)
   800
   801		AESENC B1, B0
   802		AESENC B2, B0
   803		AESENC B3, B0
   804		AESENC B4, B0
   805		AESENC B5, B0
   806		AESENC B6, B0
   807		AESENC B7, B0
   808		MOVOU (16*8)(ks), T0
   809		AESENC T0, B0
   810		MOVOU (16*9)(ks), T0
   811		AESENC T0, B0
   812		MOVOU (16*10)(ks), T0
   813		CMPQ NR, $12
   814		JB encLast3
   815		AESENC T0, B0
   816		MOVOU (16*11)(ks), T0
   817		AESENC T0, B0
   818		MOVOU (16*12)(ks), T0
   819		JE encLast3
   820		AESENC T0, B0
   821		MOVOU (16*13)(ks), T0
   822		AESENC T0, B0
   823		MOVOU (16*14)(ks), T0
   824encLast3:
   825		AESENCLAST T0, B0
   826
   827		MOVOU (ptx), T0
   828		PXOR T0, B0
   829		MOVOU B0, (ctx)
   830
      		// Hash the ciphertext block just written.
   831		PSHUFB BSWAP, B0
   832		PXOR ACC0, B0
   833
   834		MOVOU T2, ACC0
   835		MOVOU T2, ACC1
   836		MOVOU (16*15)(pTbl), ACCM
   837
   838		PSHUFD $78, B0, T0
   839		PXOR B0, T0
   840		PCLMULQDQ $0x00, B0, ACC0
   841		PCLMULQDQ $0x11, B0, ACC1
   842		PCLMULQDQ $0x00, T0, ACCM
   843
   844		PXOR ACC0, ACCM
   845		PXOR ACC1, ACCM
   846		MOVOU ACCM, T0
   847		PSRLDQ $8, ACCM
   848		PSLLDQ $8, T0
   849		PXOR ACCM, ACC1
   850		PXOR T0, ACC0
   851
   852		reduceRound(ACC0)
   853		reduceRound(ACC0)
   854		PXOR ACC1, ACC0
   855
   856		LEAQ (16*1)(ptx), ptx
   857		LEAQ (16*1)(ctx), ctx
   858
   859	JMP gcmAesEncSinglesLoop
   860
   861gcmAesEncTail:
   862	TESTQ ptxLen, ptxLen
   863	JE gcmAesEncDone
   864
      	// 1..15 bytes remain: generate one more keystream block into T0.
   865	MOVOU (8*16 + 0*16)(SP), B0
   866	AESENC B1, B0
   867	AESENC B2, B0
   868	AESENC B3, B0
   869	AESENC B4, B0
   870	AESENC B5, B0
   871	AESENC B6, B0
   872	AESENC B7, B0
   873	MOVOU (16*8)(ks), T0
   874	AESENC T0, B0
   875	MOVOU (16*9)(ks), T0
   876	AESENC T0, B0
   877	MOVOU (16*10)(ks), T0
   878	CMPQ NR, $12
   879	JB encLast4
   880	AESENC T0, B0
   881	MOVOU (16*11)(ks), T0
   882	AESENC T0, B0
   883	MOVOU (16*12)(ks), T0
   884	JE encLast4
   885	AESENC T0, B0
   886	MOVOU (16*13)(ks), T0
   887	AESENC T0, B0
   888	MOVOU (16*14)(ks), T0
   889encLast4:
   890	AESENCLAST T0, B0
   891	MOVOU B0, T0
   892
      	// Point ptx at the last input byte; bytes are loaded back to front.
   893	LEAQ -1(ptx)(ptxLen*1), ptx
   894
      	// T1 = andMask entry with the low ptxLen bytes set. aluCTR is reused
      	// as a plain address register; the counter is no longer needed.
   895	MOVQ ptxLen, aluTMP
   896	SHLQ $4, aluTMP
   897
   898	LEAQ andMask<>(SB), aluCTR
   899	MOVOU -16(aluCTR)(aluTMP*1), T1
   900
   901	PXOR B0, B0
   902ptxLoadLoop:
   903		PSLLDQ $1, B0
   904		PINSRB $0, (ptx), B0
   905		LEAQ -1(ptx), ptx
   906		DECQ ptxLen
   907	JNE ptxLoadLoop
   908
      	// XOR with the keystream and clear the bytes beyond the input length.
   909	PXOR T0, B0
   910	PAND T1, B0
   911	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   912
   913	PSHUFB BSWAP, B0
   914	PXOR ACC0, B0
   915
   916	MOVOU T2, ACC0
   917	MOVOU T2, ACC1
   918	MOVOU (16*15)(pTbl), ACCM
   919
   920	PSHUFD $78, B0, T0
   921	PXOR B0, T0
   922	PCLMULQDQ $0x00, B0, ACC0
   923	PCLMULQDQ $0x11, B0, ACC1
   924	PCLMULQDQ $0x00, T0, ACCM
   925
   926	PXOR ACC0, ACCM
   927	PXOR ACC1, ACCM
   928	MOVOU ACCM, T0
   929	PSRLDQ $8, ACCM
   930	PSLLDQ $8, T0
   931	PXOR ACCM, ACC1
   932	PXOR T0, ACC0
   933
   934	reduceRound(ACC0)
   935	reduceRound(ACC0)
   936	PXOR ACC1, ACC0
   937
   938gcmAesEncDone:
   939	MOVOU ACC0, (tPtr)
   940	RET
      // Only increment is removed here: the register aliases and the other
      // macros defined above stay defined and are reused by gcmAesDec.
   941#undef increment
   942
   943// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
      //
      // gcmAesDec folds the ciphertext in src into the hash state at *T and
      // decrypts it into dst in counter mode. Unlike gcmAesEnc, each block is
      // hashed straight from src, so only the counter blocks live on the stack.
      //   productTable: table written by gcmAesInit.
      //   dst:          plaintext output (held in ptx).
      //   src:          ciphertext input (held in ctx). A final partial block
      //                 is read as a full 16 bytes, so memory past len(src)
      //                 must be readable (see the comment at that load).
      //   ctr:          16-byte counter block; only read. The first block uses
      //                 the last 32-bit word (big-endian) incremented by one.
      //   T:            hash state, read on entry and written on exit.
      //   ks:           expanded key; len(ks)/4 - 1 is used as the round count.
      //
      // Stack frame (128 bytes): (0..7)*16(SP) are counter blocks already
      // XORed with round key 0.
      //
      // The register aliases (pTbl, ptx, ctx, ...) and the aesRound, aesRnd,
      // aesRndLast and reduceRound macros come from earlier routines in this file.
   944TEXT ·gcmAesDec(SB),0,$128-96
      // increment(i):        as in gcmAesEnc, but counter slot i is at (i*16)(SP).
      // combinedDecRound(i): AES round i on B0-B7 interleaved with multiplying
      //                      ciphertext block i, loaded from ctx and
      //                      byte-swapped, by table pair i.
   945#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
   946#define combinedDecRound(i) \
   947	MOVOU (16*i)(ks), T0;\
   948	AESENC T0, B0;\
   949	AESENC T0, B1;\
   950	AESENC T0, B2;\
   951	AESENC T0, B3;\
   952	MOVOU (16*(i*2))(pTbl), T1;\
   953	MOVOU T1, T2;\
   954	AESENC T0, B4;\
   955	AESENC T0, B5;\
   956	AESENC T0, B6;\
   957	AESENC T0, B7;\
   958	MOVOU (16*i)(ctx), T0;\
   959	PSHUFB BSWAP, T0;\
   960	PCLMULQDQ $0x00, T0, T1;\
   961	PXOR T1, ACC0;\
   962	PSHUFD $78, T0, T1;\
   963	PCLMULQDQ $0x11, T0, T2;\
   964	PXOR T1, T0;\
   965	PXOR T2, ACC1;\
   966	MOVOU (16*(i*2+1))(pTbl), T2;\
   967	PCLMULQDQ $0x00, T2, T0;\
   968	PXOR T0, ACCM
   969
      	// Note the swapped roles compared with gcmAesEnc: dst is the plaintext
      	// pointer and src the ciphertext pointer.
   970	MOVQ productTable+0(FP), pTbl
   971	MOVQ dst+8(FP), ptx
   972	MOVQ src_base+32(FP), ctx
   973	MOVQ src_len+40(FP), ptxLen
   974	MOVQ ctr+56(FP), ctrPtr
   975	MOVQ T+64(FP), tPtr
   976	MOVQ ks_base+72(FP), ks
   977	MOVQ ks_len+80(FP), NR
   978
      	// NR = len(ks)/4 - 1; compared with 12 below to pick the last round key.
   979	SHRQ $2, NR
   980	DECQ NR
   981
   982	MOVOU bswapMask<>(SB), BSWAP
   983	MOVOU gcmPoly<>(SB), POLY
   984
   985	MOVOU (tPtr), ACC0
   986	PXOR ACC1, ACC1
   987	PXOR ACCM, ACCM
   988	MOVOU (ctrPtr), B0
   989	MOVL (3*4)(ctrPtr), aluCTR
   990	MOVOU (ks), T0
   991	MOVL (3*4)(ks), aluK
   992	BSWAPL aluCTR
   993	BSWAPL aluK
   994
      	// T0 = counter block XOR round key 0. Slot 0 receives it and
      	// increment(0) then replaces its last word with counter+1.
   995	PXOR B0, T0
   996	MOVOU T0, (0*16)(SP)
   997	increment(0)
   998
   999	CMPQ ptxLen, $128
  1000	JB gcmAesDecSingles
  1001
      	// At least eight blocks: prepare the remaining seven counter slots.
  1002	MOVOU T0, (1*16)(SP)
  1003	increment(1)
  1004	MOVOU T0, (2*16)(SP)
  1005	increment(2)
  1006	MOVOU T0, (3*16)(SP)
  1007	increment(3)
  1008	MOVOU T0, (4*16)(SP)
  1009	increment(4)
  1010	MOVOU T0, (5*16)(SP)
  1011	increment(5)
  1012	MOVOU T0, (6*16)(SP)
  1013	increment(6)
  1014	MOVOU T0, (7*16)(SP)
  1015	increment(7)
  1016
  1017gcmAesDecOctetsLoop:
  1018
  1019		CMPQ ptxLen, $128
  1020		JB gcmAesDecEndOctets
  1021		SUBQ $128, ptxLen
  1022
  1023		MOVOU (0*16)(SP), B0
  1024		MOVOU (1*16)(SP), B1
  1025		MOVOU (2*16)(SP), B2
  1026		MOVOU (3*16)(SP), B3
  1027		MOVOU (4*16)(SP), B4
  1028		MOVOU (5*16)(SP), B5
  1029		MOVOU (6*16)(SP), B6
  1030		MOVOU (7*16)(SP), B7
  1031
      		// Ciphertext block 0, byte-swapped and combined with the running
      		// hash, times table pair 0 initialises the three accumulators.
  1032		MOVOU (16*0)(ctx), T0
  1033		PSHUFB BSWAP, T0
  1034		PXOR ACC0, T0
  1035		PSHUFD $78, T0, T1
  1036		PXOR T0, T1
  1037
  1038		MOVOU (16*0)(pTbl), ACC0
  1039		MOVOU (16*1)(pTbl), ACCM
  1040		MOVOU ACC0, ACC1
  1041
  1042		PCLMULQDQ $0x00, T1, ACCM
  1043		PCLMULQDQ $0x00, T0, ACC0
  1044		PCLMULQDQ $0x11, T0, ACC1
  1045
      		// AES rounds 1-7 interleaved with hashing ciphertext blocks 1-7 and
      		// with preparing the counters for the following batch.
  1046		combinedDecRound(1)
  1047		increment(0)
  1048		combinedDecRound(2)
  1049		increment(1)
  1050		combinedDecRound(3)
  1051		increment(2)
  1052		combinedDecRound(4)
  1053		increment(3)
  1054		combinedDecRound(5)
  1055		increment(4)
  1056		combinedDecRound(6)
  1057		increment(5)
  1058		combinedDecRound(7)
  1059		increment(6)
  1060
  1061		aesRound(8)
  1062		increment(7)
  1063
  1064		PXOR ACC0, ACCM
  1065		PXOR ACC1, ACCM
  1066		MOVOU ACCM, T0
  1067		PSRLDQ $8, ACCM
  1068		PSLLDQ $8, T0
  1069		PXOR ACCM, ACC1
  1070		PXOR T0, ACC0
  1071
  1072		reduceRound(ACC0)
  1073		aesRound(9)
  1074
  1075		reduceRound(ACC0)
  1076		PXOR ACC1, ACC0
  1077
  1078		MOVOU (16*10)(ks), T0
      		// NR < 12 ends at round key 10, NR == 12 at round key 12, otherwise
      		// 14. The JE below reuses the flags from this CMPQ.
  1079		CMPQ NR, $12
  1080		JB decLast1
  1081		aesRnd(T0)
  1082		aesRound(11)
  1083		MOVOU (16*12)(ks), T0
  1084		JE decLast1
  1085		aesRnd(T0)
  1086		aesRound(13)
  1087		MOVOU (16*14)(ks), T0
  1088decLast1:
  1089		aesRndLast(T0)
  1090
      		// XOR the keystream with the ciphertext to recover the plaintext.
  1091		MOVOU (16*0)(ctx), T0
  1092		PXOR T0, B0
  1093		MOVOU (16*1)(ctx), T0
  1094		PXOR T0, B1
  1095		MOVOU (16*2)(ctx), T0
  1096		PXOR T0, B2
  1097		MOVOU (16*3)(ctx), T0
  1098		PXOR T0, B3
  1099		MOVOU (16*4)(ctx), T0
  1100		PXOR T0, B4
  1101		MOVOU (16*5)(ctx), T0
  1102		PXOR T0, B5
  1103		MOVOU (16*6)(ctx), T0
  1104		PXOR T0, B6
  1105		MOVOU (16*7)(ctx), T0
  1106		PXOR T0, B7
  1107
  1108		MOVOU B0, (16*0)(ptx)
  1109		MOVOU B1, (16*1)(ptx)
  1110		MOVOU B2, (16*2)(ptx)
  1111		MOVOU B3, (16*3)(ptx)
  1112		MOVOU B4, (16*4)(ptx)
  1113		MOVOU B5, (16*5)(ptx)
  1114		MOVOU B6, (16*6)(ptx)
  1115		MOVOU B7, (16*7)(ptx)
  1116
  1117		LEAQ 128(ptx), ptx
  1118		LEAQ 128(ctx), ctx
  1119
  1120		JMP gcmAesDecOctetsLoop
  1121
  1122gcmAesDecEndOctets:
  1123
      	// Eight counters were prepared ahead, but the single-block path only
      	// uses slot 0: rewind aluCTR so it matches the counter stored there.
  1124	SUBQ $7, aluCTR
  1125
  1126gcmAesDecSingles:
  1127
      	// Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the
      	// single-block loop.
  1128	MOVOU (16*1)(ks), B1
  1129	MOVOU (16*2)(ks), B2
  1130	MOVOU (16*3)(ks), B3
  1131	MOVOU (16*4)(ks), B4
  1132	MOVOU (16*5)(ks), B5
  1133	MOVOU (16*6)(ks), B6
  1134	MOVOU (16*7)(ks), B7
  1135
  1136	MOVOU (16*14)(pTbl), T2
  1137
  1138gcmAesDecSinglesLoop:
  1139
  1140		CMPQ ptxLen, $16
  1141		JB gcmAesDecTail
  1142		SUBQ $16, ptxLen
  1143
      		// Load the ciphertext block; keep an unswapped copy in T1 for the
      		// final XOR and hash the swapped copy first.
  1144		MOVOU (ctx), B0
  1145		MOVOU B0, T1
  1146		PSHUFB BSWAP, B0
  1147		PXOR ACC0, B0
  1148
  1149		MOVOU T2, ACC0
  1150		MOVOU T2, ACC1
  1151		MOVOU (16*15)(pTbl), ACCM
  1152
  1153		PCLMULQDQ $0x00, B0, ACC0
  1154		PCLMULQDQ $0x11, B0, ACC1
  1155		PSHUFD $78, B0, T0
  1156		PXOR B0, T0
  1157		PCLMULQDQ $0x00, T0, ACCM
  1158
  1159		PXOR ACC0, ACCM
  1160		PXOR ACC1, ACCM
  1161		MOVOU ACCM, T0
  1162		PSRLDQ $8, ACCM
  1163		PSLLDQ $8, T0
  1164		PXOR ACCM, ACC1
  1165		PXOR T0, ACC0
  1166
  1167		reduceRound(ACC0)
  1168		reduceRound(ACC0)
  1169		PXOR ACC1, ACC0
  1170
      		// Generate the keystream block from counter slot 0.
  1171		MOVOU (0*16)(SP), B0
  1172		increment(0)
  1173		AESENC B1, B0
  1174		AESENC B2, B0
  1175		AESENC B3, B0
  1176		AESENC B4, B0
  1177		AESENC B5, B0
  1178		AESENC B6, B0
  1179		AESENC B7, B0
  1180		MOVOU (16*8)(ks), T0
  1181		AESENC T0, B0
  1182		MOVOU (16*9)(ks), T0
  1183		AESENC T0, B0
  1184		MOVOU (16*10)(ks), T0
  1185		CMPQ NR, $12
  1186		JB decLast2
  1187		AESENC T0, B0
  1188		MOVOU (16*11)(ks), T0
  1189		AESENC T0, B0
  1190		MOVOU (16*12)(ks), T0
  1191		JE decLast2
  1192		AESENC T0, B0
  1193		MOVOU (16*13)(ks), T0
  1194		AESENC T0, B0
  1195		MOVOU (16*14)(ks), T0
  1196decLast2:
  1197		AESENCLAST T0, B0
  1198
  1199		PXOR T1, B0
  1200		MOVOU B0, (ptx)
  1201
  1202		LEAQ (16*1)(ptx), ptx
  1203		LEAQ (16*1)(ctx), ctx
  1204
  1205	JMP gcmAesDecSinglesLoop
  1206
  1207gcmAesDecTail:
  1208
  1209	TESTQ ptxLen, ptxLen
  1210	JE gcmAesDecDone
  1211
      	// 1..15 bytes remain. T1 = andMask entry with the low ptxLen bytes
      	// set; aluCTR is reused as a plain address register here.
  1212	MOVQ ptxLen, aluTMP
  1213	SHLQ $4, aluTMP
  1214	LEAQ andMask<>(SB), aluCTR
  1215	MOVOU -16(aluCTR)(aluTMP*1), T1
  1216
  1217	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1218	PAND T1, B0
  1219
      	// T1 now keeps the masked ciphertext for the final XOR.
  1220	MOVOU B0, T1
  1221	PSHUFB BSWAP, B0
  1222	PXOR ACC0, B0
  1223
  1224	MOVOU (16*14)(pTbl), ACC0
  1225	MOVOU (16*15)(pTbl), ACCM
  1226	MOVOU ACC0, ACC1
  1227
  1228	PCLMULQDQ $0x00, B0, ACC0
  1229	PCLMULQDQ $0x11, B0, ACC1
  1230	PSHUFD $78, B0, T0
  1231	PXOR B0, T0
  1232	PCLMULQDQ $0x00, T0, ACCM
  1233
  1234	PXOR ACC0, ACCM
  1235	PXOR ACC1, ACCM
  1236	MOVOU ACCM, T0
  1237	PSRLDQ $8, ACCM
  1238	PSLLDQ $8, T0
  1239	PXOR ACCM, ACC1
  1240	PXOR T0, ACC0
  1241
  1242	reduceRound(ACC0)
  1243	reduceRound(ACC0)
  1244	PXOR ACC1, ACC0
  1245
  1246	MOVOU (0*16)(SP), B0
  1247	increment(0)
  1248	AESENC B1, B0
  1249	AESENC B2, B0
  1250	AESENC B3, B0
  1251	AESENC B4, B0
  1252	AESENC B5, B0
  1253	AESENC B6, B0
  1254	AESENC B7, B0
  1255	MOVOU (16*8)(ks), T0
  1256	AESENC T0, B0
  1257	MOVOU (16*9)(ks), T0
  1258	AESENC T0, B0
  1259	MOVOU (16*10)(ks), T0
  1260	CMPQ NR, $12
  1261	JB decLast3
  1262	AESENC T0, B0
  1263	MOVOU (16*11)(ks), T0
  1264	AESENC T0, B0
  1265	MOVOU (16*12)(ks), T0
  1266	JE decLast3
  1267	AESENC T0, B0
  1268	MOVOU (16*13)(ks), T0
  1269	AESENC T0, B0
  1270	MOVOU (16*14)(ks), T0
  1271decLast3:
  1272	AESENCLAST T0, B0
  1273	PXOR T1, B0
  1274
      // Write exactly ptxLen plaintext bytes, one at a time from the low end
      // of B0, so dst is never written past its length.
  1275ptxStoreLoop:
  1276		PEXTRB $0, B0, (ptx)
  1277		PSRLDQ $1, B0
  1278		LEAQ 1(ptx), ptx
  1279		DECQ ptxLen
  1280
  1281	JNE ptxStoreLoop
  1282
  1283gcmAesDecDone:
  1284
  1285	MOVOU ACC0, (tPtr)
  1286	RET

View as plain text