...
Run Format

Text file src/crypto/rc4/rc4_amd64.s

Documentation: crypto/rc4

     1	// Original source:
     2	//	http://www.zorinaq.com/papers/rc4-amd64.html
     3	//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
     4	
     5	#include "textflag.h"
     6	
     7	// Local modifications:
     8	//
     9	// Transliterated from GNU to 6a assembly syntax by the Go authors.
    10	// The comments and spacing are from the original.
    11	//
    12	// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
    13	//
    14	// The original code accumulated 64 bits of key stream in an integer
    15	// register and then XOR'ed the key stream into the data 8 bytes at a time.
    16	// Modified to accumulate 128 bits of key stream into an XMM register
    17	// and then XOR the key stream into the data 16 bytes at a time.
    18	// Approximately doubles throughput.
    19	
    20	// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
    21	// but makes the code run 2.0x slower on Xeon. EXTEND(r) zero-extends the low byte of r to 32 bits; it follows the 8-bit INCB/ADDB whose result is next used as a table index.
    22	#define EXTEND(r) MOVBLZX r, r
    23	
    24	/*
    25	** RC4 implementation optimized for AMD64.
    26	**
    27	** Author: Marc Bevand <bevand_m (at) epita.fr>
    28	** Licence: I hereby disclaim the copyright on this code and place it
    29	** in the public domain.
    30	**
    31	** The code has been designed to be easily integrated into openssl:
    32	** the exported RC4() function can replace the actual implementations
    33	** openssl already contains. Please note that when linking with openssl,
    34	** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
    35	** with -DRC4_INT='unsigned long'.
    36	**
    37	** The throughput achieved by this code is about 320 MBytes/sec, on
    38	** a 1.8 GHz AMD Opteron (rev C0) processor.
    39	*/
    40	
    41	TEXT ·xorKeyStream(SB),NOSPLIT,$0
    42		MOVQ	n+16(FP),	BX		// rbx = ARG(len)
    43		MOVQ	src+8(FP),	SI		// in = ARG(in)
    44		MOVQ	dst+0(FP),	DI		// out = ARG(out)
    45		MOVQ	state+24(FP),	BP		// d = ARG(data)
    46		MOVQ	i+32(FP),	AX
    47		MOVBQZX	0(AX),		CX		// x = *xp
    48		MOVQ	j+40(FP),	AX
    49		MOVBQZX	0(AX),		DX		// y = *yp
    50	
    51		LEAQ	(SI)(BX*1),	R9		// limit = in+len
    52	
    53	l1:	CMPQ	SI,		R9		// cmp in with in+len
    54		JGE	finished			// jump if (in >= in+len)
    55	
    56		INCB	CX
    57		EXTEND(CX)
    58		TESTL	$15,		CX
    59		JZ	wordloop
    60	
    61		MOVBLZX	(BP)(CX*4),	AX
    62	
    63		ADDB	AX,		DX		// y += tx
    64		EXTEND(DX)
    65		MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
    66		MOVB	BX,		(BP)(CX*4)	// d[x] = ty
    67		ADDB	AX,		BX		// val = ty+tx
    68		EXTEND(BX)
    69		MOVB	AX,		(BP)(DX*4)	// d[y] = tx
    70		MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
    71		XORB	(SI),		R8		// xor 1 byte
    72		MOVB	R8,		(DI)
    73		INCQ	SI				// in++
    74		INCQ	DI				// out++
    75		JMP l1
    76	
    77	wordloop:
    78		SUBQ	$16,		R9
    79		CMPQ	SI,		R9
    80		JGT	end
    81	
    82	start:
    83		ADDQ	$16,		SI		// increment in
    84		ADDQ	$16,		DI		// increment out
    85	
    86		// Each KEYROUND generates one byte of key and
    87		// inserts it into an XMM register at the given 16-bit index.
    88		// The key state array is uint32 words only using the bottom
    89		// byte of each word, so the 16-bit OR only copies 8 useful bits.
    90		// We accumulate alternating bytes into X0 and X1, and then at
    91		// the end we OR X1<<8 into X0 to produce the actual key.
    92		//
    93		// At the beginning of the loop, CX%16 == 0, so the 16 loads
    94		// at state[CX], state[CX+1], ..., state[CX+15] can precompute
    95		// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
    96		// without fear of the byte computation CX+15 wrapping around.
    97		//
    98		// The first round needs R12[0], the second needs R12[1], and so on.
    99		// We can avoid memory stalls by starting the load for round n+1
   100		// before the end of round n, using the LOAD macro.
   101		LEAQ	(BP)(CX*4),	R12
   102	
   103	#define KEYROUND(xmm, load, off, r1, r2, index) \
   104		MOVBLZX	(BP)(DX*4),	R8; \
   105		MOVB	r1,		(BP)(DX*4); \
   106		load((off+1), r2); \
   107		MOVB	R8,		(off*4)(R12); \
   108		ADDB	r1,		R8; \
   109		EXTEND(R8); \
   110		PINSRW	$index, (BP)(R8*4), xmm
   111	
   112	#define LOAD(off, reg) \
   113		MOVBLZX	(off*4)(R12),	reg; \
   114		ADDB	reg,		DX; \
   115		EXTEND(DX)
   116	
   117	#define SKIP(off, reg)
   118	
   119		LOAD(0, AX)
   120		KEYROUND(X0, LOAD, 0, AX, BX, 0)
   121		KEYROUND(X1, LOAD, 1, BX, AX, 0)
   122		KEYROUND(X0, LOAD, 2, AX, BX, 1)
   123		KEYROUND(X1, LOAD, 3, BX, AX, 1)
   124		KEYROUND(X0, LOAD, 4, AX, BX, 2)
   125		KEYROUND(X1, LOAD, 5, BX, AX, 2)
   126		KEYROUND(X0, LOAD, 6, AX, BX, 3)
   127		KEYROUND(X1, LOAD, 7, BX, AX, 3)
   128		KEYROUND(X0, LOAD, 8, AX, BX, 4)
   129		KEYROUND(X1, LOAD, 9, BX, AX, 4)
   130		KEYROUND(X0, LOAD, 10, AX, BX, 5)
   131		KEYROUND(X1, LOAD, 11, BX, AX, 5)
   132		KEYROUND(X0, LOAD, 12, AX, BX, 6)
   133		KEYROUND(X1, LOAD, 13, BX, AX, 6)
   134		KEYROUND(X0, LOAD, 14, AX, BX, 7)
   135		KEYROUND(X1, SKIP, 15, BX, AX, 7)
   136		
   137		ADDB	$16,		CX
   138	
   139		PSLLQ	$8,		X1
   140		PXOR	X1,		X0
   141		MOVOU	-16(SI),	X2
   142		PXOR	X0,		X2
   143		MOVOU	X2,		-16(DI)
   144	
   145		CMPQ	SI,		R9		// cmp in with in+len-16
   146		JLE	start				// jump if (in <= in+len-16)
   147	
   148	end:
   149		DECB	CX
   150		ADDQ	$16,		R9		// tmp = in+len
   151	
   152		// handle the last bytes, one by one
   153	l2:	CMPQ	SI,		R9		// cmp in with in+len
   154		JGE	finished			// jump if (in >= in+len)
   155	
   156		INCB	CX
   157		EXTEND(CX)
   158		MOVBLZX	(BP)(CX*4),	AX
   159	
   160		ADDB	AX,		DX		// y += tx
   161		EXTEND(DX)
   162		MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
   163		MOVB	BX,		(BP)(CX*4)	// d[x] = ty
   164		ADDB	AX,		BX		// val = ty+tx
   165		EXTEND(BX)
   166		MOVB	AX,		(BP)(DX*4)	// d[y] = tx
   167		MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
   168		XORB	(SI),		R8		// xor 1 byte
   169		MOVB	R8,		(DI)
   170		INCQ	SI				// in++
   171		INCQ	DI				// out++
   172		JMP l2
   173	
   174	finished:
   175		MOVQ	j+40(FP),	BX
   176		MOVB	DX, 0(BX)
   177		MOVQ	i+32(FP),	AX
   178		MOVB	CX, 0(AX)
   179		RET

View as plain text