...
Run Format

Text file src/crypto/rc4/rc4_amd64p32.s

Documentation: crypto/rc4

     1	// Original source:
     2	//	http://www.zorinaq.com/papers/rc4-amd64.html
     3	//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
     4	
     5	#include "textflag.h"
     6	
     7	// Local modifications:
     8	//
     9	// Transliterated from GNU to 6a assembly syntax by the Go authors.
    10	// The comments and spacing are from the original.
    11	//
    12	// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
    13	//
    14	// The original code accumulated 64 bits of key stream in an integer
    15	// register and then XOR'ed the key stream into the data 8 bytes at a time.
    16	// Modified to accumulate 128 bits of key stream into an XMM register
    17	// and then XOR the key stream into the data 16 bytes at a time.
    18	// Approximately doubles throughput.
    19	//
    20	// Converted to amd64p32.
    21	//
    22	// To make safe for Native Client, avoid use of BP, R15,
    23	// and two-register addressing modes.
    24	
    25	// EXTEND(r) zero-extends the low byte of r into the full 32-bit register.
    26	// NOTE: Changing EXTEND to a no-op runs 1.2x faster on Core i5 but 2.0x slower on Xeon.
    27	#define EXTEND(r) MOVBLZX r, r
    28	
    29	/*
    30	** RC4 implementation optimized for AMD64.
    31	**
    32	** Author: Marc Bevand <bevand_m (at) epita.fr>
    33	** Licence: I hereby disclaim the copyright on this code and place it
    34	** in the public domain.
    35	**
    36	** The code has been designed to be easily integrated into openssl:
    37	** the exported RC4() function can replace the actual implementations
    38	** openssl already contains. Please note that when linking with openssl,
    39	** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
    40	** with -DRC4_INT='unsigned long'.
    41	**
    42	** The throughput achieved by this code is about 320 MBytes/sec, on
    43	** a 1.8 GHz AMD Opteron (rev C0) processor.
    44	*/
    45	
    46	TEXT ·xorKeyStream(SB),NOSPLIT,$0	// XOR n bytes of src with RC4 key stream into dst; updates state, *i and *j
    47		MOVL	n+8(FP),	BX		// BX = n, the number of bytes to process
    48		MOVL	src+4(FP),	SI		// SI = in, the input cursor (src)
    49		MOVL	dst+0(FP),	DI		// DI = out, the output cursor (dst)
    50		MOVL	state+12(FP),	R10		// R10 = d, base of the key state array (4 bytes per entry)
    51		MOVL	i+16(FP),	AX		// AX = address of the i index byte
    52		MOVBQZX	0(AX),		CX		// CX = x = *i, zero-extended
    53		MOVL	j+20(FP),	AX		// AX = address of the j index byte
    54		MOVBQZX	0(AX),		DX		// DX = y = *j, zero-extended
    55	
    56		LEAQ	(SI)(BX*1),	R9		// R9 = limit = in+len
    57		// Byte-at-a-time loop: runs until x%16 == 0 (then jumps to wordloop) or the input ends.
    58	l1:	CMPQ	SI,		R9		// cmp in with in+len
    59		JGE	finished			// jump if (in >= in+len)
    60	
    61		INCB	CX				// x++ with 8-bit wraparound
    62		EXTEND(CX)
    63		TESTL	$15,		CX		// is x a multiple of 16?
    64		JZ	wordloop			// yes: try the 16-bytes-at-a-time loop
    65		LEAL	(R10)(CX*4), R12	// R12 = &d[x]
    66	
    67		MOVBLZX	(R12),	AX		// tx = d[x]
    68	
    69		ADDB	AX,		DX		// y += tx
    70		EXTEND(DX)
    71		LEAL (R10)(DX*4), R11	// R11 = &d[y]
    72		MOVBLZX	(R11),	BX		// ty = d[y]
    73		MOVB	BX,		(R12)	// d[x] = ty
    74		ADDB	AX,		BX		// val = ty+tx
    75		EXTEND(BX)
    76		LEAL (R10)(BX*4), R13	// R13 = &d[val]
    77		MOVB	AX,		(R11)	// d[y] = tx
    78		MOVBLZX	(R13),	R8		// val = d[val]
    79		XORB	(SI),		R8		// xor 1 byte
    80		MOVB	R8,		(DI)	// *out = val ^ *in
    81		INCQ	SI				// in++
    82		INCQ	DI				// out++
    83		JMP l1
    84	
    85	wordloop:	// reached with x%16 == 0; x was already incremented for the next, not yet processed, byte
    86		SUBQ	$16,		R9		// limit = in+len-16 for the duration of the 16-byte loop
    87		CMPQ	SI,		R9
    88		JGT	end				// fewer than 16 bytes remain: go to the tail loop
    89	
    90	start:
    91		ADDQ	$16,		SI		// increment in (the block is addressed as -16(SI) below)
    92		ADDQ	$16,		DI		// increment out (the block is addressed as -16(DI) below)
    93	
    94		// Each KEYROUND generates one byte of key and
    95		// inserts it into an XMM register at the given 16-bit index.
    96		// The key state array is uint32 words only using the bottom
    97		// byte of each word, so the 16-bit OR only copies 8 useful bits.
    98		// We accumulate alternating bytes into X0 and X1, and then at
    99		// the end we XOR X1<<8 into X0 (the byte lanes are disjoint) to produce the actual key.
   100		//
   101		// At the beginning of the loop, CX%16 == 0, so the 16 loads
   102		// at state[CX], state[CX+1], ..., state[CX+15] can precompute
   103		// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
   104		// without fear of the byte computation CX+15 wrapping around.
   105		//
   106		// The first round needs R12[0], the second needs R12[1], and so on.
   107		// We can avoid memory stalls by starting the load for round n+1
   108		// before the end of round n, using the LOAD macro.
   109		LEAQ	(R10)(CX*4),	R12	// R12 = &state[CX]
   110		// KEYROUND: on entry r1 = state[x+off] and DX = y. Swaps state[y] with state[x+off], runs load for round off+1 into r2, inserts state[(sum of the two) & 0xff] into word index of xmm.
   111	#define KEYROUND(xmm, load, off, r1, r2, index) \
   112		LEAL (R10)(DX*4), R11; \
   113		MOVBLZX	(R11),	R8; \
   114		MOVB	r1,		(R11); \
   115		load((off+1), r2); \
   116		MOVB	R8,		(off*4)(R12); \
   117		ADDB	r1,		R8; \
   118		EXTEND(R8); \
   119		LEAL (R10)(R8*4), R14; \
   120		PINSRW	$index, (R14), xmm
   121		// LOAD(off, reg): reg = state[x+off] (read through R12), then y += reg as a byte.
   122	#define LOAD(off, reg) \
   123		MOVBLZX	(off*4)(R12),	reg; \
   124		ADDB	reg,		DX; \
   125		EXTEND(DX)
   126		// SKIP: empty stand-in for LOAD, passed to the last round, which has no following round to preload.
   127	#define SKIP(off, reg)
   128	
   129		LOAD(0, AX)
   130		KEYROUND(X0, LOAD, 0, AX, BX, 0)
   131		KEYROUND(X1, LOAD, 1, BX, AX, 0)
   132		KEYROUND(X0, LOAD, 2, AX, BX, 1)
   133		KEYROUND(X1, LOAD, 3, BX, AX, 1)
   134		KEYROUND(X0, LOAD, 4, AX, BX, 2)
   135		KEYROUND(X1, LOAD, 5, BX, AX, 2)
   136		KEYROUND(X0, LOAD, 6, AX, BX, 3)
   137		KEYROUND(X1, LOAD, 7, BX, AX, 3)
   138		KEYROUND(X0, LOAD, 8, AX, BX, 4)
   139		KEYROUND(X1, LOAD, 9, BX, AX, 4)
   140		KEYROUND(X0, LOAD, 10, AX, BX, 5)
   141		KEYROUND(X1, LOAD, 11, BX, AX, 5)
   142		KEYROUND(X0, LOAD, 12, AX, BX, 6)
   143		KEYROUND(X1, LOAD, 13, BX, AX, 6)
   144		KEYROUND(X0, LOAD, 14, AX, BX, 7)
   145		KEYROUND(X1, SKIP, 15, BX, AX, 7)
   146		
   147		ADDB	$16,		CX		// x += 16, one for each of the 16 rounds above
   148	
   149		PSLLQ	$8,		X1		// move the odd-numbered key bytes into the high byte of each 16-bit lane
   150		PXOR	X1,		X0		// X0 = 16 bytes of key stream
   151		MOVOU	-16(SI),	X2		// load 16 input bytes (in was advanced at the top of the loop)
   152		PXOR	X0,		X2		// xor 16 bytes
   153		MOVOU	X2,		-16(DI)	// store 16 output bytes
   154	
   155		CMPQ	SI,		R9		// cmp in with in+len-16
   156		JLE	start				// jump if (in <= in+len-16)
   157	
   158	end:
   159		DECB	CX				// x is one ahead here; l2 increments it again before each use
   160		ADDQ	$16,		R9		// tmp = in+len
   161	
   162		// handle the last bytes, one by one
   163	l2:	CMPQ	SI,		R9		// cmp in with in+len
   164		JGE	finished			// jump if (in >= in+len)
   165	
   166		INCB	CX				// x++ with 8-bit wraparound
   167		EXTEND(CX)
   168		LEAL (R10)(CX*4), R12	// R12 = &d[x]
   169		MOVBLZX	(R12),	AX		// tx = d[x]
   170	
   171		ADDB	AX,		DX		// y += tx
   172		EXTEND(DX)
   173		LEAL (R10)(DX*4), R11	// R11 = &d[y]
   174		MOVBLZX	(R11),	BX		// ty = d[y]
   175		MOVB	BX,		(R12)	// d[x] = ty
   176		ADDB	AX,		BX		// val = ty+tx
   177		EXTEND(BX)
   178		LEAL (R10)(BX*4), R13	// R13 = &d[val]
   179		MOVB	AX,		(R11)	// d[y] = tx
   180		MOVBLZX	(R13),	R8		// val = d[val]
   181		XORB	(SI),		R8		// xor 1 byte
   182		MOVB	R8,		(DI)	// *out = val ^ *in
   183		INCQ	SI				// in++
   184		INCQ	DI				// out++
   185		JMP l2
   186	
   187	finished:	// write the updated indices back through the i and j pointers
   188		MOVL	j+20(FP),	BX		// BX = address of the j index byte
   189		MOVB	DX, 0(BX)			// *j = y
   190		MOVL	i+16(FP),	AX		// AX = address of the i index byte
   191		MOVB	CX, 0(AX)			// *i = x
   192		RET

View as plain text