Text file src/hash/crc32/crc32_amd64.s

Documentation: hash/crc32

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX
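	// For example, if BX = 3 on entry, (3-1) XOR 7 = 2 XOR 7 = 5 = 8-3,
	// so the code below consumes five bytes to reach 8-byte alignment.
	// The SUBQ/XORQ pair computes 8-BX without loading the constant 8
	// into another register.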

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET
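
// For reference, castagnoliSSE42 computes the same value as this
// bit-at-a-time Go sketch (a hypothetical helper, shown only to document
// the semantics; 0x82f63b78 is the reflected Castagnoli polynomial):
//
//	func castagnoliRef(crc uint32, p []byte) uint32 {
//		for _, b := range p {
//			crc ^= uint32(b)
//			for i := 0; i < 8; i++ {
//				crc = crc>>1 ^ (crc&1)*0x82f63b78
//			}
//		}
//		return crc
//	}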

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//     crc1, crc2, crc3 uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

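	// Three independent CRC32 dependency chains are interleaved here so
	// the CPU can overlap them: CRC32 has multi-cycle latency but (on most
	// Intel cores) one-per-cycle throughput, so three streams run roughly
	// three times faster than a single serial chain.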
loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
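
// Each of a, b and c must supply at least 24*rounds bytes. A schematic
// call (the surrounding Go code is responsible for recombining the three
// partial CRCs afterwards, e.g. with table-driven CRC shift operators):
//
//	retA, retB, retC := castagnoliSSE42Triple(crcA, crcB, crcC, bufA, bufB, bufC, rounds)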

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
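
// Roughly: r2r1 and r4r3 hold bit-reflected x^k mod P(x) folding
// multipliers (for the 64-byte and 16-byte folds below), r5 is the
// constant for the final 64->32 bit fold, and rupoly packs the reflected
// polynomial P' together with the Barrett constant u' used in the final
// reduction.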

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
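//
// A wrapper is expected to enforce that precondition; a minimal sketch
// (hypothetical names, and with the usual pre/post inversion of crc
// omitted for brevity):
//
//	func update(crc uint32, p []byte) uint32 {
//		if len(p) >= 64 {
//			n := len(p) &^ 15 // round len down to a multiple of 16
//			crc = ieeeCLMUL(crc, p[:n])
//			p = p[n:]
//		}
//		return updateGeneric(crc, p) // hypothetical table-driven tail
//	}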

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL   crc+0(FP), X0             // Initial CRC value
	MOVQ   p+8(FP), SI               // data pointer
	MOVQ   p_len+16(FP), CX          // len(p)

	MOVOU  (SI), X1
	MOVOU  16(SI), X2
	MOVOU  32(SI), X3
	MOVOU  48(SI), X4
	PXOR   X0, X1
	ADDQ   $64, SI                  // buf+=64
	SUBQ   $64, CX                  // len-=64
	CMPQ   CX, $64                  // Less than 64 bytes left
	JB     remain64

	MOVOA  r2r1<>+0(SB), X0
loopback64:
	MOVOA  X1, X5
	MOVOA  X2, X6
	MOVOA  X3, X7
	MOVOA  X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU    (SI), X11
	MOVOU    16(SI), X12
	MOVOU    32(SI), X13
	MOVOU    48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR     X5, X1
	PXOR     X6, X2
	PXOR     X7, X3
	PXOR     X8, X4

	PXOR     X11, X1
	PXOR     X12, X2
	PXOR     X13, X3
	PXOR     X14, X4

	ADDQ    $64, SI      // buf+=64
	SUBQ    $64, CX      // len-=64
	CMPQ    CX, $64      // Less than 64 bytes left?
	JGE     loopback64
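
	/* Each pass above folds the 64 buffered bytes ahead over the next 64
	   bytes of input: every 128-bit word X becomes
	       clmul(lo64(X), lo64(r2r1)) XOR clmul(hi64(X), hi64(r2r1)) XOR next
	   where clmul is a carry-less multiply. Because the r2r1 halves are
	   x^k mod P(x) constants, this acts as a CRC "shift" over the bytes
	   that were skipped. */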

	/* Fold result into a single register (X1) */
remain64:
	MOVOA       r4r3<>+0(SB), X0

	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X2, X1

	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X3, X1

	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X4, X1

	/* If there are fewer than 16 bytes left, we are done */
	CMPQ        CX, $16
	JB          finish

	/* Encode 16 bytes */
remain16:
	MOVOU       (SI), X10
	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X10, X1
	SUBQ        $16, CX
	ADDQ        $16, SI
	CMPQ        CX, $16
	JGE         remain16

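	/* The code below folds the 128-bit remainder in X1 down to 64 bits
	   (using the high half of r4r3, still in X0), then to 32 bits via r5,
	   and finally performs a Barrett reduction against rupoly; PEXTRD $1
	   pulls the finished CRC out of the second dword of X1. */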
finish:
	/* Fold final result into 32 bits and return it */
	PCMPEQB     X3, X3
	PCLMULQDQ   $1, X1, X0
	PSRLDQ      $8, X1
	PXOR        X0, X1

	MOVOA       X1, X2
	MOVQ        r5<>+0(SB), X0

	/* Creates 32 bit mask. Note that we don't care about upper half. */
	PSRLQ       $32, X3

	PSRLDQ      $4, X2
	PAND        X3, X1
	PCLMULQDQ   $0, X0, X1
	PXOR        X2, X1

	MOVOA       rupoly<>+0(SB), X0

	MOVOA       X1, X2
	PAND        X3, X1
	PCLMULQDQ   $0x10, X0, X1
	PAND        X3, X1
	PCLMULQDQ   $0, X0, X1
	PXOR        X2, X1

	PEXTRD      $1, X1, AX
	MOVL        AX, ret+32(FP)

	RET
