// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// Helper names for x-form loads in BE ordering.
// On little-endian targets the byte-reversed load forms are selected, so the
// loaded value always has the first byte in memory in the most significant
// position. An unsigned integer compare of two such values then orders them
// the same way a byte-by-byte compare would.
#ifdef GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#endif

// SETB_CRn(rout) sets rout to -1, 0 or 1 when CRn holds a less-than, equal
// or greater-than compare result respectively.
// SETB_INIT() prepares whatever constants the SETB helpers need; it expands
// to nothing on power9 and newer, where the SETB instruction is used.
#ifdef GOPPC64_power9
#define SETB_CR0(rout) SETB CR0, rout
#define SETB_CR1(rout) SETB CR1, rout
#define SETB_INIT()
#define SETB_CR0_NE(rout) SETB_CR0(rout)
#else
// A helper macro to emulate SETB on P8. This assumes
// -1 is in R20, and 1 is in R21. crxlt and crxeq must
// also be the same CR field.
// The first ISEL yields 0 (equal) or 1 (not equal); the second
// overrides that with -1 when the less-than bit is set.
#define _SETB(crxlt, crxeq, rout) \
	ISEL	crxeq,R0,R21,rout \
	ISEL	crxlt,R20,rout,rout

// A special case when it is known the comparison
// will always be not equal. The result must be -1 or 1.
#define SETB_CR0_NE(rout) \
	ISEL	CR0LT,R20,R21,rout

#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
#define SETB_INIT() \
	MOVD	$-1,R20 \
	MOVD	$1,R21
#endif

// Compare performs a three-way comparison of two byte sequences a and b and
// leaves -1, 0 or +1 in R3.
//
// The Go declaration is not in this file; presumably
// func Compare(a, b []byte) int (the 56-byte argument area matches two
// slice headers plus one word of result) - confirm against the Go stub.
//
// The body takes its arguments from registers and returns in R3 without
// touching any FP-relative argument slot, so the symbol must be declared
// with the register-based internal ABI. Without the ABI selector it would be
// an ABI0 (stack argument) function and would read undefined registers.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// incoming:
	// R3 a addr
	// R4 a len
	// R6 b addr
	// R7 b len
	//
	// on entry to cmpbody:
	// R3 return value if len(a) == len(b)
	// R5 a addr
	// R6 b addr
	// R9 min(len(a),len(b))
	SETB_INIT()
	MOVD	R3,R5		// a addr moves to R5; R3 becomes the result register
	CMP	R4,R7,CR0	// CR0 = len(a) ? len(b)
	CMP	R3,R6,CR7	// CR7 = a addr ? b addr
	ISEL	CR0LT,R4,R7,R9	// R9 = min(len(a),len(b))
	SETB_CR0(R3)		// R3 = -1/0/1 from the length compare
	BC	$12,30,LR	// beqlr cr7: same address, the lengths alone decide
	BR	cmpbody<>(SB)

// cmpstring is the same three-way comparison for the runtime package, with
// the operands arriving in R3..R6 as listed below. Result in R3.
//
// Presumably func cmpstring(a, b string) int (40-byte argument area) -
// confirm against the Go declaration. As with Compare, the register-based
// argument handling requires the internal ABI selector on the symbol.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// incoming:
	// R3 a addr -> R5
	// R4 a len  -> R3
	// R5 b addr -> R6
	// R6 b len  -> R4
	//
	// on entry to cmpbody:
	// R3 compare value if compared length is same.
	// R5 a addr
	// R6 b addr
	// R9 min(len(a),len(b))
	SETB_INIT()
	CMP	R4,R6,CR0	// CR0 = len(a) ? len(b)
	CMP	R3,R5,CR7	// CR7 = a addr ? b addr
	ISEL	CR0LT,R4,R6,R9	// R9 = min(len(a),len(b)); must precede the R6 overwrite
	MOVD	R5,R6		// b addr -> R6
	MOVD	R3,R5		// a addr -> R5
	SETB_CR0(R3)		// R3 = -1/0/1 from the length compare
	BC	$12,30,LR	// beqlr cr7: same address, the lengths alone decide
	BR	cmpbody<>(SB)

#ifdef GOARCH_ppc64le
// Permute control loaded into SWAP at label "different". It reverses the
// bytes within each doubleword of V3/V4 so that, on little-endian, the
// doublewords extracted afterwards compare in memory (big-endian) byte order.
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif

// cmpbody compares the first R9 bytes at R5 and R6.
//
// In:
//	R3  result to return when the compared bytes are all equal
//	R5  a addr
//	R6  b addr
//	R9  number of bytes to compare (min of the two lengths)
//	R20/R21 hold -1/1 when SETB is emulated (set by SETB_INIT in the callers)
// Out:
//	R3  -1, 0 or +1
// Scratch as used below: R4, R5, R6, R9, R10, R11, R12, R14, R16, CTR,
// CR0-CR3, CR6, V1, V3, V4 (and SWAP/V21 on little-endian).
//
// Strategy: dispatch on size. Vector paths compare 16 bytes at a time with
// VCMPEQUDCC, which records in CR6 whether all elements matched; BGE CR6 is
// taken when they did not. Tails are handled by re-reading the final bytes
// with a load that overlaps data already checked.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
start:
	CMP	R9,$16,CR0
	CMP	R9,$32,CR1
	CMP	R9,$64,CR2
	MOVD	$16,R10		// R10 = offset 16, shared by the vector paths
	BLT	cmp8		// < 16B
	BLT	CR1,cmp16	// < 32B
	BLT	CR2,cmp32	// < 64B

cmp64:	// >= 64B
	DCBT	(R5)		// optimize for size>=64
	DCBT	(R6)		// cache hint

	SRD	$6,R9,R14	// There is at least one iteration.
	MOVD	R14,CTR		// CTR = number of 64B chunks
	ANDCC	$63,R9,R9	// R9 = tail length; CR0 EQ when there is no tail
	CMP	R9,$16,CR1	// Do setup for tail check early on.
	CMP	R9,$32,CR2
	CMP	R9,$48,CR3
	ADD	$-16,R9,R9	// R9 = offset of the last 16 tail bytes (may be negative)

	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

	PCALIGN	$16
cmp64_loop:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different	// jump out if they differ

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	BDNZ	cmp64_loop
	BC	$12,2,LR	// beqlr: CR0 from the ANDCC above, no tail left

	// Finish out tail with minimal overlapped checking.
	// Note, 0 tail is handled by beqlr above.
	BLE	CR1,cmp64_tail_gt0
	BLE	CR2,cmp64_tail_gt16
	BLE	CR3,cmp64_tail_gt32

cmp64_tail_gt48: // 49 - 63 B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R10),V3
	LXVD2X	(R6)(R10),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3
	LXVD2X	(R6)(R11),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BR	cmp64_tail_gt0

	PCALIGN	$16
cmp64_tail_gt32: // 33 - 48B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R10),V3
	LXVD2X	(R6)(R10),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BR	cmp64_tail_gt0

	PCALIGN	$16
cmp64_tail_gt16: // 17 - 32B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BR	cmp64_tail_gt0

	PCALIGN	$16
cmp64_tail_gt0: // 1 - 16B
	// R9 = tail - 16, so this reads the final 16 bytes of the tail.
	LXVD2X	(R5)(R9),V3
	LXVD2X	(R6)(R9),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	RET

	PCALIGN	$16
cmp32:	// 32 - 63B
	ANDCC	$31,R9,R9	// R9 = len - 32; CR0 EQ when len is exactly 32

	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R10)(R5),V3
	LXVD2X	(R10)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BC	$12,2,LR	// beqlr
	ADD	R9,R10,R10	// R10 = R9 + 16

	// Compare the last 32 bytes, at offsets R9 and R9+16.
	LXVD2X	(R9)(R5),V3
	LXVD2X	(R9)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R10)(R5),V3
	LXVD2X	(R10)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	RET

	PCALIGN	$16
cmp16:	// 16 - 31B
	ANDCC	$15,R9,R9	// R9 = len - 16; CR0 EQ when len is exactly 16
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	BC	$12,2,LR	// beqlr

	// Compare the last 16 bytes, at offset R9.
	LXVD2X	(R9)(R5),V3
	LXVD2X	(R9)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	RET

	PCALIGN	$16
different:
	// V3 and V4 hold the 16-byte chunks of A and B that are not equal.
#ifdef GOARCH_ppc64le
	MOVD	$byteswap<>+00(SB),R16
	LXVD2X	(R16)(R0),SWAP	// Set up swap string

	VPERM	V3,V3,SWAP,V3
	VPERM	V4,V4,SWAP,V4
#endif

	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BEQ	lower
	SETB_CR0_NE(R3)
	RET

	PCALIGN	$16
lower:
	// The upper doublewords matched, so the lower ones must differ.
	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
	MFVSRD	VS35,R16
	VSLDOI	$8,V4,V4,V4
	MFVSRD	VS36,R10

	CMPU	R16,R10
	SETB_CR0_NE(R3)
	RET

	PCALIGN	$16
cmp8:	// 8 - 15B (0 - 15B if GOPPC64_power10)
#ifdef GOPPC64_power10
	SLD	$56,R9,R9	// move the length into the top byte of R9 for LXVLL
	LXVLL	R5,R9,V3	// Load bytes starting from MSB to LSB, unused are zero filled.
	LXVLL	R6,R9,V4
	VCMPUQ	V3,V4,CR0	// Compare as a 128b integer.
	SETB_CR0(R6)
	ISEL	CR0EQ,R3,R6,R3	// If equal, length determines the return value.
	RET
#else
	CMP	R9,$8
	BLT	cmp4
	ANDCC	$7,R9,R9	// R9 = len - 8, offset of the last 8 bytes
	_LDBEX	(R0)(R5),R10	// first 8 bytes of A
	_LDBEX	(R0)(R6),R11	// first 8 bytes of B
	_LDBEX	(R9)(R5),R12	// last 8 bytes of A (overlaps the first load)
	_LDBEX	(R9)(R6),R14	// last 8 bytes of B
	CMPU	R10,R11,CR0
	SETB_CR0(R5)
	CMPU	R12,R14,CR1
	SETB_CR1(R6)
	CRAND	CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
	ISEL	CR0EQ,R6,R5,R4	// first halves equal: take the second result, else the first
	ISEL	CR1EQ,R3,R4,R3
	RET

	PCALIGN	$16
cmp4:	// 4 - 7B
	CMP	R9,$4
	BLT	cmp2
	ANDCC	$3,R9,R9	// R9 = len - 4, offset of the last 4 bytes
	_LWBEX	(R0)(R5),R10
	_LWBEX	(R0)(R6),R11
	_LWBEX	(R9)(R5),R12
	_LWBEX	(R9)(R6),R14
	RLDIMI	$32,R10,$0,R12	// R12 = first word of A in the high half, last word in the low half
	RLDIMI	$32,R11,$0,R14	// same for B
	CMPU	R12,R14
	BR	cmp0

	PCALIGN	$16
cmp2:	// 2 - 3B
	CMP	R9,$2
	BLT	cmp1
	ANDCC	$1,R9,R9	// R9 = len - 2, offset of the last 2 bytes
	_LHBEX	(R0)(R5),R10
	_LHBEX	(R0)(R6),R11
	_LHBEX	(R9)(R5),R12
	_LHBEX	(R9)(R6),R14
	RLDIMI	$32,R10,$0,R12	// first halfword of A into the high half of R12
	RLDIMI	$32,R11,$0,R14	// same for B
	CMPU	R12,R14
	BR	cmp0

	PCALIGN	$16
cmp1:	// 0 - 1B
	CMP	R9,$0
	BEQ	cmp0		// nothing to compare; CR0 EQ makes cmp0 keep R3
	MOVBZ	(R5),R10
	MOVBZ	(R6),R11
	CMPU	R10,R11
cmp0:
	SETB_CR0(R6)
	ISEL	CR0EQ,R3,R6,R3	// equal bytes: keep the length result already in R3
	RET
#endif