Text file src/runtime/memclr_amd64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI	// DI = ptr
	MOVQ	n+8(FP), BX	// BX = n, bytes remaining to clear
	XORQ	AX, AX	// AX = 0, the zero value used by the scalar stores below

	// MOVOU seems to always be faster than REP STOSQ.
tail:
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	PXOR	X0, X0	// X0 = 0 for the 16-byte SSE stores below
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

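// Clear 256 bytes per iteration with sixteen unaligned 16-byte SSE stores,
// then jump back to tail to dispatch whatever remainder (< 256 bytes) is left.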
loop:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail

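// AVX2 path (BX > 256 and the CPU supports AVX2): clear 128 bytes per
// iteration with 32-byte stores; clears of 32MB or more are redirected to
// the non-temporal variant below.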
loop_preheader_avx2:
	VPXOR Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30MB of cache.
	// TODO: take into account the actual LLC size; e.g. glibc uses LLC size/2.
	CMPQ    BX, $0x2000000	// 32MB
	JAE     loop_preheader_avx2_huge
loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// Clear the last 128 bytes, ending exactly at DI+BX; these stores may
	// overlap bytes already cleared by the loop above.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET
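// Huge clears use non-temporal stores so that zeroing a very large region
// does not evict the rest of the working set from the caches.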
loop_preheader_avx2_huge:
	// Align DI to a 32-byte boundary.
	VMOVDQU  Y0, 0(DI)	// clear the first (possibly unaligned) 32 bytes
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI	// advance DI to the next 32-byte boundary
	SUBQ	DI, SI		// SI = -(number of bytes skipped)
	ADDQ	SI, BX		// shrink BX by the bytes already cleared above
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// Per the description of MOVNTDQ in [1]:
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Clear the last 128 bytes, ending exactly at DI+BX, with regular stores;
	// they may overlap memory already cleared above.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

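// The cases below clear 1 to 256 bytes with pairs of stores: one block
// anchored at the start of the buffer and one ending exactly at ptr+n
// (addressed as -size(DI)(BX*1)). When n is not a multiple of the store
// width the two blocks simply overlap, so no byte-by-byte loop is needed.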
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	// A pointer-aligned, pointer-sized word is cleared with a single MOVQ so it
	// can never be observed half-cleared.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
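
On the Go side the routine is declared as a plain function (in runtime/stubs.go
for this era of the tree) with the signature shown in the comment above the
TEXT directive. Below is a minimal, portable Go sketch of the same semantics,
useful as a reference when reading the assembly. The name memclrPortable and
the byte-at-a-time loop are illustrative only; the real routine is the assembly
above, which additionally guarantees that pointer-aligned, pointer-sized words
are cleared with a single store (see the _8 case). The sketch uses unsafe.Add,
which requires Go 1.17 or newer.

	package main

	import (
		"fmt"
		"unsafe"
	)

	// memclrPortable zeroes n bytes starting at ptr, one byte at a time.
	func memclrPortable(ptr unsafe.Pointer, n uintptr) {
		for i := uintptr(0); i < n; i++ {
			*(*byte)(unsafe.Add(ptr, i)) = 0
		}
	}

	func main() {
		buf := []byte{1, 2, 3, 4, 5, 6, 7, 8}
		memclrPortable(unsafe.Pointer(&buf[0]), uintptr(len(buf)))
		fmt.Println(buf) // prints [0 0 0 0 0 0 0 0]
	}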
