...
Run Format

Text file src/runtime/memclr_amd64.s

Documentation: runtime

     1	// Copyright 2014 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !plan9
     6	
     7	#include "textflag.h"
     8	
     9	// NOTE: Windows externalthreadhandler expects memclr to preserve DX.
    10	
    11	// void runtime·memclrNoHeapPointers(void*, uintptr)
    12	TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
    13		MOVQ	ptr+0(FP), DI	// DI = start of the region to clear
    14		MOVQ	n+8(FP), BX	// BX = number of bytes remaining to clear
    15		XORQ	AX, AX	// AX = 0, store source for the 1..16 byte cases
    16	
    17		// MOVOU seems always faster than REP STOSQ.
    18	tail:	// dispatch on remaining length; also re-entered after the SSE 256B loop
    19		TESTQ	BX, BX
    20		JEQ	_0
    21		CMPQ	BX, $2
    22		JBE	_1or2
    23		CMPQ	BX, $4
    24		JBE	_3or4
    25		CMPQ	BX, $8
    26		JB	_5through7
    27		JE	_8	// exactly 8 gets its own case to keep pointer clears atomic
    28		CMPQ	BX, $16
    29		JBE	_9through16
    30		PXOR	X0, X0	// X0 = 0, store source for all SSE cases below
    31		CMPQ	BX, $32
    32		JBE	_17through32
    33		CMPQ	BX, $64
    34		JBE	_33through64
    35		CMPQ	BX, $128
    36		JBE	_65through128
    37		CMPQ	BX, $256
    38		JBE	_129through256
    39		CMPB	runtime·support_avx2(SB), $1	// >256 bytes: prefer AVX2 path if available
    40		JE loop_preheader_avx2
    41		// TODO: use branch table and BSR to make this just a single dispatch
    42		// TODO: for really big clears, use MOVNTDQ, even without AVX2.
    43	
    44	loop:	// SSE fallback: clear 256 bytes per iteration with unaligned 16B stores
    45		MOVOU	X0, 0(DI)
    46		MOVOU	X0, 16(DI)
    47		MOVOU	X0, 32(DI)
    48		MOVOU	X0, 48(DI)
    49		MOVOU	X0, 64(DI)
    50		MOVOU	X0, 80(DI)
    51		MOVOU	X0, 96(DI)
    52		MOVOU	X0, 112(DI)
    53		MOVOU	X0, 128(DI)
    54		MOVOU	X0, 144(DI)
    55		MOVOU	X0, 160(DI)
    56		MOVOU	X0, 176(DI)
    57		MOVOU	X0, 192(DI)
    58		MOVOU	X0, 208(DI)
    59		MOVOU	X0, 224(DI)
    60		MOVOU	X0, 240(DI)
    61		SUBQ	$256, BX
    62		ADDQ	$256, DI
    63		CMPQ	BX, $256
    64		JAE	loop
    65		JMP	tail	// 0..255 bytes left; redispatch to one of the small cases
    66	
    67	loop_preheader_avx2:
    68		VPXOR Y0, Y0, Y0	// Y0 = 0, store source for both AVX2 paths
    69		// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
    70		// For larger sizes it is always faster, even on dual Xeons with 30M cache.
    71		// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
    72		CMPQ    BX, $0x2000000	// >= 32MB: bypass the cache with non-temporal stores
    73		JAE     loop_preheader_avx2_huge
    74	loop_avx2:	// clear 128 bytes per iteration while at least 128 remain
    75		VMOVDQU	Y0, 0(DI)
    76		VMOVDQU	Y0, 32(DI)
    77		VMOVDQU	Y0, 64(DI)
    78		VMOVDQU	Y0, 96(DI)
    79		SUBQ	$128, BX
    80		ADDQ	$128, DI
    81		CMPQ	BX, $128
    82		JAE	loop_avx2
    83		VMOVDQU  Y0, -32(DI)(BX*1)	// BX < 128 here; these four stores clear the
    84		VMOVDQU  Y0, -64(DI)(BX*1)	// last BX bytes, overlapping backwards into
    85		VMOVDQU  Y0, -96(DI)(BX*1)	// memory the loop already zeroed (safe: at
    86		VMOVDQU  Y0, -128(DI)(BX*1)	// least one 128B iteration ran before this)
    87		VZEROUPPER	// clear upper YMM state to avoid AVX/SSE transition penalties in callers
    88		RET
    89	loop_preheader_avx2_huge:
    90		// Align to 32 byte boundary
    91		VMOVDQU  Y0, 0(DI)	// clear the (possibly unaligned) first 32 bytes
    92		MOVQ	DI, SI	// SI = original pointer
    93		ADDQ	$32, DI
    94		ANDQ	$~31, DI	// DI = round up to the next 32-byte boundary
    95		SUBQ	DI, SI	// SI = -(bytes skipped by alignment), in [-32, -1]
    96		ADDQ	SI, BX	// shrink BX by the bytes already cleared before new DI
    97	loop_avx2_huge:	// clear 128 bytes per iteration with cache-bypassing stores
    98		VMOVNTDQ	Y0, 0(DI)
    99		VMOVNTDQ	Y0, 32(DI)
   100		VMOVNTDQ	Y0, 64(DI)
   101		VMOVNTDQ	Y0, 96(DI)
   102		SUBQ	$128, BX
   103		ADDQ	$128, DI
   104		CMPQ	BX, $128
   105		JAE	loop_avx2_huge
   106		// In the description of MOVNTDQ in [1]
   107		// "... fencing operation implemented with the SFENCE or MFENCE instruction
   108		// should be used in conjunction with MOVNTDQ instructions..."
   109		// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
   110		SFENCE
   111		VMOVDQU  Y0, -32(DI)(BX*1)	// clear the final BX (< 128) bytes with
   112		VMOVDQU  Y0, -64(DI)(BX*1)	// overlapping regular stores, as in loop_avx2
   113		VMOVDQU  Y0, -96(DI)(BX*1)
   114		VMOVDQU  Y0, -128(DI)(BX*1)
   115		VZEROUPPER	// clear upper YMM state before returning to SSE/scalar code
   116		RET
   117	
   118	_1or2:	// overlapping first/last byte stores cover both the 1- and 2-byte cases
   119		MOVB	AX, (DI)
   120		MOVB	AX, -1(DI)(BX*1)
   121		RET
   122	_0:
   123		RET
   124	_3or4:	// overlapping first/last 2-byte stores cover 3 or 4 bytes
   125		MOVW	AX, (DI)
   126		MOVW	AX, -2(DI)(BX*1)
   127		RET
   128	_5through7:	// overlapping first/last 4-byte stores cover 5..7 bytes
   129		MOVL	AX, (DI)
   130		MOVL	AX, -4(DI)(BX*1)
   131		RET
   132	_8:
   133		// We need a separate case for 8 to make sure we clear pointers atomically.
   134		MOVQ	AX, (DI)
   135		RET
   136	_9through16:	// overlapping first/last 8-byte stores cover 9..16 bytes
   137		MOVQ	AX, (DI)
   138		MOVQ	AX, -8(DI)(BX*1)
   139		RET
   140	_17through32:	// overlapping first/last 16-byte stores cover 17..32 bytes
   141		MOVOU	X0, (DI)
   142		MOVOU	X0, -16(DI)(BX*1)
   143		RET
   144	_33through64:	// first 32 + last 32 bytes (overlapping) cover 33..64 bytes
   145		MOVOU	X0, (DI)
   146		MOVOU	X0, 16(DI)
   147		MOVOU	X0, -32(DI)(BX*1)
   148		MOVOU	X0, -16(DI)(BX*1)
   149		RET
   150	_65through128:	// first 64 + last 64 bytes (overlapping) cover 65..128 bytes
   151		MOVOU	X0, (DI)
   152		MOVOU	X0, 16(DI)
   153		MOVOU	X0, 32(DI)
   154		MOVOU	X0, 48(DI)
   155		MOVOU	X0, -64(DI)(BX*1)
   156		MOVOU	X0, -48(DI)(BX*1)
   157		MOVOU	X0, -32(DI)(BX*1)
   158		MOVOU	X0, -16(DI)(BX*1)
   159		RET
   160	_129through256:	// first 128 + last 128 bytes (overlapping) cover 129..256 bytes
   161		MOVOU	X0, (DI)
   162		MOVOU	X0, 16(DI)
   163		MOVOU	X0, 32(DI)
   164		MOVOU	X0, 48(DI)
   165		MOVOU	X0, 64(DI)
   166		MOVOU	X0, 80(DI)
   167		MOVOU	X0, 96(DI)
   168		MOVOU	X0, 112(DI)
   169		MOVOU	X0, -128(DI)(BX*1)
   170		MOVOU	X0, -112(DI)(BX*1)
   171		MOVOU	X0, -96(DI)(BX*1)
   172		MOVOU	X0, -80(DI)(BX*1)
   173		MOVOU	X0, -64(DI)(BX*1)
   174		MOVOU	X0, -48(DI)(BX*1)
   175		MOVOU	X0, -32(DI)(BX*1)
   176		MOVOU	X0, -16(DI)(BX*1)
   177		RET

View as plain text