
Text file src/runtime/memmove_amd64.s

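runtime·memmove is the bulk-copy primitive behind Go's copy builtin and large value assignments. Whether a particular statement actually compiles down to a call of this routine is a compiler detail, so the short Go sketch below (all names are placeholders) only illustrates the kinds of operations that typically end up here:

	package main

	import "fmt"

	// buf is deliberately large; assigning a value of this type is the kind
	// of bulk copy the compiler typically lowers to a runtime.memmove call.
	type buf struct{ data [4096]byte }

	func main() {
		src := []byte("the quick brown fox jumps over the lazy dog")
		dst := make([]byte, len(src))
		n := copy(dst, src) // slice copies also use memmove semantics

		var a, b buf
		b.data[0] = 1
		a = b // large struct assignment is another bulk copy

		fmt.Println(n, string(dst), a.data[0])
	}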

     1	// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
     2	// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
     3	//
     4	//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     5	//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6	//         Portions Copyright 2009 The Go Authors. All rights reserved.
     7	//
     8	// Permission is hereby granted, free of charge, to any person obtaining a copy
     9	// of this software and associated documentation files (the "Software"), to deal
    10	// in the Software without restriction, including without limitation the rights
    11	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12	// copies of the Software, and to permit persons to whom the Software is
    13	// furnished to do so, subject to the following conditions:
    14	//
    15	// The above copyright notice and this permission notice shall be included in
    16	// all copies or substantial portions of the Software.
    17	//
    18	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24	// THE SOFTWARE.
    25	
    26	// +build !plan9
    27	
    28	#include "textflag.h"
    29	
    30	// func memmove(to, from unsafe.Pointer, n uintptr)
    31	TEXT runtime·memmove(SB), NOSPLIT, $0-24
    32	
    33		MOVQ	to+0(FP), DI
    34		MOVQ	from+8(FP), SI
    35		MOVQ	n+16(FP), BX
    36	
    37		// REP instructions have a high startup cost, so we handle small sizes
    38		// with some straightline code. The REP MOVSQ instruction is really fast
    39		// for large sizes. The cutover is approximately 2K.
    40	tail:
    41		// move_129through256 or smaller work whether or not the source and the
    42		// destination memory regions overlap because they load all data into
    43		// registers before writing it back.  move_256through2048 on the other
    44		// hand can be used only when the memory regions don't overlap or the copy
    45		// direction is forward.
    46		//
     47		// A BSR+branch table makes almost all memmove/memclr benchmarks worse; not worth doing.
    48		TESTQ	BX, BX
    49		JEQ	move_0
    50		CMPQ	BX, $2
    51		JBE	move_1or2
    52		CMPQ	BX, $4
    53		JB	move_3
    54		JBE	move_4
    55		CMPQ	BX, $8
    56		JB	move_5through7
    57		JE	move_8
    58		CMPQ	BX, $16
    59		JBE	move_9through16
    60		CMPQ	BX, $32
    61		JBE	move_17through32
    62		CMPQ	BX, $64
    63		JBE	move_33through64
    64		CMPQ	BX, $128
    65		JBE	move_65through128
    66		CMPQ	BX, $256
    67		JBE	move_129through256
    68	
    69		TESTB	$1, runtime·useAVXmemmove(SB)
    70		JNZ	avxUnaligned
    71	
    72	/*
    73	 * check and set for backwards
    74	 */
    75		CMPQ	SI, DI
    76		JLS	back
    77	
    78	/*
    79	 * forward copy loop
    80	 */
    81	forward:
    82		CMPQ	BX, $2048
    83		JLS	move_256through2048
    84	
    85		// If REP MOVSB isn't fast, don't use it
    86		CMPB	runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
    87		JNE	fwdBy8
    88	
    89		// Check alignment
    90		MOVL	SI, AX
    91		ORL	DI, AX
    92		TESTL	$7, AX
    93		JEQ	fwdBy8
    94	
    95		// Do 1 byte at a time
    96		MOVQ	BX, CX
    97		REP;	MOVSB
    98		RET
    99	
   100	fwdBy8:
   101		// Do 8 bytes at a time
   102		MOVQ	BX, CX
   103		SHRQ	$3, CX
   104		ANDQ	$7, BX
   105		REP;	MOVSQ
   106		JMP	tail
   107	
   108	back:
   109	/*
   110	 * check overlap
   111	 */
   112		MOVQ	SI, CX
   113		ADDQ	BX, CX
   114		CMPQ	CX, DI
   115		JLS	forward
   116	/*
   117	 * whole thing backwards has
   118	 * adjusted addresses
   119	 */
   120		ADDQ	BX, DI
   121		ADDQ	BX, SI
   122		STD
   123	
   124	/*
   125	 * copy
   126	 */
   127		MOVQ	BX, CX
   128		SHRQ	$3, CX
   129		ANDQ	$7, BX
   130	
   131		SUBQ	$8, DI
   132		SUBQ	$8, SI
   133		REP;	MOVSQ
   134	
   135		CLD
   136		ADDQ	$8, DI
   137		ADDQ	$8, SI
   138		SUBQ	BX, DI
   139		SUBQ	BX, SI
   140		JMP	tail
   141	
   142	move_1or2:
   143		MOVB	(SI), AX
   144		MOVB	-1(SI)(BX*1), CX
   145		MOVB	AX, (DI)
   146		MOVB	CX, -1(DI)(BX*1)
   147		RET
   148	move_0:
   149		RET
   150	move_4:
   151		MOVL	(SI), AX
   152		MOVL	AX, (DI)
   153		RET
   154	move_3:
   155		MOVW	(SI), AX
   156		MOVB	2(SI), CX
   157		MOVW	AX, (DI)
   158		MOVB	CX, 2(DI)
   159		RET
   160	move_5through7:
   161		MOVL	(SI), AX
   162		MOVL	-4(SI)(BX*1), CX
   163		MOVL	AX, (DI)
   164		MOVL	CX, -4(DI)(BX*1)
   165		RET
   166	move_8:
   167		// We need a separate case for 8 to make sure we write pointers atomically.
   168		MOVQ	(SI), AX
   169		MOVQ	AX, (DI)
   170		RET
   171	move_9through16:
   172		MOVQ	(SI), AX
   173		MOVQ	-8(SI)(BX*1), CX
   174		MOVQ	AX, (DI)
   175		MOVQ	CX, -8(DI)(BX*1)
   176		RET
   177	move_17through32:
   178		MOVOU	(SI), X0
   179		MOVOU	-16(SI)(BX*1), X1
   180		MOVOU	X0, (DI)
   181		MOVOU	X1, -16(DI)(BX*1)
   182		RET
   183	move_33through64:
   184		MOVOU	(SI), X0
   185		MOVOU	16(SI), X1
   186		MOVOU	-32(SI)(BX*1), X2
   187		MOVOU	-16(SI)(BX*1), X3
   188		MOVOU	X0, (DI)
   189		MOVOU	X1, 16(DI)
   190		MOVOU	X2, -32(DI)(BX*1)
   191		MOVOU	X3, -16(DI)(BX*1)
   192		RET
   193	move_65through128:
   194		MOVOU	(SI), X0
   195		MOVOU	16(SI), X1
   196		MOVOU	32(SI), X2
   197		MOVOU	48(SI), X3
   198		MOVOU	-64(SI)(BX*1), X4
   199		MOVOU	-48(SI)(BX*1), X5
   200		MOVOU	-32(SI)(BX*1), X6
   201		MOVOU	-16(SI)(BX*1), X7
   202		MOVOU	X0, (DI)
   203		MOVOU	X1, 16(DI)
   204		MOVOU	X2, 32(DI)
   205		MOVOU	X3, 48(DI)
   206		MOVOU	X4, -64(DI)(BX*1)
   207		MOVOU	X5, -48(DI)(BX*1)
   208		MOVOU	X6, -32(DI)(BX*1)
   209		MOVOU	X7, -16(DI)(BX*1)
   210		RET
   211	move_129through256:
   212		MOVOU	(SI), X0
   213		MOVOU	16(SI), X1
   214		MOVOU	32(SI), X2
   215		MOVOU	48(SI), X3
   216		MOVOU	64(SI), X4
   217		MOVOU	80(SI), X5
   218		MOVOU	96(SI), X6
   219		MOVOU	112(SI), X7
   220		MOVOU	-128(SI)(BX*1), X8
   221		MOVOU	-112(SI)(BX*1), X9
   222		MOVOU	-96(SI)(BX*1), X10
   223		MOVOU	-80(SI)(BX*1), X11
   224		MOVOU	-64(SI)(BX*1), X12
   225		MOVOU	-48(SI)(BX*1), X13
   226		MOVOU	-32(SI)(BX*1), X14
   227		MOVOU	-16(SI)(BX*1), X15
   228		MOVOU	X0, (DI)
   229		MOVOU	X1, 16(DI)
   230		MOVOU	X2, 32(DI)
   231		MOVOU	X3, 48(DI)
   232		MOVOU	X4, 64(DI)
   233		MOVOU	X5, 80(DI)
   234		MOVOU	X6, 96(DI)
   235		MOVOU	X7, 112(DI)
   236		MOVOU	X8, -128(DI)(BX*1)
   237		MOVOU	X9, -112(DI)(BX*1)
   238		MOVOU	X10, -96(DI)(BX*1)
   239		MOVOU	X11, -80(DI)(BX*1)
   240		MOVOU	X12, -64(DI)(BX*1)
   241		MOVOU	X13, -48(DI)(BX*1)
   242		MOVOU	X14, -32(DI)(BX*1)
   243		MOVOU	X15, -16(DI)(BX*1)
   244		RET
   245	move_256through2048:
   246		SUBQ	$256, BX
   247		MOVOU	(SI), X0
   248		MOVOU	16(SI), X1
   249		MOVOU	32(SI), X2
   250		MOVOU	48(SI), X3
   251		MOVOU	64(SI), X4
   252		MOVOU	80(SI), X5
   253		MOVOU	96(SI), X6
   254		MOVOU	112(SI), X7
   255		MOVOU	128(SI), X8
   256		MOVOU	144(SI), X9
   257		MOVOU	160(SI), X10
   258		MOVOU	176(SI), X11
   259		MOVOU	192(SI), X12
   260		MOVOU	208(SI), X13
   261		MOVOU	224(SI), X14
   262		MOVOU	240(SI), X15
   263		MOVOU	X0, (DI)
   264		MOVOU	X1, 16(DI)
   265		MOVOU	X2, 32(DI)
   266		MOVOU	X3, 48(DI)
   267		MOVOU	X4, 64(DI)
   268		MOVOU	X5, 80(DI)
   269		MOVOU	X6, 96(DI)
   270		MOVOU	X7, 112(DI)
   271		MOVOU	X8, 128(DI)
   272		MOVOU	X9, 144(DI)
   273		MOVOU	X10, 160(DI)
   274		MOVOU	X11, 176(DI)
   275		MOVOU	X12, 192(DI)
   276		MOVOU	X13, 208(DI)
   277		MOVOU	X14, 224(DI)
   278		MOVOU	X15, 240(DI)
   279		CMPQ	BX, $256
   280		LEAQ	256(SI), SI
   281		LEAQ	256(DI), DI
   282		JGE	move_256through2048
   283		JMP	tail
   284	
   285	avxUnaligned:
    286		// There are two implementations of the move algorithm.
    287		// The first is for non-overlapping memory regions; it copies forward.
    288		// The second is for overlapping regions; it copies backward.
   289		MOVQ	DI, CX
   290		SUBQ	SI, CX
    291		// Now CX contains the distance between SRC and DEST.
   292		CMPQ	CX, BX
    293		// If the distance is less than the region length, the regions overlap.
   294		JC	copy_backward
   295	
   296		// Non-temporal copy would be better for big sizes.
   297		CMPQ	BX, $0x100000
   298		JAE	gobble_big_data_fwd
   299	
   300		// Memory layout on the source side
   301		// SI                                       CX
   302		// |<---------BX before correction--------->|
   303		// |       |<--BX corrected-->|             |
   304		// |       |                  |<--- AX  --->|
   305		// |<-R11->|                  |<-128 bytes->|
   306		// +----------------------------------------+
   307		// | Head  | Body             | Tail        |
   308		// +-------+------------------+-------------+
   309		// ^       ^                  ^
   310		// |       |                  |
   311		// Save head into Y4          Save tail into X5..X12
   312		//         |
   313		//         SI+R11, where R11 = ((DI & -32) + 32) - DI
   314		// Algorithm:
   315		// 1. Unaligned save of the tail's 128 bytes
    316		// 2. Unaligned save of the head's 32 bytes
    317		// 3. Destination-aligned copying of body (128 bytes per iteration)
    318		// 4. Store the head at its final location
    319		// 5. Store the tail at its final location
    320		// For small sizes it is important to keep the processor's pipeline busy,
    321		// because the cost of copying the unaligned head and tail is comparable
    322		// to the cost of the main loop, so the code below interleaves the two.
    323		// A cleaner implementation of the same algorithm, for larger sizes where
    324		// the cost of the unaligned parts is negligible, can be found after the
    325		// gobble_big_data_fwd label.
   326		LEAQ	(SI)(BX*1), CX
   327		MOVQ	DI, R10
    328		// CX points to the end of the buffer, so the tail is addressed with negative offsets.
   329		MOVOU	-0x80(CX), X5
   330		MOVOU	-0x70(CX), X6
   331		MOVQ	$0x80, AX
   332		// Align destination address
   333		ANDQ	$-32, DI
   334		ADDQ	$32, DI
   335		// Continue tail saving.
   336		MOVOU	-0x60(CX), X7
   337		MOVOU	-0x50(CX), X8
    338		// Make R11 the delta between the aligned and unaligned destination addresses.
   339		MOVQ	DI, R11
   340		SUBQ	R10, R11
   341		// Continue tail saving.
   342		MOVOU	-0x40(CX), X9
   343		MOVOU	-0x30(CX), X10
    344		// Adjust the bytes-to-copy count, since the unaligned head is copied separately.
   345		SUBQ	R11, BX
   346		// Continue tail saving.
   347		MOVOU	-0x20(CX), X11
   348		MOVOU	-0x10(CX), X12
    349		// The tail will be stored after the main body has been copied.
    350		// Now handle the unaligned head.
   351		VMOVDQU	(SI), Y4
   352		// Adjust source address to point past head.
   353		ADDQ	R11, SI
   354		SUBQ	AX, BX
    355		// Destination-aligned copying of the body.
   356	gobble_128_loop:
   357		VMOVDQU	(SI), Y0
   358		VMOVDQU	0x20(SI), Y1
   359		VMOVDQU	0x40(SI), Y2
   360		VMOVDQU	0x60(SI), Y3
   361		ADDQ	AX, SI
   362		VMOVDQA	Y0, (DI)
   363		VMOVDQA	Y1, 0x20(DI)
   364		VMOVDQA	Y2, 0x40(DI)
   365		VMOVDQA	Y3, 0x60(DI)
   366		ADDQ	AX, DI
   367		SUBQ	AX, BX
   368		JA	gobble_128_loop
   369		// Now we can store unaligned parts.
   370		ADDQ	AX, BX
   371		ADDQ	DI, BX
   372		VMOVDQU	Y4, (R10)
   373		VZEROUPPER
   374		MOVOU	X5, -0x80(BX)
   375		MOVOU	X6, -0x70(BX)
   376		MOVOU	X7, -0x60(BX)
   377		MOVOU	X8, -0x50(BX)
   378		MOVOU	X9, -0x40(BX)
   379		MOVOU	X10, -0x30(BX)
   380		MOVOU	X11, -0x20(BX)
   381		MOVOU	X12, -0x10(BX)
   382		RET
   383	
   384	gobble_big_data_fwd:
    385		// Forward copying for large regions.
    386		// It uses non-temporal move instructions.
    387		// The algorithm is the same as the one commented above for small sizes.
   388		LEAQ	(SI)(BX*1), CX
   389		MOVOU	-0x80(SI)(BX*1), X5
   390		MOVOU	-0x70(CX), X6
   391		MOVOU	-0x60(CX), X7
   392		MOVOU	-0x50(CX), X8
   393		MOVOU	-0x40(CX), X9
   394		MOVOU	-0x30(CX), X10
   395		MOVOU	-0x20(CX), X11
   396		MOVOU	-0x10(CX), X12
   397		VMOVDQU	(SI), Y4
   398		MOVQ	DI, R8
   399		ANDQ	$-32, DI
   400		ADDQ	$32, DI
   401		MOVQ	DI, R10
   402		SUBQ	R8, R10
   403		SUBQ	R10, BX
   404		ADDQ	R10, SI
   405		LEAQ	(DI)(BX*1), CX
   406		SUBQ	$0x80, BX
   407	gobble_mem_fwd_loop:
   408		PREFETCHNTA 0x1C0(SI)
   409		PREFETCHNTA 0x280(SI)
    410		// The prefetch distances were chosen empirically.
    411		// The approach to prefetch usage follows section 7.6.6 of [1].
   412		// [1] 64-ia-32-architectures-optimization-manual.pdf
   413		// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
   414		VMOVDQU	(SI), Y0
   415		VMOVDQU	0x20(SI), Y1
   416		VMOVDQU	0x40(SI), Y2
   417		VMOVDQU	0x60(SI), Y3
   418		ADDQ	$0x80, SI
   419		VMOVNTDQ Y0, (DI)
   420		VMOVNTDQ Y1, 0x20(DI)
   421		VMOVNTDQ Y2, 0x40(DI)
   422		VMOVNTDQ Y3, 0x60(DI)
   423		ADDQ	$0x80, DI
   424		SUBQ	$0x80, BX
   425		JA		gobble_mem_fwd_loop
    426		// Non-temporal stores don't follow the normal cache-coherency rules, so an
    427		// SFENCE is needed here to make the copied data visible in a timely manner.
   428		SFENCE
   429		VMOVDQU	Y4, (R8)
   430		VZEROUPPER
   431		MOVOU	X5, -0x80(CX)
   432		MOVOU	X6, -0x70(CX)
   433		MOVOU	X7, -0x60(CX)
   434		MOVOU	X8, -0x50(CX)
   435		MOVOU	X9, -0x40(CX)
   436		MOVOU	X10, -0x30(CX)
   437		MOVOU	X11, -0x20(CX)
   438		MOVOU	X12, -0x10(CX)
   439		RET
   440	
   441	copy_backward:
   442		MOVQ	DI, AX
    443		// Backward copying mirrors the forward case.
    444		// First, save the unaligned tail, which lies at the beginning of the region.
   445		MOVOU	(SI), X5
   446		MOVOU	0x10(SI), X6
   447		ADDQ	BX, DI
   448		MOVOU	0x20(SI), X7
   449		MOVOU	0x30(SI), X8
   450		LEAQ	-0x20(DI), R10
   451		MOVQ	DI, R11
   452		MOVOU	0x40(SI), X9
   453		MOVOU	0x50(SI), X10
   454		ANDQ	$0x1F, R11
   455		MOVOU	0x60(SI), X11
   456		MOVOU	0x70(SI), X12
   457		XORQ	R11, DI
    458		// Point SI to the end of the region
   459		ADDQ	BX, SI
    460		// and load the unaligned head into Y4.
   461		VMOVDQU	-0x20(SI), Y4
   462		SUBQ	R11, SI
   463		SUBQ	R11, BX
    464		// If there is enough data for non-temporal moves, go to the special loop.
   465		CMPQ	BX, $0x100000
   466		JA		gobble_big_data_bwd
   467		SUBQ	$0x80, BX
   468	gobble_mem_bwd_loop:
   469		VMOVDQU	-0x20(SI), Y0
   470		VMOVDQU	-0x40(SI), Y1
   471		VMOVDQU	-0x60(SI), Y2
   472		VMOVDQU	-0x80(SI), Y3
   473		SUBQ	$0x80, SI
   474		VMOVDQA	Y0, -0x20(DI)
   475		VMOVDQA	Y1, -0x40(DI)
   476		VMOVDQA	Y2, -0x60(DI)
   477		VMOVDQA	Y3, -0x80(DI)
   478		SUBQ	$0x80, DI
   479		SUBQ	$0x80, BX
   480		JA		gobble_mem_bwd_loop
    481		// Store the unaligned parts.
   482		VMOVDQU	Y4, (R10)
   483		VZEROUPPER
   484		MOVOU	X5, (AX)
   485		MOVOU	X6, 0x10(AX)
   486		MOVOU	X7, 0x20(AX)
   487		MOVOU	X8, 0x30(AX)
   488		MOVOU	X9, 0x40(AX)
   489		MOVOU	X10, 0x50(AX)
   490		MOVOU	X11, 0x60(AX)
   491		MOVOU	X12, 0x70(AX)
   492		RET
   493	
   494	gobble_big_data_bwd:
   495		SUBQ	$0x80, BX
   496	gobble_big_mem_bwd_loop:
   497		PREFETCHNTA -0x1C0(SI)
   498		PREFETCHNTA -0x280(SI)
   499		VMOVDQU	-0x20(SI), Y0
   500		VMOVDQU	-0x40(SI), Y1
   501		VMOVDQU	-0x60(SI), Y2
   502		VMOVDQU	-0x80(SI), Y3
   503		SUBQ	$0x80, SI
   504		VMOVNTDQ	Y0, -0x20(DI)
   505		VMOVNTDQ	Y1, -0x40(DI)
   506		VMOVNTDQ	Y2, -0x60(DI)
   507		VMOVNTDQ	Y3, -0x80(DI)
   508		SUBQ	$0x80, DI
   509		SUBQ	$0x80, BX
   510		JA	gobble_big_mem_bwd_loop
   511		SFENCE
   512		VMOVDQU	Y4, (R10)
   513		VZEROUPPER
   514		MOVOU	X5, (AX)
   515		MOVOU	X6, 0x10(AX)
   516		MOVOU	X7, 0x20(AX)
   517		MOVOU	X8, 0x30(AX)
   518		MOVOU	X9, 0x40(AX)
   519		MOVOU	X10, 0x50(AX)
   520		MOVOU	X11, 0x60(AX)
   521		MOVOU	X12, 0x70(AX)
   522		RET
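
The forward/backward split above exists so that overlapping copies are correct in both directions. Below is a minimal Go illustration of the case the backward path protects against, where the destination starts inside and ahead of the source, so a naive forward byte-at-a-time copy would reread bytes it had already overwritten (copy is simply the usual way such a memmove is reached from Go):

	package main

	import "fmt"

	func main() {
		s := []byte("abcdefgh")
		// Source s[:6] and destination s[2:] overlap, and the destination
		// lies ahead of the source; memmove semantics copy as if through a
		// temporary buffer, which is what the backward path guarantees.
		copy(s[2:], s[:6])
		fmt.Println(string(s)) // prints "ababcdef"
	}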
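
In the AVX paths the destination is aligned to a 32-byte boundary and R11 holds the head length, computed as ((DI & -32) + 32) - DI. The same arithmetic in Go, with a hypothetical headLen helper, just to make the offset computation concrete (note it yields 32, not 0, for an already aligned destination, matching the ANDQ/ADDQ pair above):

	package main

	import "fmt"

	// headLen mirrors the R11 computation: the number of bytes consumed by the
	// unaligned head, i.e. the distance from dst up to the next 32-byte
	// boundary, with an already aligned dst rounding up to the following one.
	func headLen(dst uintptr) uintptr {
		return (dst&^31 + 32) - dst
	}

	func main() {
		for _, dst := range []uintptr{0x1000, 0x1001, 0x101f, 0x1020} {
			fmt.Printf("dst=%#x head=%d\n", dst, headLen(dst))
		}
	}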
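
The small-size cutovers, the ~2 KB REP MOVSQ threshold, and the 1 MB (0x100000) switch to non-temporal stores were all chosen with benchmarks. Here is a rough go test sketch in the same spirit, timing copy throughput at sizes on both sides of those thresholds (the package name is a placeholder; this is not the runtime's own benchmark):

	package memmovebench

	import (
		"fmt"
		"testing"
	)

	// BenchmarkCopy reports copy throughput at sizes chosen to straddle the
	// straight-line/REP cutover (~2 KB) and the non-temporal threshold (1 MB).
	func BenchmarkCopy(b *testing.B) {
		for _, size := range []int{256, 2 << 10, 64 << 10, 1 << 20, 4 << 20} {
			src := make([]byte, size)
			dst := make([]byte, size)
			b.Run(fmt.Sprintf("%dB", size), func(b *testing.B) {
				b.SetBytes(int64(size))
				for i := 0; i < b.N; i++ {
					copy(dst, src)
				}
			})
		}
	}

Run with: go test -bench=Copy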
