
Text file src/runtime/memmove_amd64.s

     1	// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
     2	// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
     3	//
     4	//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     5	//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6	//         Portions Copyright 2009 The Go Authors. All rights reserved.
     7	//
     8	// Permission is hereby granted, free of charge, to any person obtaining a copy
     9	// of this software and associated documentation files (the "Software"), to deal
    10	// in the Software without restriction, including without limitation the rights
    11	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12	// copies of the Software, and to permit persons to whom the Software is
    13	// furnished to do so, subject to the following conditions:
    14	//
    15	// The above copyright notice and this permission notice shall be included in
    16	// all copies or substantial portions of the Software.
    17	//
    18	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24	// THE SOFTWARE.
    25	
    26	// +build !plan9
    27	
    28	#include "textflag.h"
    29	
    30	// void runtime·memmove(void*, void*, uintptr)
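// Go prototype (declared in package runtime): func memmove(to, from unsafe.Pointer, n uintptr)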
    31	TEXT runtime·memmove(SB), NOSPLIT, $0-24
    32	
    33		MOVQ	to+0(FP), DI
    34		MOVQ	from+8(FP), SI
    35		MOVQ	n+16(FP), BX
    36	
    37		// REP instructions have a high startup cost, so we handle small sizes
    38		// with some straightline code. The REP MOVSQ instruction is really fast
    39		// for large sizes. The cutover is approximately 2K.
    40	tail:
    41		// move_129through256 or smaller work whether or not the source and the
    42		// destination memory regions overlap because they load all data into
    43		// registers before writing it back.  move_256through2048 on the other
    44		// hand can be used only when the memory regions don't overlap or the copy
    45		// direction is forward.
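	// Roughly, the size dispatch below corresponds to this Go-style sketch
	// (illustrative only; the helper names mirror the labels and are not
	// real functions):
	//
	//	switch {
	//	case n == 0:
	//		return
	//	case n <= 2:
	//		move_1or2(to, from, n)
	//	case n <= 4:
	//		move_3or4(to, from, n)
	//	case n < 8:
	//		move_5through7(to, from, n)
	//	case n == 8:
	//		move_8(to, from)
	//	case n <= 16:
	//		move_9through16(to, from, n)
	//	// ...doubling the size class up to n <= 256 (move_129through256);
	//	// larger sizes fall through to the REP/AVX paths below.
	//	}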
    46		TESTQ	BX, BX
    47		JEQ	move_0
    48		CMPQ	BX, $2
    49		JBE	move_1or2
    50		CMPQ	BX, $4
    51		JBE	move_3or4
    52		CMPQ	BX, $8
    53		JB	move_5through7
    54		JE	move_8
    55		CMPQ	BX, $16
    56		JBE	move_9through16
    57		CMPQ	BX, $32
    58		JBE	move_17through32
    59		CMPQ	BX, $64
    60		JBE	move_33through64
    61		CMPQ	BX, $128
    62		JBE	move_65through128
    63		CMPQ	BX, $256
    64		JBE	move_129through256
    65		// TODO: use branch table and BSR to make this just a single dispatch
    66	
    67		TESTB	$1, runtime·useAVXmemmove(SB)
    68		JNZ	avxUnaligned
    69	
    70	/*
    71	 * check and set for backwards
    72	 */
    73		CMPQ	SI, DI
    74		JLS	back
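	// In Go-style pseudocode, this check plus the overlap check at "back"
	// amount to (dst=DI, src=SI, n=BX; names illustrative):
	//
	//	if uintptr(src) > uintptr(dst) || uintptr(src)+n <= uintptr(dst) {
	//		// no destructive overlap: copy forward
	//	} else {
	//		// dst starts inside the source region: copy backward,
	//		// highest addresses first
	//	}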
    75	
    76	/*
    77	 * forward copy loop
    78	 */
    79	forward:
    80		CMPQ	BX, $2048
    81		JLS	move_256through2048
    82	
    83		// If REP MOVSB isn't fast, don't use it
    84		CMPB	runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
    85		JNE	fwdBy8
    86	
    87		// Check alignment
    88		MOVL	SI, AX
    89		ORL	DI, AX
    90		TESTL	$7, AX
    91		JEQ	fwdBy8
    92	
    93		// Do 1 byte at a time
    94		MOVQ	BX, CX
    95		REP;	MOVSB
    96		RET
    97	
    98	fwdBy8:
    99		// Do 8 bytes at a time
   100		MOVQ	BX, CX
   101		SHRQ	$3, CX
   102		ANDQ	$7, BX
   103		REP;	MOVSQ
   104		JMP	tail
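	// A Go-style sketch of the forward REP path above (hasERMS, repMovsb and
	// repMovsq are illustrative names, not real symbols):
	//
	//	if hasERMS && (uintptr(dst)|uintptr(src))&7 != 0 {
	//		repMovsb(dst, src, n) // byte-granular; fast with enhanced REP MOVSB
	//	} else {
	//		repMovsq(dst, src, n>>3) // n/8 quadwords at a time
	//		n &= 7                   // leftover 0..7 bytes go back through "tail"
	//	}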
   105	
   106	back:
   107	/*
   108	 * check overlap
   109	 */
   110		MOVQ	SI, CX
   111		ADDQ	BX, CX
   112		CMPQ	CX, DI
   113		JLS	forward
   114	/*
   115	 * copy the whole thing backwards,
   116	 * using addresses adjusted to the end of each region
   117	 */
   118		ADDQ	BX, DI
   119		ADDQ	BX, SI
   120		STD
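	// STD sets the direction flag, so the REP MOVSQ below walks SI and DI
	// downward. Both pointers are first moved back by 8 so that they address
	// the last full quadword; CLD restores the normal direction afterwards,
	// and the leftover low-order bytes (BX&7 of them) are finished by jumping
	// back to "tail", whose small moves are overlap-safe.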
   121	
   122	/*
   123	 * copy
   124	 */
   125		MOVQ	BX, CX
   126		SHRQ	$3, CX
   127		ANDQ	$7, BX
   128	
   129		SUBQ	$8, DI
   130		SUBQ	$8, SI
   131		REP;	MOVSQ
   132	
   133		CLD
   134		ADDQ	$8, DI
   135		ADDQ	$8, SI
   136		SUBQ	BX, DI
   137		SUBQ	BX, SI
   138		JMP	tail
   139	
   140	move_1or2:
   141		MOVB	(SI), AX
   142		MOVB	-1(SI)(BX*1), CX
   143		MOVB	AX, (DI)
   144		MOVB	CX, -1(DI)(BX*1)
   145		RET
   146	move_0:
   147		RET
   148	move_3or4:
   149		CMPQ	BX, $4
   150		JB	move_3
   151		MOVL	(SI), AX
   152		MOVL	AX, (DI)
   153		RET
   154	move_3:
   155		MOVW	(SI), AX
   156		MOVB	2(SI), CX
   157		MOVW	AX, (DI)
   158		MOVB	CX, 2(DI)
   159		RET
   160	move_5through7:
   161		MOVL	(SI), AX
   162		MOVL	-4(SI)(BX*1), CX
   163		MOVL	AX, (DI)
   164		MOVL	CX, -4(DI)(BX*1)
   165		RET
   166	move_8:
   167		// We need a separate case for 8 to make sure we write pointers atomically.
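	// (If those 8 bytes hold a pointer, a concurrent reader such as the
	// garbage collector must see either the old or the new value, never a
	// mix of the two halves.)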
   168		MOVQ	(SI), AX
   169		MOVQ	AX, (DI)
   170		RET
   171	move_9through16:
   172		MOVQ	(SI), AX
   173		MOVQ	-8(SI)(BX*1), CX
   174		MOVQ	AX, (DI)
   175		MOVQ	CX, -8(DI)(BX*1)
   176		RET
   177	move_17through32:
   178		MOVOU	(SI), X0
   179		MOVOU	-16(SI)(BX*1), X1
   180		MOVOU	X0, (DI)
   181		MOVOU	X1, -16(DI)(BX*1)
   182		RET
   183	move_33through64:
   184		MOVOU	(SI), X0
   185		MOVOU	16(SI), X1
   186		MOVOU	-32(SI)(BX*1), X2
   187		MOVOU	-16(SI)(BX*1), X3
   188		MOVOU	X0, (DI)
   189		MOVOU	X1, 16(DI)
   190		MOVOU	X2, -32(DI)(BX*1)
   191		MOVOU	X3, -16(DI)(BX*1)
   192		RET
   193	move_65through128:
   194		MOVOU	(SI), X0
   195		MOVOU	16(SI), X1
   196		MOVOU	32(SI), X2
   197		MOVOU	48(SI), X3
   198		MOVOU	-64(SI)(BX*1), X4
   199		MOVOU	-48(SI)(BX*1), X5
   200		MOVOU	-32(SI)(BX*1), X6
   201		MOVOU	-16(SI)(BX*1), X7
   202		MOVOU	X0, (DI)
   203		MOVOU	X1, 16(DI)
   204		MOVOU	X2, 32(DI)
   205		MOVOU	X3, 48(DI)
   206		MOVOU	X4, -64(DI)(BX*1)
   207		MOVOU	X5, -48(DI)(BX*1)
   208		MOVOU	X6, -32(DI)(BX*1)
   209		MOVOU	X7, -16(DI)(BX*1)
   210		RET
   211	move_129through256:
   212		MOVOU	(SI), X0
   213		MOVOU	16(SI), X1
   214		MOVOU	32(SI), X2
   215		MOVOU	48(SI), X3
   216		MOVOU	64(SI), X4
   217		MOVOU	80(SI), X5
   218		MOVOU	96(SI), X6
   219		MOVOU	112(SI), X7
   220		MOVOU	-128(SI)(BX*1), X8
   221		MOVOU	-112(SI)(BX*1), X9
   222		MOVOU	-96(SI)(BX*1), X10
   223		MOVOU	-80(SI)(BX*1), X11
   224		MOVOU	-64(SI)(BX*1), X12
   225		MOVOU	-48(SI)(BX*1), X13
   226		MOVOU	-32(SI)(BX*1), X14
   227		MOVOU	-16(SI)(BX*1), X15
   228		MOVOU	X0, (DI)
   229		MOVOU	X1, 16(DI)
   230		MOVOU	X2, 32(DI)
   231		MOVOU	X3, 48(DI)
   232		MOVOU	X4, 64(DI)
   233		MOVOU	X5, 80(DI)
   234		MOVOU	X6, 96(DI)
   235		MOVOU	X7, 112(DI)
   236		MOVOU	X8, -128(DI)(BX*1)
   237		MOVOU	X9, -112(DI)(BX*1)
   238		MOVOU	X10, -96(DI)(BX*1)
   239		MOVOU	X11, -80(DI)(BX*1)
   240		MOVOU	X12, -64(DI)(BX*1)
   241		MOVOU	X13, -48(DI)(BX*1)
   242		MOVOU	X14, -32(DI)(BX*1)
   243		MOVOU	X15, -16(DI)(BX*1)
   244		RET
   245	move_256through2048:
   246		SUBQ	$256, BX
   247		MOVOU	(SI), X0
   248		MOVOU	16(SI), X1
   249		MOVOU	32(SI), X2
   250		MOVOU	48(SI), X3
   251		MOVOU	64(SI), X4
   252		MOVOU	80(SI), X5
   253		MOVOU	96(SI), X6
   254		MOVOU	112(SI), X7
   255		MOVOU	128(SI), X8
   256		MOVOU	144(SI), X9
   257		MOVOU	160(SI), X10
   258		MOVOU	176(SI), X11
   259		MOVOU	192(SI), X12
   260		MOVOU	208(SI), X13
   261		MOVOU	224(SI), X14
   262		MOVOU	240(SI), X15
   263		MOVOU	X0, (DI)
   264		MOVOU	X1, 16(DI)
   265		MOVOU	X2, 32(DI)
   266		MOVOU	X3, 48(DI)
   267		MOVOU	X4, 64(DI)
   268		MOVOU	X5, 80(DI)
   269		MOVOU	X6, 96(DI)
   270		MOVOU	X7, 112(DI)
   271		MOVOU	X8, 128(DI)
   272		MOVOU	X9, 144(DI)
   273		MOVOU	X10, 160(DI)
   274		MOVOU	X11, 176(DI)
   275		MOVOU	X12, 192(DI)
   276		MOVOU	X13, 208(DI)
   277		MOVOU	X14, 224(DI)
   278		MOVOU	X15, 240(DI)
   279		CMPQ	BX, $256
   280		LEAQ	256(SI), SI
   281		LEAQ	256(DI), DI
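	// LEAQ does not modify the flags, so the JGE below still tests the
	// CMPQ above: loop while at least 256 bytes remain.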
   282		JGE	move_256through2048
   283		JMP	tail
   284	
   285	avxUnaligned:
   286		// There are two implementations of the move algorithm.
   287		// The first one is for non-overlapped memory regions; it uses forward copying.
   288		// The second one is for overlapped regions; it uses backward copying.
   289		MOVQ	DI, CX
   290		SUBQ	SI, CX
   291		// Now CX contains the distance between SRC and DEST.
   292		CMPQ	CX, BX
   293		// If the distance is less than the region length, the regions overlap.
   294		JC	copy_backward
   295	
   296		// Non-temporal copy would be better for big sizes.
   297		CMPQ	BX, $0x100000
   298		JAE	gobble_big_data_fwd
   299	
   300		// Memory layout on the source side
   301		// SI                                       CX
   302		// |<---------BX before correction--------->|
   303		// |       |<--BX corrected-->|             |
   304		// |       |                  |<--- AX  --->|
   305		// |<-R11->|                  |<-128 bytes->|
   306		// +----------------------------------------+
   307		// | Head  | Body             | Tail        |
   308		// +-------+------------------+-------------+
   309		// ^       ^                  ^
   310		// |       |                  |
   311		// Save head into Y4          Save tail into X5..X12
   312		//         |
   313		//         SI+R11, where R11 = ((DI & -32) + 32) - DI
   314		// Algorithm:
   315		// 1. Unaligned save of the tail's 128 bytes
   316		// 2. Unaligned save of the head's 32  bytes
   317		// 3. Destination-aligned copying of body (128 bytes per iteration)
   318		// 4. Put the head in its new place
   319		// 5. Put the tail in its new place
   320		// For small sizes it is important to keep the processor's pipeline busy,
   321		// because the cost of copying the unaligned head and tail is comparable
   322		// with the cost of the main loop, so the code below interleaves the two
   323		// and looks a bit messy. A cleaner implementation of the same algorithm,
   324		// for bigger sizes where the cost of the unaligned parts is negligible,
   325		// follows the gobble_big_data_fwd label.
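	// A Go-style sketch of steps 1-5 (illustrative pseudocode; the load/store
	// helpers and copy128PerIter are not real functions):
	//
	//	tail := load128(src + n - 128)              // 1: save the last 128 bytes
	//	head := load32(src)                         // 2: save the first 32 bytes
	//	r11 := ((dst &^ 31) + 32) - dst             // 1..32 bytes of unaligned head
	//	copy128PerIter(dst+r11, src+r11, n-r11-128) // 3: 32-byte-aligned destination
	//	store32(dst, head)                          // 4
	//	store128(dst+n-128, tail)                   // 5
	//
	// The body loop may overrun into the tail area by up to 127 bytes; that
	// is fine because the saved tail is stored last and overwrites it.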
   326		LEAQ	(SI)(BX*1), CX
   327		MOVQ	DI, R10
   328		// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets there.
   329		MOVOU	-0x80(CX), X5
   330		MOVOU	-0x70(CX), X6
   331		MOVQ	$0x80, AX
   332		// Align destination address
   333		ANDQ	$-32, DI
   334		ADDQ	$32, DI
   335		// Continue tail saving.
   336		MOVOU	-0x60(CX), X7
   337		MOVOU	-0x50(CX), X8
   338		// Make R11 the delta between the aligned and unaligned destination addresses.
   339		MOVQ	DI, R11
   340		SUBQ	R10, R11
   341		// Continue tail saving.
   342		MOVOU	-0x40(CX), X9
   343		MOVOU	-0x30(CX), X10
   344		// Adjust the bytes-to-copy value to exclude the unaligned head, which is copied separately.
   345		SUBQ	R11, BX
   346		// Continue tail saving.
   347		MOVOU	-0x20(CX), X11
   348		MOVOU	-0x10(CX), X12
   349		// The tail will be put in its place after the main body is copied.
   350		// Now it's time for the unaligned head.
   351		VMOVDQU	(SI), Y4
   352		// Adjust source address to point past head.
   353		ADDQ	R11, SI
   354		SUBQ	AX, BX
   355		// Destination-aligned copying of the body, 128 bytes per iteration.
   356	gobble_128_loop:
   357		VMOVDQU	(SI), Y0
   358		VMOVDQU	0x20(SI), Y1
   359		VMOVDQU	0x40(SI), Y2
   360		VMOVDQU	0x60(SI), Y3
   361		ADDQ	AX, SI
   362		VMOVDQA	Y0, (DI)
   363		VMOVDQA	Y1, 0x20(DI)
   364		VMOVDQA	Y2, 0x40(DI)
   365		VMOVDQA	Y3, 0x60(DI)
   366		ADDQ	AX, DI
   367		SUBQ	AX, BX
   368		JA	gobble_128_loop
   369		// Now we can store unaligned parts.
   370		ADDQ	AX, BX
   371		ADDQ	DI, BX
   372		VMOVDQU	Y4, (R10)
   373		VZEROUPPER
   374		MOVOU	X5, -0x80(BX)
   375		MOVOU	X6, -0x70(BX)
   376		MOVOU	X7, -0x60(BX)
   377		MOVOU	X8, -0x50(BX)
   378		MOVOU	X9, -0x40(BX)
   379		MOVOU	X10, -0x30(BX)
   380		MOVOU	X11, -0x20(BX)
   381		MOVOU	X12, -0x10(BX)
   382		RET
   383	
   384	gobble_big_data_fwd:
   385		// This is the forward copy for big regions.
   386		// It uses non-temporal move instructions.
   387		// The algorithm is the same as the small-size version commented above.
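	// Copies of 1MB (0x100000 bytes) and up use VMOVNTDQ, a non-temporal
	// store that bypasses the caches, so a huge copy does not evict the
	// caller's working set.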
   388		LEAQ	(SI)(BX*1), CX
   389		MOVOU	-0x80(SI)(BX*1), X5
   390		MOVOU	-0x70(CX), X6
   391		MOVOU	-0x60(CX), X7
   392		MOVOU	-0x50(CX), X8
   393		MOVOU	-0x40(CX), X9
   394		MOVOU	-0x30(CX), X10
   395		MOVOU	-0x20(CX), X11
   396		MOVOU	-0x10(CX), X12
   397		VMOVDQU	(SI), Y4
   398		MOVQ	DI, R8
   399		ANDQ	$-32, DI
   400		ADDQ	$32, DI
   401		MOVQ	DI, R10
   402		SUBQ	R8, R10
   403		SUBQ	R10, BX
   404		ADDQ	R10, SI
   405		LEAQ	(DI)(BX*1), CX
   406		SUBQ	$0x80, BX
   407	gobble_mem_fwd_loop:
   408		PREFETCHNTA 0x1C0(SI)
   409		PREFETCHNTA 0x280(SI)
   410		// The prefetch distances were chosen empirically.
   411		// The prefetching approach follows section 7.6.6 of [1].
   412		// [1] 64-ia-32-architectures-optimization-manual.pdf
   413		// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
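	// 0x1C0 and 0x280 are 448 and 640 bytes ahead of SI, i.e. a few
	// 128-byte iterations in front of the loads below.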
   414		VMOVDQU	(SI), Y0
   415		VMOVDQU	0x20(SI), Y1
   416		VMOVDQU	0x40(SI), Y2
   417		VMOVDQU	0x60(SI), Y3
   418		ADDQ	$0x80, SI
   419		VMOVNTDQ Y0, (DI)
   420		VMOVNTDQ Y1, 0x20(DI)
   421		VMOVNTDQ Y2, 0x40(DI)
   422		VMOVNTDQ Y3, 0x60(DI)
   423		ADDQ	$0x80, DI
   424		SUBQ	$0x80, BX
   425		JA		gobble_mem_fwd_loop
   426		// NT (non-temporal) stores don't follow the normal cache-coherency rules.
   427		// We need an SFENCE here to make the copied data visible in a timely manner.
   428		SFENCE
   429		VMOVDQU	Y4, (R8)
   430		VZEROUPPER
   431		MOVOU	X5, -0x80(CX)
   432		MOVOU	X6, -0x70(CX)
   433		MOVOU	X7, -0x60(CX)
   434		MOVOU	X8, -0x50(CX)
   435		MOVOU	X9, -0x40(CX)
   436		MOVOU	X10, -0x30(CX)
   437		MOVOU	X11, -0x20(CX)
   438		MOVOU	X12, -0x10(CX)
   439		RET
   440	
   441	copy_backward:
   442		MOVQ	DI, AX
   443		// Backward copying is about the same as forward copying.
   444		// First we load the unaligned 128 bytes at the start of the region (stored last).
   445		MOVOU	(SI), X5
   446		MOVOU	0x10(SI), X6
   447		ADDQ	BX, DI
   448		MOVOU	0x20(SI), X7
   449		MOVOU	0x30(SI), X8
   450		LEAQ	-0x20(DI), R10
   451		MOVQ	DI, R11
   452		MOVOU	0x40(SI), X9
   453		MOVOU	0x50(SI), X10
   454		ANDQ	$0x1F, R11
   455		MOVOU	0x60(SI), X11
   456		MOVOU	0x70(SI), X12
   457		XORQ	R11, DI
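	// R11 holds the low 5 bits of DI, so XORing them out rounds DI down to a
	// 32-byte boundary. R10 still points 32 bytes below the original end of
	// the destination, for the later unaligned store of Y4.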
   458		// Let's point SI to the end of region
   459		ADDQ	BX, SI
   460		// and load the unaligned head into Y4.
   461		VMOVDQU	-0x20(SI), Y4
   462		SUBQ	R11, SI
   463		SUBQ	R11, BX
   464		// If there is enough data for non-temporal moves, go to the special loop.
   465		CMPQ	BX, $0x100000
   466		JA		gobble_big_data_bwd
   467		SUBQ	$0x80, BX
   468	gobble_mem_bwd_loop:
   469		VMOVDQU	-0x20(SI), Y0
   470		VMOVDQU	-0x40(SI), Y1
   471		VMOVDQU	-0x60(SI), Y2
   472		VMOVDQU	-0x80(SI), Y3
   473		SUBQ	$0x80, SI
   474		VMOVDQA	Y0, -0x20(DI)
   475		VMOVDQA	Y1, -0x40(DI)
   476		VMOVDQA	Y2, -0x60(DI)
   477		VMOVDQA	Y3, -0x80(DI)
   478		SUBQ	$0x80, DI
   479		SUBQ	$0x80, BX
   480		JA		gobble_mem_bwd_loop
   481		// Let's store unaligned data
   482		VMOVDQU	Y4, (R10)
   483		VZEROUPPER
   484		MOVOU	X5, (AX)
   485		MOVOU	X6, 0x10(AX)
   486		MOVOU	X7, 0x20(AX)
   487		MOVOU	X8, 0x30(AX)
   488		MOVOU	X9, 0x40(AX)
   489		MOVOU	X10, 0x50(AX)
   490		MOVOU	X11, 0x60(AX)
   491		MOVOU	X12, 0x70(AX)
   492		RET
   493	
   494	gobble_big_data_bwd:
   495		SUBQ	$0x80, BX
   496	gobble_big_mem_bwd_loop:
   497		PREFETCHNTA -0x1C0(SI)
   498		PREFETCHNTA -0x280(SI)
   499		VMOVDQU	-0x20(SI), Y0
   500		VMOVDQU	-0x40(SI), Y1
   501		VMOVDQU	-0x60(SI), Y2
   502		VMOVDQU	-0x80(SI), Y3
   503		SUBQ	$0x80, SI
   504		VMOVNTDQ	Y0, -0x20(DI)
   505		VMOVNTDQ	Y1, -0x40(DI)
   506		VMOVNTDQ	Y2, -0x60(DI)
   507		VMOVNTDQ	Y3, -0x80(DI)
   508		SUBQ	$0x80, DI
   509		SUBQ	$0x80, BX
   510		JA	gobble_big_mem_bwd_loop
   511		SFENCE
   512		VMOVDQU	Y4, (R10)
   513		VZEROUPPER
   514		MOVOU	X5, (AX)
   515		MOVOU	X6, 0x10(AX)
   516		MOVOU	X7, 0x20(AX)
   517		MOVOU	X8, 0x30(AX)
   518		MOVOU	X9, 0x40(AX)
   519		MOVOU	X10, 0x50(AX)
   520		MOVOU	X11, 0x60(AX)
   521		MOVOU	X12, 0x70(AX)
   522		RET
