...
Run Format

Text file src/runtime/memmove_386.s

Documentation: runtime

     1	// Inferno's libkern/memmove-386.s
     2	// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
     3	//
     4	//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     5	//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6	//         Portions Copyright 2009 The Go Authors. All rights reserved.
     7	//
     8	// Permission is hereby granted, free of charge, to any person obtaining a copy
     9	// of this software and associated documentation files (the "Software"), to deal
    10	// in the Software without restriction, including without limitation the rights
    11	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12	// copies of the Software, and to permit persons to whom the Software is
    13	// furnished to do so, subject to the following conditions:
    14	//
    15	// The above copyright notice and this permission notice shall be included in
    16	// all copies or substantial portions of the Software.
    17	//
    18	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24	// THE SOFTWARE.
    25	
    26	// +build !plan9
    27	
    28	#include "textflag.h"
    29	
    30	TEXT runtime·memmove(SB), NOSPLIT, $0-12
      		// memmove copies n bytes from `from` to `to`; the two regions may
      		// overlap. Arguments are read from to+0(FP), from+4(FP), n+8(FP).
      		//
      		// Register roles for the whole routine:
      		//   DI = destination cursor, SI = source cursor,
      		//   BX = bytes still to copy, CX = REP count / scratch.
      		//
      		// Strategy: sizes up to 16 (up to 128 when runtime·support_sse2 is 1)
      		// are copied by loading every byte into registers before storing any,
      		// so overlap between the regions cannot corrupt the data. Everything
      		// else goes through REP MOVSL/MOVSB, running backwards when the
      		// destination starts inside the source region.
    31		MOVL	to+0(FP), DI	// DI = destination pointer
    32		MOVL	from+4(FP), SI	// SI = source pointer
    33		MOVL	n+8(FP), BX	// BX = byte count
    34	
    35		// REP instructions have a high startup cost, so we handle small sizes
    36		// with some straightline code. The REP MOVSL instruction is really fast
    37		// for large sizes. The cutover is approximately 1K.  We implement up to
    38		// 128 because that is the maximum SSE register load (loading all data
    39		// into registers lets us ignore copy direction).
      		//
      		// The 128-byte case below fills X0-X7 with 16 bytes each.
      		// NOTE(review): no 1K threshold appears in this routine; every size
      		// not handled by a straight-line case falls through to the REP paths.
      		//
      		// tail is both the initial size dispatch and the re-entry point after
      		// a REP MOVSL bulk copy, which leaves BX = n & 3 bytes still to move.
    40	tail:
    41		TESTL	BX, BX
    42		JEQ	move_0	// nothing (left) to copy
    43		CMPL	BX, $2
    44		JBE	move_1or2	// BX is 1 or 2 (0 was handled above)
    45		CMPL	BX, $4
    46		JB	move_3	// BX == 3
    47		JE	move_4	// BX == 4
    48		CMPL	BX, $8
    49		JBE	move_5through8
    50		CMPL	BX, $16
    51		JBE	move_9through16
      		// The remaining straight-line cases use MOVOU on X registers, so
      		// they are only taken when runtime·support_sse2 is 1.
    52		CMPB	runtime·support_sse2(SB), $1
    53		JNE	nosse2
    54		CMPL	BX, $32
    55		JBE	move_17through32
    56		CMPL	BX, $64
    57		JBE	move_33through64
    58		CMPL	BX, $128
    59		JBE	move_65through128
    60		// TODO: use branch table and BSR to make this just a single dispatch
    61	
      		// Reached with BX > 128, or with BX > 16 when SSE2 is unavailable.
    62	nosse2:
    63	/*
    64	 * check and set for backwards
    65	 */
    66		CMPL	SI, DI
    67		JLS	back	// unsigned from <= to: may need a backward copy
    68	
    69	/*
    70	 * forward copy loop
    71	 */
    72	forward:
    73		// If REP MOVSB isn't fast, don't use it
    74		CMPB	runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
    75		JNE	fwdBy4
    76	
    77		// Check alignment
      		// OR the two pointers together: the low two bits of AX are zero
      		// only if both SI and DI are 4-byte aligned, in which case the
      		// dword copy is used instead of the byte copy.
    78		MOVL	SI, AX
    79		ORL	DI, AX
    80		TESTL	$3, AX
    81		JEQ	fwdBy4
    82	
    83		// Do 1 byte at a time
    84		MOVL	BX, CX	// CX = byte count for REP
    85		REP;	MOVSB
    86		RET	// all BX bytes were copied; no tail needed
    87	
    88	fwdBy4:
    89		// Do 4 bytes at a time
    90		MOVL	BX, CX
    91		SHRL	$2, CX	// CX = number of whole dwords
    92		ANDL	$3, BX	// BX = leftover bytes (0-3)
    93		REP;	MOVSL	// advances SI and DI past the copied dwords
    94		JMP	tail	// copy the leftover bytes via the small cases
    95	
    96	/*
    97	 * check overlap
    98	 */
    99	back:
   100		MOVL	SI, CX
   101		ADDL	BX, CX	// CX = from + n (one past the end of the source)
   102		CMPL	CX, DI
   103		JLS	forward	// from+n <= to: regions are disjoint, copy forward
   104	/*
   105	 * whole thing backwards has
   106	 * adjusted addresses
   107	 */
   108	
   109		ADDL	BX, DI	// DI = to + n
   110		ADDL	BX, SI	// SI = from + n
   111		STD	// direction flag set: string ops now decrement SI/DI
   112	
   113	/*
   114	 * copy
   115	 */
   116		MOVL	BX, CX
   117		SHRL	$2, CX	// CX = number of whole dwords
   118		ANDL	$3, BX	// BX = leftover bytes (0-3) at the low end
   119	
   120		SUBL	$4, DI	// point at the last dword of each buffer
   121		SUBL	$4, SI
   122		REP;	MOVSL
   123	
   124		CLD	// restore the forward direction before leaving
   125		ADDL	$4, DI	// undo the -4 bias: DI/SI = lowest address copied so far
   126		ADDL	$4, SI
   127		SUBL	BX, DI	// step back over the BX uncopied leading bytes
   128		SUBL	BX, SI
   129		JMP	tail	// copy those leading bytes via the small cases
   130	
      		// Each small case below performs all of its loads before any store.
      		// Sizes in a range are covered by copying a fixed-size head from the
      		// start and a fixed-size piece ending at offset BX; the two pieces
      		// may overlap each other when BX is below the top of the range.
   131	move_1or2:
   132		MOVB	(SI), AX	// first byte
   133		MOVB	-1(SI)(BX*1), CX	// last byte (same byte when BX == 1)
   134		MOVB	AX, (DI)
   135		MOVB	CX, -1(DI)(BX*1)
   136		RET
   137	move_0:
   138		RET
   139	move_3:
   140		MOVW	(SI), AX	// bytes 0-1
   141		MOVB	2(SI), CX	// byte 2
   142		MOVW	AX, (DI)
   143		MOVB	CX, 2(DI)
   144		RET
   145	move_4:
   146		// We need a separate case for 4 to make sure we write pointers atomically.
   147		MOVL	(SI), AX
   148		MOVL	AX, (DI)
   149		RET
   150	move_5through8:
   151		MOVL	(SI), AX	// first 4 bytes
   152		MOVL	-4(SI)(BX*1), CX	// last 4 bytes
   153		MOVL	AX, (DI)
   154		MOVL	CX, -4(DI)(BX*1)
   155		RET
   156	move_9through16:
   157		MOVL	(SI), AX	// first 8 bytes in AX, CX
   158		MOVL	4(SI), CX
   159		MOVL	-8(SI)(BX*1), DX	// last 8 bytes in DX, BP
   160		MOVL	-4(SI)(BX*1), BP
   161		MOVL	AX, (DI)
   162		MOVL	CX, 4(DI)
   163		MOVL	DX, -8(DI)(BX*1)
   164		MOVL	BP, -4(DI)(BX*1)
   165		RET
   166	move_17through32:
   167		MOVOU	(SI), X0	// first 16 bytes
   168		MOVOU	-16(SI)(BX*1), X1	// last 16 bytes
   169		MOVOU	X0, (DI)
   170		MOVOU	X1, -16(DI)(BX*1)
   171		RET
   172	move_33through64:
   173		MOVOU	(SI), X0	// first 32 bytes in X0, X1
   174		MOVOU	16(SI), X1
   175		MOVOU	-32(SI)(BX*1), X2	// last 32 bytes in X2, X3
   176		MOVOU	-16(SI)(BX*1), X3
   177		MOVOU	X0, (DI)
   178		MOVOU	X1, 16(DI)
   179		MOVOU	X2, -32(DI)(BX*1)
   180		MOVOU	X3, -16(DI)(BX*1)
   181		RET
   182	move_65through128:
   183		MOVOU	(SI), X0	// first 64 bytes in X0-X3
   184		MOVOU	16(SI), X1
   185		MOVOU	32(SI), X2
   186		MOVOU	48(SI), X3
   187		MOVOU	-64(SI)(BX*1), X4	// last 64 bytes in X4-X7
   188		MOVOU	-48(SI)(BX*1), X5
   189		MOVOU	-32(SI)(BX*1), X6
   190		MOVOU	-16(SI)(BX*1), X7
   191		MOVOU	X0, (DI)
   192		MOVOU	X1, 16(DI)
   193		MOVOU	X2, 32(DI)
   194		MOVOU	X3, 48(DI)
   195		MOVOU	X4, -64(DI)(BX*1)
   196		MOVOU	X5, -48(DI)(BX*1)
   197		MOVOU	X6, -32(DI)(BX*1)
   198		MOVOU	X7, -16(DI)(BX*1)
   199		RET

View as plain text