Source listing: src/runtime/asm_amd64.s (Go runtime entry and support routines, amd64, Plan 9 assembler syntax)

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "go_tls.h"
     7	#include "funcdata.h"
     8	#include "textflag.h"
     9	
    10	TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11		// copy arguments forward on an even stack
    12		MOVQ	DI, AX		// argc
    13		MOVQ	SI, BX		// argv
    14		SUBQ	$(4*8+7), SP		// 2args 2auto
    15		ANDQ	$~15, SP
    16		MOVQ	AX, 16(SP)
    17		MOVQ	BX, 24(SP)
    18		
    19		// create istack out of the given (operating system) stack.
    20		// _cgo_init may update stackguard.
    21		MOVQ	$runtime·g0(SB), DI
    22		LEAQ	(-64*1024+104)(SP), BX
    23		MOVQ	BX, g_stackguard0(DI)
    24		MOVQ	BX, g_stackguard1(DI)
    25		MOVQ	BX, (g_stack+stack_lo)(DI)
    26		MOVQ	SP, (g_stack+stack_hi)(DI)
    27	
    28		// find out information about the processor we're on
    29		MOVQ	$0, AX
    30		CPUID
    31		MOVQ	AX, SI
    32		CMPQ	AX, $0
    33		JE	nocpuinfo
    34	
    35		// Figure out how to serialize RDTSC.
    36		// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37		// Don't know about the rest, so let's do MFENCE.
    38		CMPL	BX, $0x756E6547  // "Genu"
    39		JNE	notintel
    40		CMPL	DX, $0x49656E69  // "ineI"
    41		JNE	notintel
    42		CMPL	CX, $0x6C65746E  // "ntel"
    43		JNE	notintel
    44		MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45	notintel:
    46	
    47		// Load EAX=1 cpuid flags
    48		MOVQ	$1, AX
    49		CPUID
    50		MOVL	CX, runtime·cpuid_ecx(SB)
    51		MOVL	DX, runtime·cpuid_edx(SB)
    52	
    53		// Load EAX=7/ECX=0 cpuid flags
    54		CMPQ	SI, $7
    55		JLT	no7
    56		MOVL	$7, AX
    57		MOVL	$0, CX
    58		CPUID
    59		MOVL	BX, runtime·cpuid_ebx7(SB)
    60	no7:
    61		// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
    62		// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63		// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64		MOVL	runtime·cpuid_ecx(SB), CX
    65		ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66		CMPL    CX, $0x18000000
    67		JNE     noavx
    68		MOVL    $0, CX
    69		// For XGETBV, OSXSAVE bit is required and sufficient
    70		XGETBV
    71		ANDL    $6, AX
    72		CMPL    AX, $6 // Check for OS support of YMM registers
    73		JNE     noavx
    74		MOVB    $1, runtime·support_avx(SB)
    75		TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76		JEQ     noavx2
    77		MOVB    $1, runtime·support_avx2(SB)
    78		JMP     nocpuinfo
    79	noavx:
    80		MOVB    $0, runtime·support_avx(SB)
    81	noavx2:
    82		MOVB    $0, runtime·support_avx2(SB)
    83	nocpuinfo:	
    84		
    85		// if there is an _cgo_init, call it.
    86		MOVQ	_cgo_init(SB), AX
    87		TESTQ	AX, AX
    88		JZ	needtls
    89		// g0 already in DI
    90		MOVQ	DI, CX	// Win64 uses CX for first parameter
    91		MOVQ	$setg_gcc<>(SB), SI
    92		CALL	AX
    93	
    94		// update stackguard after _cgo_init
    95		MOVQ	$runtime·g0(SB), CX
    96		MOVQ	(g_stack+stack_lo)(CX), AX
    97		ADDQ	$const__StackGuard, AX
    98		MOVQ	AX, g_stackguard0(CX)
    99		MOVQ	AX, g_stackguard1(CX)
   100	
   101	#ifndef GOOS_windows
   102		JMP ok
   103	#endif
   104	needtls:
   105	#ifdef GOOS_plan9
   106		// skip TLS setup on Plan 9
   107		JMP ok
   108	#endif
   109	#ifdef GOOS_solaris
   110		// skip TLS setup on Solaris
   111		JMP ok
   112	#endif
   113	
   114		LEAQ	runtime·m0+m_tls(SB), DI
   115		CALL	runtime·settls(SB)
   116	
   117		// store through it, to make sure it works
   118		get_tls(BX)
   119		MOVQ	$0x123, g(BX)
   120		MOVQ	runtime·m0+m_tls(SB), AX
   121		CMPQ	AX, $0x123
   122		JEQ 2(PC)
   123		MOVL	AX, 0	// abort
   124	ok:
   125		// set the per-goroutine and per-mach "registers"
   126		get_tls(BX)
   127		LEAQ	runtime·g0(SB), CX
   128		MOVQ	CX, g(BX)
   129		LEAQ	runtime·m0(SB), AX
   130	
   131		// save m->g0 = g0
   132		MOVQ	CX, m_g0(AX)
   133		// save m0 to g0->m
   134		MOVQ	AX, g_m(CX)
   135	
   136		CLD				// convention is D is always left cleared
   137		CALL	runtime·check(SB)
   138	
   139		MOVL	16(SP), AX		// copy argc
   140		MOVL	AX, 0(SP)
   141		MOVQ	24(SP), AX		// copy argv
   142		MOVQ	AX, 8(SP)
   143		CALL	runtime·args(SB)
   144		CALL	runtime·osinit(SB)
   145		CALL	runtime·schedinit(SB)
   146	
   147		// create a new goroutine to start program
   148		MOVQ	$runtime·mainPC(SB), AX		// entry
   149		PUSHQ	AX
   150		PUSHQ	$0			// arg size
   151		CALL	runtime·newproc(SB)
   152		POPQ	AX
   153		POPQ	AX
   154	
   155		// start this M
   156		CALL	runtime·mstart(SB)
   157	
   158		MOVL	$0xf1, 0xf1  // crash
   159		RET
   160	
// mainPC is a funcval-like 8-byte cell holding the address of runtime·main;
// rt0_go passes it to newproc as the entry of the first goroutine.
   161	DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   162	GLOBL	runtime·mainPC(SB),RODATA,$8
   163	
// breakpoint stops execution in the debugger.
   164	TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   165		BYTE	$0xcc		// INT 3, emitted as a raw byte
   166		RET
   167	
// asminit is the architecture-specific per-thread setup hook; amd64 needs none.
   168	TEXT runtime·asminit(SB),NOSPLIT,$0-0
   169		// No per-thread init.
   170		RET
   171	
   172	/*
   173	 *  go-routine
   174	 */
   175	
   176	// void gosave(Gobuf*)
   177	// save state in Gobuf; setjmp
	// Records the caller's SP/PC, the current g, and BP into *buf, and
	// clears ret/ctxt. The saved state resumes at gosave's own return
	// address when later restored with gogo.
   178	TEXT runtime·gosave(SB), NOSPLIT, $0-8
   179		MOVQ	buf+0(FP), AX		// gobuf
   180		LEAQ	buf+0(FP), BX		// caller's SP
   181		MOVQ	BX, gobuf_sp(AX)
   182		MOVQ	0(SP), BX		// caller's PC
   183		MOVQ	BX, gobuf_pc(AX)
   184		MOVQ	$0, gobuf_ret(AX)
   185		MOVQ	$0, gobuf_ctxt(AX)
   186		MOVQ	BP, gobuf_bp(AX)
   187		get_tls(CX)
   188		MOVQ	g(CX), BX		// BX = current g
   189		MOVQ	BX, gobuf_g(AX)
   190		RET
   191	
   192	// void gogo(Gobuf*)
   193	// restore state from Gobuf; longjmp
	// Installs buf's g in TLS, restores SP/ret/ctxt/BP, zeroes the Gobuf
	// fields so the GC does not keep the old stack alive, then jumps to
	// the saved PC. Does not return to its caller.
   194	TEXT runtime·gogo(SB), NOSPLIT, $0-8
   195		MOVQ	buf+0(FP), BX		// gobuf
   196		MOVQ	gobuf_g(BX), DX
   197		MOVQ	0(DX), CX		// make sure g != nil
   198		get_tls(CX)
   199		MOVQ	DX, g(CX)
   200		MOVQ	gobuf_sp(BX), SP	// restore SP
   201		MOVQ	gobuf_ret(BX), AX
   202		MOVQ	gobuf_ctxt(BX), DX
   203		MOVQ	gobuf_bp(BX), BP
   204		MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   205		MOVQ	$0, gobuf_ret(BX)
   206		MOVQ	$0, gobuf_ctxt(BX)
   207		MOVQ	$0, gobuf_bp(BX)
   208		MOVQ	gobuf_pc(BX), BX
   209		JMP	BX			// resume at the saved PC
   210	
   211	// func mcall(fn func(*g))
   212	// Switch to m->g0's stack, call fn(g).
   213	// Fn must never return. It should gogo(&g->sched)
   214	// to keep running g.
	// Calling mcall from g0 itself is a fatal error (badmcall).
   215	TEXT runtime·mcall(SB), NOSPLIT, $0-8
   216		MOVQ	fn+0(FP), DI
   217		
   218		get_tls(CX)
   219		MOVQ	g(CX), AX	// save state in g->sched
   220		MOVQ	0(SP), BX	// caller's PC
   221		MOVQ	BX, (g_sched+gobuf_pc)(AX)
   222		LEAQ	fn+0(FP), BX	// caller's SP
   223		MOVQ	BX, (g_sched+gobuf_sp)(AX)
   224		MOVQ	AX, (g_sched+gobuf_g)(AX)
   225		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   226	
   227		// switch to m->g0 & its stack, call fn
   228		MOVQ	g(CX), BX
   229		MOVQ	g_m(BX), BX
   230		MOVQ	m_g0(BX), SI
   231		CMPQ	SI, AX	// if g == m->g0 call badmcall
   232		JNE	3(PC)	// normal case: not on g0, skip the badmcall tail call
   233		MOVQ	$runtime·badmcall(SB), AX
   234		JMP	AX
   235		MOVQ	SI, g(CX)	// g = m->g0
   236		MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   237		PUSHQ	AX		// fn's argument: the old g
   238		MOVQ	DI, DX
   239		MOVQ	0(DI), DI
   240		CALL	DI
   241		POPQ	AX
	// fn must not return; reaching here is a bug, reported via badmcall2.
   242		MOVQ	$runtime·badmcall2(SB), AX
   243		JMP	AX
   244		RET
   245	
   246	// systemstack_switch is a dummy routine that systemstack leaves at the bottom
   247	// of the G stack. We need to distinguish the routine that
   248	// lives at the bottom of the G stack from the one that lives
   249	// at the top of the system stack because the one at the top of
   250	// the system stack terminates the stack walk (see topofstack()).
   251	TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   252		RET
   253	
   254	// func systemstack(fn func())
	// Runs fn on the system (g0) stack. If already on g0 or gsignal, fn is
	// called directly with no switch. A g that is none of gsignal/g0/curg
	// is a fatal error (badsystemstack).
   255	TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   256		MOVQ	fn+0(FP), DI	// DI = fn
   257		get_tls(CX)
   258		MOVQ	g(CX), AX	// AX = g
   259		MOVQ	g_m(AX), BX	// BX = m
   260	
   261		MOVQ	m_gsignal(BX), DX	// DX = gsignal
   262		CMPQ	AX, DX
   263		JEQ	noswitch
   264	
   265		MOVQ	m_g0(BX), DX	// DX = g0
   266		CMPQ	AX, DX
   267		JEQ	noswitch
   268	
   269		MOVQ	m_curg(BX), R8
   270		CMPQ	AX, R8
   271		JEQ	switch
   272		
   273		// Bad: g is not gsignal, not g0, not curg. What is it?
   274		MOVQ	$runtime·badsystemstack(SB), AX
   275		CALL	AX
   276	
   277	switch:
   278		// save our state in g->sched. Pretend to
   279		// be systemstack_switch if the G stack is scanned.
   280		MOVQ	$runtime·systemstack_switch(SB), SI
   281		MOVQ	SI, (g_sched+gobuf_pc)(AX)
   282		MOVQ	SP, (g_sched+gobuf_sp)(AX)
   283		MOVQ	AX, (g_sched+gobuf_g)(AX)
   284		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   285	
   286		// switch to g0
   287		MOVQ	DX, g(CX)
   288		MOVQ	(g_sched+gobuf_sp)(DX), BX
   289		// make it look like mstart called systemstack on g0, to stop traceback
   290		SUBQ	$8, BX
   291		MOVQ	$runtime·mstart(SB), DX
   292		MOVQ	DX, 0(BX)
   293		MOVQ	BX, SP
   294	
   295		// call target function
   296		MOVQ	DI, DX		// DX = closure context for fn
   297		MOVQ	0(DI), DI	// DI = fn's code pointer
   298		CALL	DI
   299	
   300		// switch back to g
   301		get_tls(CX)
   302		MOVQ	g(CX), AX
   303		MOVQ	g_m(AX), BX
   304		MOVQ	m_curg(BX), AX
   305		MOVQ	AX, g(CX)
   306		MOVQ	(g_sched+gobuf_sp)(AX), SP
   307		MOVQ	$0, (g_sched+gobuf_sp)(AX)
   308		RET
   309	
   310	noswitch:
   311		// already on m stack, just call directly
   312		MOVQ	DI, DX
   313		MOVQ	0(DI), DI
   314		CALL	DI
   315		RET
   316	
   317	/*
   318	 * support for morestack
   319	 */
   320	
   321	// Called during function prolog when more stack is needed.
   322	//
   323	// The traceback routines see morestack on a g0 as being
   324	// the top of a stack (for example, morestack calling newstack
   325	// calling the scheduler calling newm calling gc), so we must
   326	// record an argument size. For that purpose, it has no arguments.
	// On entry, DX holds the caller's closure context (see morestack_noctxt).
   327	TEXT runtime·morestack(SB),NOSPLIT,$0-0
   328		// Cannot grow scheduler stack (m->g0).
   329		get_tls(CX)
   330		MOVQ	g(CX), BX
   331		MOVQ	g_m(BX), BX
   332		MOVQ	m_g0(BX), SI
   333		CMPQ	g(CX), SI
   334		JNE	2(PC)
   335		INT	$3		// abort: stack split on g0
   336	
   337		// Cannot grow signal stack (m->gsignal).
   338		MOVQ	m_gsignal(BX), SI
   339		CMPQ	g(CX), SI
   340		JNE	2(PC)
   341		INT	$3		// abort: stack split on gsignal
   342	
   343		// Called from f.
   344		// Set m->morebuf to f's caller.
   345		MOVQ	8(SP), AX	// f's caller's PC
   346		MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   347		LEAQ	16(SP), AX	// f's caller's SP
   348		MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   349		get_tls(CX)
   350		MOVQ	g(CX), SI
   351		MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   352	
   353		// Set g->sched to context in f.
   354		MOVQ	0(SP), AX // f's PC
   355		MOVQ	AX, (g_sched+gobuf_pc)(SI)
   356		MOVQ	SI, (g_sched+gobuf_g)(SI)
   357		LEAQ	8(SP), AX // f's SP
   358		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   359		MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   360		MOVQ	BP, (g_sched+gobuf_bp)(SI)
   361	
   362		// Call newstack on m->g0's stack.
   363		MOVQ	m_g0(BX), BX
   364		MOVQ	BX, g(CX)
   365		MOVQ	(g_sched+gobuf_sp)(BX), SP
   366		CALL	runtime·newstack(SB)
   367		MOVQ	$0, 0x1003	// crash if newstack returns
   368		RET
   369	
   370	// morestack but not preserving ctxt.
	// Zeroes DX (the closure context register) so morestack stores a nil ctxt.
   371	TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   372		MOVL	$0, DX
   373		JMP	runtime·morestack(SB)
   374	
	// stackBarrier is installed in place of a frame's return PC by the GC;
	// when the frame returns here, it recovers the original return PC from
	// g.stkbar[g.stkbarPos], verifies it matches this SP, advances
	// stkbarPos, and jumps to the original PC.
   375	TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   376		// We came here via a RET to an overwritten return PC.
   377		// AX may be live. Other registers are available.
   378	
   379		// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   380		get_tls(CX)
   381		MOVQ	g(CX), CX
   382		MOVQ	(g_stkbar+slice_array)(CX), DX
   383		MOVQ	g_stkbarPos(CX), BX
   384		IMULQ	$stkbar__size, BX	// Too big for SIB.
   385		MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   386		MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   387		// Assert that we're popping the right saved LR.
   388		ADDQ	$8, R8
   389		CMPQ	R8, SP
   390		JEQ	2(PC)
   391		MOVL	$0, 0		// crash: mismatched stack barrier
   392		// Record that this stack barrier was hit.
   393		ADDQ	$1, g_stkbarPos(CX)
   394		// Jump to the original return PC.
   395		JMP	BX
   396	
   397	// reflectcall: call a function with the given argument list
   398	// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   399	// we don't have variable-sized frames, so we use a small number
   400	// of constant-sized-frame functions to encode a few bits of size in the pc.
   401	// Caution: ugly multiline assembly macros in your future!
   402	
	// DISPATCH tail-calls NAME when the argument size in CX is <= MAXSIZE;
	// otherwise it falls through to the next DISPATCH line.
   403	#define DISPATCH(NAME,MAXSIZE)		\
   404		CMPQ	CX, $MAXSIZE;		\
   405		JA	3(PC);			\
   406		MOVQ	$NAME(SB), AX;		\
   407		JMP	AX
   408	// Note: can't just "JMP NAME(SB)" - bad inlining results.
   409	
	// reflect.call is a legacy alias kept for the reflect package.
   410	TEXT reflect·call(SB), NOSPLIT, $0-0
   411		JMP	·reflectcall(SB)
   412	
   413	TEXT ·reflectcall(SB), NOSPLIT, $0-32
   414		MOVLQZX argsize+24(FP), CX
   415		// NOTE(rsc): No call16, because CALLFN needs four words
   416		// of argument space to invoke callwritebarrier.
   417		DISPATCH(runtime·call32, 32)
   418		DISPATCH(runtime·call64, 64)
   419		DISPATCH(runtime·call128, 128)
   420		DISPATCH(runtime·call256, 256)
   421		DISPATCH(runtime·call512, 512)
   422		DISPATCH(runtime·call1024, 1024)
   423		DISPATCH(runtime·call2048, 2048)
   424		DISPATCH(runtime·call4096, 4096)
   425		DISPATCH(runtime·call8192, 8192)
   426		DISPATCH(runtime·call16384, 16384)
   427		DISPATCH(runtime·call32768, 32768)
   428		DISPATCH(runtime·call65536, 65536)
   429		DISPATCH(runtime·call131072, 131072)
   430		DISPATCH(runtime·call262144, 262144)
   431		DISPATCH(runtime·call524288, 524288)
   432		DISPATCH(runtime·call1048576, 1048576)
   433		DISPATCH(runtime·call2097152, 2097152)
   434		DISPATCH(runtime·call4194304, 4194304)
   435		DISPATCH(runtime·call8388608, 8388608)
   436		DISPATCH(runtime·call16777216, 16777216)
   437		DISPATCH(runtime·call33554432, 33554432)
   438		DISPATCH(runtime·call67108864, 67108864)
   439		DISPATCH(runtime·call134217728, 134217728)
   440		DISPATCH(runtime·call268435456, 268435456)
   441		DISPATCH(runtime·call536870912, 536870912)
   442		DISPATCH(runtime·call1073741824, 1073741824)
   443		MOVQ	$runtime·badreflectcall(SB), AX
   444		JMP	AX
   445	
	// CALLFN defines one fixed-frame-size trampoline used by reflectcall:
	// it copies the packed arguments onto its frame, calls f, copies the
	// results (from retoffset onward) back to argptr, and then invokes
	// callwritebarrier so the GC sees any pointers written into the results.
   446	#define CALLFN(NAME,MAXSIZE)			\
   447	TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   448		NO_LOCAL_POINTERS;			\
   449		/* copy arguments to stack */		\
   450		MOVQ	argptr+16(FP), SI;		\
   451		MOVLQZX argsize+24(FP), CX;		\
   452		MOVQ	SP, DI;				\
   453		REP;MOVSB;				\
   454		/* call function */			\
   455		MOVQ	f+8(FP), DX;			\
   456		PCDATA  $PCDATA_StackMapIndex, $0;	\
   457		CALL	(DX);				\
   458		/* copy return values back */		\
   459		MOVQ	argptr+16(FP), DI;		\
   460		MOVLQZX	argsize+24(FP), CX;		\
   461		MOVLQZX retoffset+28(FP), BX;		\
   462		MOVQ	SP, SI;				\
   463		ADDQ	BX, DI;				\
   464		ADDQ	BX, SI;				\
   465		SUBQ	BX, CX;				\
   466		REP;MOVSB;				\
   467		/* execute write barrier updates */	\
   468		MOVQ	argtype+0(FP), DX;		\
   469		MOVQ	argptr+16(FP), DI;		\
   470		MOVLQZX	argsize+24(FP), CX;		\
   471		MOVLQZX retoffset+28(FP), BX;		\
   472		MOVQ	DX, 0(SP);			\
   473		MOVQ	DI, 8(SP);			\
   474		MOVQ	CX, 16(SP);			\
   475		MOVQ	BX, 24(SP);			\
   476		CALL	runtime·callwritebarrier(SB);	\
   477		RET
   478	
// Instantiate the power-of-two-sized call trampolines that reflectcall
// dispatches to. These must match the DISPATCH list above exactly.
   479	CALLFN(·call32, 32)
   480	CALLFN(·call64, 64)
   481	CALLFN(·call128, 128)
   482	CALLFN(·call256, 256)
   483	CALLFN(·call512, 512)
   484	CALLFN(·call1024, 1024)
   485	CALLFN(·call2048, 2048)
   486	CALLFN(·call4096, 4096)
   487	CALLFN(·call8192, 8192)
   488	CALLFN(·call16384, 16384)
   489	CALLFN(·call32768, 32768)
   490	CALLFN(·call65536, 65536)
   491	CALLFN(·call131072, 131072)
   492	CALLFN(·call262144, 262144)
   493	CALLFN(·call524288, 524288)
   494	CALLFN(·call1048576, 1048576)
   495	CALLFN(·call2097152, 2097152)
   496	CALLFN(·call4194304, 4194304)
   497	CALLFN(·call8388608, 8388608)
   498	CALLFN(·call16777216, 16777216)
   499	CALLFN(·call33554432, 33554432)
   500	CALLFN(·call67108864, 67108864)
   501	CALLFN(·call134217728, 134217728)
   502	CALLFN(·call268435456, 268435456)
   503	CALLFN(·call536870912, 536870912)
   504	CALLFN(·call1073741824, 1073741824)
   505	
// procyield(cycles uint32): busy-wait for the given number of PAUSE
// iterations, hinting to the CPU that this is a spin-wait loop.
   506	TEXT runtime·procyield(SB),NOSPLIT,$0-0
   507		MOVL	cycles+0(FP), AX
   508	again:
   509		PAUSE
   510		SUBL	$1, AX
   511		JNZ	again
   512		RET
   513	
   514	
// publicationBarrier ensures prior stores are visible before later ones.
   515	TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   516		// Stores are already ordered on x86, so this is just a
   517		// compile barrier.
   518		RET
   519	
   520	// void jmpdefer(fn, sp);
   521	// called from deferreturn.
   522	// 1. pop the caller
   523	// 2. sub 5 bytes from the callers return
   524	// 3. jmp to the argument
	// The 5-byte adjustment rewinds the return address to the CALL
	// deferreturn instruction itself, so deferreturn runs again after fn.
   525	TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   526		MOVQ	fv+0(FP), DX	// fn
   527		MOVQ	argp+8(FP), BX	// caller sp
   528		LEAQ	-8(BX), SP	// caller sp after CALL
   529		MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   530		SUBQ	$5, (SP)	// return to CALL again
   531		MOVQ	0(DX), BX
   532		JMP	BX	// but first run the deferred function
   533	
   534	// Save state of caller into g->sched. Smashes R8, R9.
	// File-local helper used by asmcgocall before switching stacks; the
	// saved PC/SP are gosave's caller's, so a traceback resumes there.
   535	TEXT gosave<>(SB),NOSPLIT,$0
   536		get_tls(R8)
   537		MOVQ	g(R8), R8
   538		MOVQ	0(SP), R9		// caller's PC
   539		MOVQ	R9, (g_sched+gobuf_pc)(R8)
   540		LEAQ	8(SP), R9		// caller's SP
   541		MOVQ	R9, (g_sched+gobuf_sp)(R8)
   542		MOVQ	$0, (g_sched+gobuf_ret)(R8)
   543		MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   544		MOVQ	BP, (g_sched+gobuf_bp)(R8)
   545		RET
   546	
   547	// func asmcgocall(fn, arg unsafe.Pointer) int32
   548	// Call fn(arg) on the scheduler stack,
   549	// aligned appropriately for the gcc ABI.
   550	// See cgocall.go for more details.
   551	TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   552		MOVQ	fn+0(FP), AX
   553		MOVQ	arg+8(FP), BX
   554	
   555		MOVQ	SP, DX		// DX = SP on entry, for computing stack depth
   556	
   557		// Figure out if we need to switch to m->g0 stack.
   558		// We get called to create new OS threads too, and those
   559		// come in on the m->g0 stack already.
   560		get_tls(CX)
   561		MOVQ	g(CX), R8
   562		CMPQ	R8, $0
   563		JEQ	nosave
   564		MOVQ	g_m(R8), R8
   565		MOVQ	m_g0(R8), SI
   566		MOVQ	g(CX), DI
   567		CMPQ	SI, DI
   568		JEQ	nosave
   569		MOVQ	m_gsignal(R8), SI
   570		CMPQ	SI, DI
   571		JEQ	nosave
   572		
   573		// Switch to system stack.
   574		MOVQ	m_g0(R8), SI
   575		CALL	gosave<>(SB)
   576		MOVQ	SI, g(CX)
   577		MOVQ	(g_sched+gobuf_sp)(SI), SP
   578	
   579		// Now on a scheduling stack (a pthread-created stack).
   580		// Make sure we have enough room for 4 stack-backed fast-call
   581		// registers as per windows amd64 calling convention.
   582		SUBQ	$64, SP
   583		ANDQ	$~15, SP	// alignment for gcc ABI
   584		MOVQ	DI, 48(SP)	// save g
   585		MOVQ	(g_stack+stack_hi)(DI), DI
   586		SUBQ	DX, DI
   587		MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   588		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   589		MOVQ	BX, CX		// CX = first argument in Win64
   590		CALL	AX
   591	
   592		// Restore registers, g, stack pointer.
   593		get_tls(CX)
   594		MOVQ	48(SP), DI
   595		MOVQ	(g_stack+stack_hi)(DI), SI
   596		SUBQ	40(SP), SI	// recompute original SP from the saved depth
   597		MOVQ	DI, g(CX)
   598		MOVQ	SI, SP
   599	
   600		MOVL	AX, ret+16(FP)
   601		RET
   602	
   603	nosave:
   604		// Running on a system stack, perhaps even without a g.
   605		// Having no g can happen during thread creation or thread teardown
   606		// (see needm/dropm on Solaris, for example).
   607		// This code is like the above sequence but without saving/restoring g
   608		// and without worrying about the stack moving out from under us
   609		// (because we're on a system stack, not a goroutine stack).
   610		// The above code could be used directly if already on a system stack,
   611		// but then the only path through this code would be a rare case on Solaris.
   612		// Using this code for all "already on system stack" calls exercises it more,
   613		// which should help keep it correct.
   614		SUBQ	$64, SP
   615		ANDQ	$~15, SP
   616		MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   617		MOVQ	DX, 40(SP)	// save original stack pointer
   618		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   619		MOVQ	BX, CX		// CX = first argument in Win64
   620		CALL	AX
   621		MOVQ	40(SP), SI	// restore original stack pointer
   622		MOVQ	SI, SP
   623		MOVL	AX, ret+16(FP)
   624		RET
   625	
   626	// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   627	// Turn the fn into a Go func (by taking its address) and call
   628	// cgocallback_gofunc.
	// The four arguments are simply forwarded on the 32-byte frame.
   629	TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   630		LEAQ	fn+0(FP), AX
   631		MOVQ	AX, 0(SP)
   632		MOVQ	frame+8(FP), AX
   633		MOVQ	AX, 8(SP)
   634		MOVQ	framesize+16(FP), AX
   635		MOVQ	AX, 16(SP)
   636		MOVQ	ctxt+24(FP), AX
   637		MOVQ	AX, 24(SP)
   638		MOVQ	$runtime·cgocallback_gofunc(SB), AX
   639		CALL	AX
   640		RET
   641	
   642	// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   643	// See cgocall.go for more details.
	// Entered on a C thread's stack (running as m->g0, or with no m at all).
	// Acquires an m if needed, switches to m->curg to run cgocallbackg,
	// then restores the g0 state and releases a borrowed m via dropm.
   644	TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   645		NO_LOCAL_POINTERS
   646	
   647		// If g is nil, Go did not create the current thread.
   648		// Call needm to obtain one m for temporary use.
   649		// In this case, we're running on the thread stack, so there's
   650		// lots of space, but the linker doesn't know. Hide the call from
   651		// the linker analysis by using an indirect call through AX.
   652		get_tls(CX)
   653	#ifdef GOOS_windows
   654		MOVL	$0, BX
   655		CMPQ	CX, $0
   656		JEQ	2(PC)
   657	#endif
   658		MOVQ	g(CX), BX
   659		CMPQ	BX, $0
   660		JEQ	needm
   661		MOVQ	g_m(BX), BX
   662		MOVQ	BX, R8 // holds oldm until end of function
   663		JMP	havem
   664	needm:
   665		MOVQ	$0, 0(SP)		// R8 slot: nil oldm marks the borrowed-m case
   666		MOVQ	$runtime·needm(SB), AX
   667		CALL	AX
   668		MOVQ	0(SP), R8
   669		get_tls(CX)
   670		MOVQ	g(CX), BX
   671		MOVQ	g_m(BX), BX
   672		
   673		// Set m->sched.sp = SP, so that if a panic happens
   674		// during the function we are about to execute, it will
   675		// have a valid SP to run on the g0 stack.
   676		// The next few lines (after the havem label)
   677		// will save this SP onto the stack and then write
   678		// the same SP back to m->sched.sp. That seems redundant,
   679		// but if an unrecovered panic happens, unwindm will
   680		// restore the g->sched.sp from the stack location
   681		// and then systemstack will try to use it. If we don't set it here,
   682		// that restored SP will be uninitialized (typically 0) and
   683		// will not be usable.
   684		MOVQ	m_g0(BX), SI
   685		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   686	
   687	havem:
   688		// Now there's a valid m, and we're running on its m->g0.
   689		// Save current m->g0->sched.sp on stack and then set it to SP.
   690		// Save current sp in m->g0->sched.sp in preparation for
   691		// switch back to m->curg stack.
   692		// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   693		MOVQ	m_g0(BX), SI
   694		MOVQ	(g_sched+gobuf_sp)(SI), AX
   695		MOVQ	AX, 0(SP)
   696		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   697	
   698		// Switch to m->curg stack and call runtime.cgocallbackg.
   699		// Because we are taking over the execution of m->curg
   700		// but *not* resuming what had been running, we need to
   701		// save that information (m->curg->sched) so we can restore it.
   702		// We can restore m->curg->sched.sp easily, because calling
   703		// runtime.cgocallbackg leaves SP unchanged upon return.
   704		// To save m->curg->sched.pc, we push it onto the stack.
   705		// This has the added benefit that it looks to the traceback
   706		// routine like cgocallbackg is going to return to that
   707		// PC (because the frame we allocate below has the same
   708		// size as cgocallback_gofunc's frame declared above)
   709		// so that the traceback will seamlessly trace back into
   710		// the earlier calls.
   711		//
   712		// In the new goroutine, 8(SP) holds the saved R8.
   713		MOVQ	m_curg(BX), SI
   714		MOVQ	SI, g(CX)
   715		MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   716		MOVQ	(g_sched+gobuf_pc)(SI), BX
   717		MOVQ	BX, -8(DI)
   718		// Compute the size of the frame, including return PC and, if
   719		// GOEXPERIMENT=framepointer, the saved based pointer
   720		MOVQ	ctxt+24(FP), BX
   721		LEAQ	fv+0(FP), AX
   722		SUBQ	SP, AX		// AX = frame size (FP - SP distance)
   723		SUBQ	AX, DI
   724		MOVQ	DI, SP
   725	
   726		MOVQ	R8, 8(SP)
   727		MOVQ	BX, 0(SP)	// ctxt argument to cgocallbackg
   728		CALL	runtime·cgocallbackg(SB)
   729		MOVQ	8(SP), R8
   730	
   731		// Compute the size of the frame again. FP and SP have
   732		// completely different values here than they did above,
   733		// but only their difference matters.
   734		LEAQ	fv+0(FP), AX
   735		SUBQ	SP, AX
   736	
   737		// Restore g->sched (== m->curg->sched) from saved values.
   738		get_tls(CX)
   739		MOVQ	g(CX), SI
   740		MOVQ	SP, DI
   741		ADDQ	AX, DI
   742		MOVQ	-8(DI), BX
   743		MOVQ	BX, (g_sched+gobuf_pc)(SI)
   744		MOVQ	DI, (g_sched+gobuf_sp)(SI)
   745	
   746		// Switch back to m->g0's stack and restore m->g0->sched.sp.
   747		// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   748		// so we do not have to restore it.)
   749		MOVQ	g(CX), BX
   750		MOVQ	g_m(BX), BX
   751		MOVQ	m_g0(BX), SI
   752		MOVQ	SI, g(CX)
   753		MOVQ	(g_sched+gobuf_sp)(SI), SP
   754		MOVQ	0(SP), AX
   755		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   756		
   757		// If the m on entry was nil, we called needm above to borrow an m
   758		// for the duration of the call. Since the call is over, return it with dropm.
   759		CMPQ	R8, $0
   760		JNE 3(PC)	// oldm != nil: this thread owned an m, skip dropm
   761		MOVQ	$runtime·dropm(SB), AX
   762		CALL	AX
   763	
   764		// Done!
   765		RET
   766	
   767	// void setg(G*); set g. for use by needm.
	// On Windows the TLS slot at 0x28(GS) is also kept in sync so that
	// the C-side TLS access matches the runtime's view.
   768	TEXT runtime·setg(SB), NOSPLIT, $0-8
   769		MOVQ	gg+0(FP), BX
   770	#ifdef GOOS_windows
   771		CMPQ	BX, $0
   772		JNE	settls
   773		MOVQ	$0, 0x28(GS)
   774		RET
   775	settls:
   776		MOVQ	g_m(BX), AX
   777		LEAQ	m_tls(AX), AX
   778		MOVQ	AX, 0x28(GS)
   779	#endif
   780		get_tls(CX)
   781		MOVQ	BX, g(CX)
   782		RET
   783	
   784	// void setg_gcc(G*); set g called from gcc.
	// C (SysV) ABI entry: the new g arrives in DI.
   785	TEXT setg_gcc<>(SB),NOSPLIT,$0
   786		get_tls(AX)
   787		MOVQ	DI, g(AX)
   788		RET
   789	
   790	// check that SP is in range [g->stack.lo, g->stack.hi)
	// Debug aid: traps with INT 3 if SP is outside the current g's stack.
   791	TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   792		get_tls(CX)
   793		MOVQ	g(CX), AX
   794		CMPQ	(g_stack+stack_hi)(AX), SP
   795		JHI	2(PC)
   796		INT	$3		// SP >= stack.hi
   797		CMPQ	SP, (g_stack+stack_lo)(AX)
   798		JHI	2(PC)
   799		INT	$3		// SP <= stack.lo
   800		RET
   801	
// getcallerpc returns the caller's return PC for the frame whose first
// argument is at argp. If that PC is the stack-barrier trampoline, the
// original return PC is recovered via nextBarrierPC instead.
   802	TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   803		MOVQ	argp+0(FP),AX		// addr of first arg
   804		MOVQ	-8(AX),AX		// get calling pc
   805		CMPQ	AX, runtime·stackBarrierPC(SB)
   806		JNE	nobar
   807		// Get original return PC.
   808		CALL	runtime·nextBarrierPC(SB)
   809		MOVQ	0(SP), AX
   810	nobar:
   811		MOVQ	AX, ret+8(FP)
   812		RET
   813	
// setcallerpc overwrites the caller's return PC for the frame whose first
// argument is at argp. If the slot currently holds the stack-barrier
// trampoline, the saved barrier PC is updated instead, leaving the
// trampoline in place.
   814	TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   815		MOVQ	argp+0(FP),AX		// addr of first arg
   816		MOVQ	pc+8(FP), BX
   817		MOVQ	-8(AX), CX
   818		CMPQ	CX, runtime·stackBarrierPC(SB)
   819		JEQ	setbar
   820		MOVQ	BX, -8(AX)		// set calling pc
   821		RET
   822	setbar:
   823		// Set the stack barrier return PC.
   824		MOVQ	BX, 0(SP)
   825		CALL	runtime·setNextBarrierPC(SB)
   826		RET
   827	
// getcallersp returns argp itself: the caller's SP at the call site.
   828	TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   829		MOVQ	argp+0(FP), AX
   830		MOVQ	AX, ret+8(FP)
   831		RET
   832	
   833	// func cputicks() int64
	// Reads the time-stamp counter, serialized with LFENCE on Intel
	// (lfenceBeforeRdtsc set by rt0_go) or MFENCE otherwise.
   834	TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   835		CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   836		JNE	mfence
   837		LFENCE
   838		JMP	done
   839	mfence:
   840		MFENCE
   841	done:
   842		RDTSC
   843		SHLQ	$32, DX		// combine EDX:EAX into a 64-bit tick count
   844		ADDQ	DX, AX
   845		MOVQ	AX, ret+0(FP)
   846		RET
   847	
   848	// memhash_varlen(p unsafe.Pointer, h seed) uintptr
   849	// redirects to memhash(p, h, size) using the size
   850	// stored in the closure.
	// DX holds the closure pointer on entry; the size lives at 8(DX).
   851	TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   852		GO_ARGS
   853		NO_LOCAL_POINTERS
   854		MOVQ	p+0(FP), AX
   855		MOVQ	h+8(FP), BX
   856		MOVQ	8(DX), CX
   857		MOVQ	AX, 0(SP)
   858		MOVQ	BX, 8(SP)
   859		MOVQ	CX, 16(SP)
   860		CALL	runtime·memhash(SB)
   861		MOVQ	24(SP), AX
   862		MOVQ	AX, ret+16(FP)
   863		RET
   864	
   865	// hash function using AES hardware instructions
	// Loads data pointer, length, and result address into the AX/CX/DX
	// registers expected by aeshashbody, then tail-jumps to it.
   866	TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   867		MOVQ	p+0(FP), AX	// ptr to data
   868		MOVQ	s+16(FP), CX	// size
   869		LEAQ	ret+24(FP), DX
   870		JMP	runtime·aeshashbody(SB)
   871	
// aeshashstr hashes a string: unpacks the string header (data pointer and
// length) into the AX/CX registers expected by aeshashbody and tail-jumps.
   872	TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   873		MOVQ	p+0(FP), AX	// ptr to string struct
   874		MOVQ	8(AX), CX	// length of string
   875		MOVQ	(AX), AX	// string data
   876		LEAQ	ret+16(FP), DX
   877		JMP	runtime·aeshashbody(SB)
   878	
   879	// AX: data
   880	// CX: length
   881	// DX: address to put return value
   882	TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   883		// Fill an SSE register with our seeds.
   884		MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   885		PINSRW	$4, CX, X0			// 16 bits of length
   886		PSHUFHW $0, X0, X0			// repeat length 4 times total
   887		MOVO	X0, X1				// save unscrambled seed
   888		PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   889		AESENC	X0, X0				// scramble seed
   890	
   891		CMPQ	CX, $16
   892		JB	aes0to15
   893		JE	aes16
   894		CMPQ	CX, $32
   895		JBE	aes17to32
   896		CMPQ	CX, $64
   897		JBE	aes33to64
   898		CMPQ	CX, $128
   899		JBE	aes65to128
   900		JMP	aes129plus
   901	
   902	aes0to15:
   903		TESTQ	CX, CX
   904		JE	aes0
   905	
   906		ADDQ	$16, AX
   907		TESTW	$0xff0, AX
   908		JE	endofpage
   909	
   910		// 16 bytes loaded at this address won't cross
   911		// a page boundary, so we can load it directly.
   912		MOVOU	-16(AX), X1
   913		ADDQ	CX, CX
   914		MOVQ	$masks<>(SB), AX
   915		PAND	(AX)(CX*8), X1
   916	final1:
   917		PXOR	X0, X1	// xor data with seed
   918		AESENC	X1, X1	// scramble combo 3 times
   919		AESENC	X1, X1
   920		AESENC	X1, X1
   921		MOVQ	X1, (DX)
   922		RET
   923	
   924	endofpage:
   925		// address ends in 1111xxxx. Might be up against
   926		// a page boundary, so load ending at last byte.
   927		// Then shift bytes down using pshufb.
   928		MOVOU	-32(AX)(CX*1), X1
   929		ADDQ	CX, CX
   930		MOVQ	$shifts<>(SB), AX
   931		PSHUFB	(AX)(CX*8), X1
   932		JMP	final1
   933	
   934	aes0:
   935		// Return scrambled input seed
   936		AESENC	X0, X0
   937		MOVQ	X0, (DX)
   938		RET
   939	
   940	aes16:
   941		MOVOU	(AX), X1
   942		JMP	final1
   943	
   944	aes17to32:
   945		// make second starting seed
   946		PXOR	runtime·aeskeysched+16(SB), X1
   947		AESENC	X1, X1
   948		
   949		// load data to be hashed
   950		MOVOU	(AX), X2
   951		MOVOU	-16(AX)(CX*1), X3
   952	
   953		// xor with seed
   954		PXOR	X0, X2
   955		PXOR	X1, X3
   956	
   957		// scramble 3 times
   958		AESENC	X2, X2
   959		AESENC	X3, X3
   960		AESENC	X2, X2
   961		AESENC	X3, X3
   962		AESENC	X2, X2
   963		AESENC	X3, X3
   964	
   965		// combine results
   966		PXOR	X3, X2
   967		MOVQ	X2, (DX)
   968		RET
   969	
   970	aes33to64:
   971		// make 3 more starting seeds
   972		MOVO	X1, X2
   973		MOVO	X1, X3
   974		PXOR	runtime·aeskeysched+16(SB), X1
   975		PXOR	runtime·aeskeysched+32(SB), X2
   976		PXOR	runtime·aeskeysched+48(SB), X3
   977		AESENC	X1, X1
   978		AESENC	X2, X2
   979		AESENC	X3, X3
   980		
   981		MOVOU	(AX), X4
   982		MOVOU	16(AX), X5
   983		MOVOU	-32(AX)(CX*1), X6
   984		MOVOU	-16(AX)(CX*1), X7
   985	
   986		PXOR	X0, X4
   987		PXOR	X1, X5
   988		PXOR	X2, X6
   989		PXOR	X3, X7
   990		
   991		AESENC	X4, X4
   992		AESENC	X5, X5
   993		AESENC	X6, X6
   994		AESENC	X7, X7
   995		
   996		AESENC	X4, X4
   997		AESENC	X5, X5
   998		AESENC	X6, X6
   999		AESENC	X7, X7
  1000		
  1001		AESENC	X4, X4
  1002		AESENC	X5, X5
  1003		AESENC	X6, X6
  1004		AESENC	X7, X7
  1005	
  1006		PXOR	X6, X4
  1007		PXOR	X7, X5
  1008		PXOR	X5, X4
  1009		MOVQ	X4, (DX)
  1010		RET
  1011	
  1012	aes65to128:
  1013		// make 7 more starting seeds
  1014		MOVO	X1, X2
  1015		MOVO	X1, X3
  1016		MOVO	X1, X4
  1017		MOVO	X1, X5
  1018		MOVO	X1, X6
  1019		MOVO	X1, X7
  1020		PXOR	runtime·aeskeysched+16(SB), X1
  1021		PXOR	runtime·aeskeysched+32(SB), X2
  1022		PXOR	runtime·aeskeysched+48(SB), X3
  1023		PXOR	runtime·aeskeysched+64(SB), X4
  1024		PXOR	runtime·aeskeysched+80(SB), X5
  1025		PXOR	runtime·aeskeysched+96(SB), X6
  1026		PXOR	runtime·aeskeysched+112(SB), X7
  1027		AESENC	X1, X1
  1028		AESENC	X2, X2
  1029		AESENC	X3, X3
  1030		AESENC	X4, X4
  1031		AESENC	X5, X5
  1032		AESENC	X6, X6
  1033		AESENC	X7, X7
  1034	
  1035		// load data
  1036		MOVOU	(AX), X8
  1037		MOVOU	16(AX), X9
  1038		MOVOU	32(AX), X10
  1039		MOVOU	48(AX), X11
  1040		MOVOU	-64(AX)(CX*1), X12
  1041		MOVOU	-48(AX)(CX*1), X13
  1042		MOVOU	-32(AX)(CX*1), X14
  1043		MOVOU	-16(AX)(CX*1), X15
  1044	
  1045		// xor with seed
  1046		PXOR	X0, X8
  1047		PXOR	X1, X9
  1048		PXOR	X2, X10
  1049		PXOR	X3, X11
  1050		PXOR	X4, X12
  1051		PXOR	X5, X13
  1052		PXOR	X6, X14
  1053		PXOR	X7, X15
  1054	
  1055		// scramble 3 times
  1056		AESENC	X8, X8
  1057		AESENC	X9, X9
  1058		AESENC	X10, X10
  1059		AESENC	X11, X11
  1060		AESENC	X12, X12
  1061		AESENC	X13, X13
  1062		AESENC	X14, X14
  1063		AESENC	X15, X15
  1064	
  1065		AESENC	X8, X8
  1066		AESENC	X9, X9
  1067		AESENC	X10, X10
  1068		AESENC	X11, X11
  1069		AESENC	X12, X12
  1070		AESENC	X13, X13
  1071		AESENC	X14, X14
  1072		AESENC	X15, X15
  1073	
  1074		AESENC	X8, X8
  1075		AESENC	X9, X9
  1076		AESENC	X10, X10
  1077		AESENC	X11, X11
  1078		AESENC	X12, X12
  1079		AESENC	X13, X13
  1080		AESENC	X14, X14
  1081		AESENC	X15, X15
  1082	
  1083		// combine results
  1084		PXOR	X12, X8
  1085		PXOR	X13, X9
  1086		PXOR	X14, X10
  1087		PXOR	X15, X11
  1088		PXOR	X10, X8
  1089		PXOR	X11, X9
  1090		PXOR	X9, X8
  1091		MOVQ	X8, (DX)
  1092		RET
  1093	
  1094	aes129plus:
  1095		// make 7 more starting seeds
  1096		MOVO	X1, X2
  1097		MOVO	X1, X3
  1098		MOVO	X1, X4
  1099		MOVO	X1, X5
  1100		MOVO	X1, X6
  1101		MOVO	X1, X7
  1102		PXOR	runtime·aeskeysched+16(SB), X1
  1103		PXOR	runtime·aeskeysched+32(SB), X2
  1104		PXOR	runtime·aeskeysched+48(SB), X3
  1105		PXOR	runtime·aeskeysched+64(SB), X4
  1106		PXOR	runtime·aeskeysched+80(SB), X5
  1107		PXOR	runtime·aeskeysched+96(SB), X6
  1108		PXOR	runtime·aeskeysched+112(SB), X7
  1109		AESENC	X1, X1
  1110		AESENC	X2, X2
  1111		AESENC	X3, X3
  1112		AESENC	X4, X4
  1113		AESENC	X5, X5
  1114		AESENC	X6, X6
  1115		AESENC	X7, X7
  1116		
  1117		// start with last (possibly overlapping) block
  1118		MOVOU	-128(AX)(CX*1), X8
  1119		MOVOU	-112(AX)(CX*1), X9
  1120		MOVOU	-96(AX)(CX*1), X10
  1121		MOVOU	-80(AX)(CX*1), X11
  1122		MOVOU	-64(AX)(CX*1), X12
  1123		MOVOU	-48(AX)(CX*1), X13
  1124		MOVOU	-32(AX)(CX*1), X14
  1125		MOVOU	-16(AX)(CX*1), X15
  1126	
  1127		// xor in seed
  1128		PXOR	X0, X8
  1129		PXOR	X1, X9
  1130		PXOR	X2, X10
  1131		PXOR	X3, X11
  1132		PXOR	X4, X12
  1133		PXOR	X5, X13
  1134		PXOR	X6, X14
  1135		PXOR	X7, X15
  1136		
  1137		// compute number of remaining 128-byte blocks
  1138		DECQ	CX
  1139		SHRQ	$7, CX
  1140		
  1141	aesloop:
  1142		// scramble state
  1143		AESENC	X8, X8
  1144		AESENC	X9, X9
  1145		AESENC	X10, X10
  1146		AESENC	X11, X11
  1147		AESENC	X12, X12
  1148		AESENC	X13, X13
  1149		AESENC	X14, X14
  1150		AESENC	X15, X15
  1151	
  1152		// scramble state, xor in a block
  1153		MOVOU	(AX), X0
  1154		MOVOU	16(AX), X1
  1155		MOVOU	32(AX), X2
  1156		MOVOU	48(AX), X3
  1157		AESENC	X0, X8
  1158		AESENC	X1, X9
  1159		AESENC	X2, X10
  1160		AESENC	X3, X11
  1161		MOVOU	64(AX), X4
  1162		MOVOU	80(AX), X5
  1163		MOVOU	96(AX), X6
  1164		MOVOU	112(AX), X7
  1165		AESENC	X4, X12
  1166		AESENC	X5, X13
  1167		AESENC	X6, X14
  1168		AESENC	X7, X15
  1169	
  1170		ADDQ	$128, AX
  1171		DECQ	CX
  1172		JNE	aesloop
  1173	
  1174		// 3 more scrambles to finish
  1175		AESENC	X8, X8
  1176		AESENC	X9, X9
  1177		AESENC	X10, X10
  1178		AESENC	X11, X11
  1179		AESENC	X12, X12
  1180		AESENC	X13, X13
  1181		AESENC	X14, X14
  1182		AESENC	X15, X15
  1183		AESENC	X8, X8
  1184		AESENC	X9, X9
  1185		AESENC	X10, X10
  1186		AESENC	X11, X11
  1187		AESENC	X12, X12
  1188		AESENC	X13, X13
  1189		AESENC	X14, X14
  1190		AESENC	X15, X15
  1191		AESENC	X8, X8
  1192		AESENC	X9, X9
  1193		AESENC	X10, X10
  1194		AESENC	X11, X11
  1195		AESENC	X12, X12
  1196		AESENC	X13, X13
  1197		AESENC	X14, X14
  1198		AESENC	X15, X15
  1199	
  1200		PXOR	X12, X8
  1201		PXOR	X13, X9
  1202		PXOR	X14, X10
  1203		PXOR	X15, X11
  1204		PXOR	X10, X8
  1205		PXOR	X11, X9
  1206		PXOR	X9, X8
  1207		MOVQ	X8, (DX)
  1208		RET
  1209		
// aeshash32 is the AES-based hash for 4-byte keys.
// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
// It mixes the seed h with the 4 data bytes at p (inserted into dword
// lane 2 of X0 by PINSRD) and runs three AESENC rounds keyed from
// runtime·aeskeysched. Requires AES-NI; callers select this only when
// the CPU supports it.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed in low 64 bits of X0
	PINSRD	$2, (AX), X0	// data: 4 bytes into dword lane 2
	AESENC	runtime·aeskeysched+0(SB), X0	// 3 rounds of scrambling
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)	// low 64 bits are the hash
	RET
  1219	
// aeshash64 is the AES-based hash for 8-byte keys.
// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
// Same scheme as aeshash32, but the 8 data bytes at p go into qword
// lane 1 of X0 via PINSRQ. Requires AES-NI.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed in low 64 bits of X0
	PINSRQ	$1, (AX), X0	// data: 8 bytes into qword lane 1
	AESENC	runtime·aeskeysched+0(SB), X0	// 3 rounds of scrambling
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)	// low 64 bits are the hash
	RET
  1229	
// simple mask to get rid of data in the high part of the register.
// Table of 16 128-bit masks: entry i (the 16 bytes at offset i*16) has
// its low i bytes set to 0xff and the rest zero. The hash code PANDs a
// partially-loaded 16-byte block with masks<>(SB)(CX*8) (CX doubled
// beforehand) to keep only the i valid data bytes. Must stay 16-byte
// aligned; ·checkASM verifies that at startup.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
  1264	
// checkASM reports whether the assembly data tables satisfy their
// alignment requirements.
// func checkASM() bool
// Returns true iff masks<>(SB) and shifts<>(SB) are both 16-byte
// aligned (their addresses are ORed together, so a single TESTQ of the
// low 4 bits checks both at once).
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX		// misalignment bits of either address
	TESTQ	$15, AX
	SETEQ	ret+0(FP)	// true if both addresses have low 4 bits clear
	RET
  1273	
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// Entry i is the 16-byte shuffle-control vector at offset i*16. The
// 0xff control bytes zero the corresponding destination byte (PSHUFB
// zeroes a lane when the control byte's high bit is set). Used by the
// hash code's end-of-page path, which loads ending at the last byte and
// then shifts the i valid bytes down to the bottom of the register.
// Must stay 16-byte aligned; ·checkASM verifies that at startup.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
  1310	
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Reports whether the size bytes at a and b are identical.
// Fast path: identical pointers compare equal without touching memory.
// Otherwise tail-jumps to memeqbody with its register contract:
// SI=a, DI=b, BX=count, AX=&result.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer => trivially equal
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX	// memeqbody writes the bool here
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET
  1323	
// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant of memequal: the byte count is not an argument
// but is read from the closure context in DX (the compiler stores the
// size at offset 8 in the closure). Same memeqbody register contract.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer => trivially equal
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX	// memeqbody writes the bool here
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
  1336	
// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
// Because lengths are guaranteed equal, only the data pointers and
// s1's length are loaded before tail-jumping to memeqbody
// (SI=s1 data, DI=s2 data, BX=length, AX=&result).
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same backing pointer (and equal length) => equal
	MOVQ	s1len+8(FP), BX
	LEAQ	v+32(FP), AX	// memeqbody writes the bool here
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET
  1353	
// a in SI
// b in DI
// count in BX
// address of result byte in AX
// memeqbody is the shared comparison core for memequal, memequal_varlen,
// eqstring and bytes·Equal. It writes 1 to (AX) if the BX bytes at SI
// and DI match, 0 otherwise. Strategy by size: >=64 bytes uses 64-byte
// SIMD chunks (AVX2 if runtime·support_avx2 says so, else SSE), 8..63
// bytes uses 8-byte GP loads, and <8 bytes uses overlapped/shifted loads
// that are careful never to read across a page boundary.
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB    runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0		// per-byte equality masks...
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0		// ...ANDed together: all 64 bytes must match
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> chunk equal
	JEQ	hugeloop
	MOVB	$0, (AX)	// mismatch found
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6	// both 32-byte halves must match
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set <=> chunk equal
	JEQ	hugeloop_avx2
	VZEROUPPER		// leave AVX state before returning to SSE world
	MOVB	$0, (AX)	// mismatch found
	RET

bigloop_avx2:
	VZEROUPPER		// done with ymm registers; avoid AVX->SSE penalty

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)	// mismatch found
	RET

	// remaining 0-8 bytes
leftover:
	// Overlapping 8-byte loads ending at the last byte; safe because
	// BX >= 1 here and both buffers have at least 8 readable bytes.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal		// zero-length buffers are equal (flags: ZF set)

	LEAQ	0(BX*8), CX	// bytes -> bits
	NEGQ	CX		// CX = 64 - 8*BX (mod 64), shift count below

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// Compare only the low BX bytes: shift out the don't-care high bytes
	// and test the remainder for zero.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)		// ZF from SHLQ (or from the CMPQ for BX==0)
	RET
  1474	
// cmpstring compares two strings, returning -1/0/+1.
// func cmpstring(s1, s2 string) int
// Loads both string headers and tail-jumps to cmpbody with its register
// contract: SI/BX = s1 base/len, DI/DX = s2 base/len, R9 = &result.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9	// cmpbody stores -1/0/+1 here
	JMP	runtime·cmpbody(SB)
  1482	
// bytes.Compare compares two byte slices, returning -1/0/+1.
// func Compare(s1, s2 []byte) int
// Slice headers are (ptr, len, cap), hence the +24/+32 offsets for s2.
// Capacities are ignored; tail-jumps to cmpbody (SI/BX = s1, DI/DX = s2,
// R9 = &result).
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9	// cmpbody stores -1/0/+1 here
	JMP	runtime·cmpbody(SB)
  1490	
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
// cmpbody is the shared three-way-compare core for cmpstring and
// bytes·Compare. It compares min(alen, blen) bytes; if that prefix is
// identical the shorter operand sorts first. Size strategy: >=64 bytes
// uses 64-byte SIMD chunks (AVX2 when runtime·support_avx2), 8..63
// bytes uses 16-byte SSE chunks then 8-byte GP loads, and <8 bytes uses
// page-boundary-safe shifted loads.
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB    runtime·support_avx2(SB), $1
	JEQ     big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 adjust SI/DI so that the differing 16-byte
	// chunk found by big_loop is the one diff16 indexes into.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX		// unsigned byte compare: 1 if a's byte > b's
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Overlapping loads ending at the last byte of the common prefix.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high		// load would cross a page boundary
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI	// load ending at last byte instead
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// discard bytes beyond the common prefix

	// load bytes of b in to high bytes of BX
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// Common prefixes are equal: order is decided by the lengths alone.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
  1696	
  1697	
// TODO: Also use this in bytes.Index
// indexShortStr returns the index of the first occurrence of the
// separator c in s, or -1 if absent.
// func indexShortStr(s, c string) int
// The search is specialized by len(c): for each size class (2, 3, 4,
// 5-7, 8, 9-15, 16, 17-31 bytes) the separator is preloaded into
// registers (with overlapping head/tail loads for the in-between sizes)
// and the code slides a one-byte-at-a-time window over s. When
// len(s) >= 16 and the CPU has SSE4.2 (cpuid ECX bit 20), short
// separators (< 12 bytes) use PCMPESTRI instead, advancing up to
// 16-len(c) bytes per compare. Note: callers guarantee len(c) >= 2
// here; the 0/1-byte cases are presumably handled in Go code before
// this is reached — TODO confirm against strings.Index.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	CMPQ AX, DX
	JA fail			// separator longer than s: cannot occur
	CMPQ DX, $16
	JAE sse42
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	// 2-byte separator: single 16-bit compare per position.
	MOVW (BP), BP
	LEAQ -1(DI)(DX*1), DX	// DX = one past the last valid start position
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	// 3-byte separator: 16-bit head compare, then overlapping 16-bit tail.
	MOVW 1(BP), BX
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	// 4-byte separator: single 32-bit compare per position.
	MOVL (BP), BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	// 5-7 bytes: overlapping 32-bit head and 32-bit tail compares.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVL -4(BP)(AX*1), BX	// last 4 bytes of separator
	MOVL (BP), BP		// first 4 bytes of separator
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI	// last 4 bytes of the candidate window
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	// 8-byte separator: single 64-bit compare per position.
	MOVQ (BP), BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $16
	JA   _16_or_more
	// 9-15 bytes: overlapping 64-bit head and 64-bit tail compares.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX	// last 8 bytes of separator
	MOVQ (BP), BP		// first 8 bytes of separator
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI	// last 8 bytes of the candidate window
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_to_31
	// 16-byte separator: one full 128-bit compare per position.
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff	// all 16 bytes equal
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_to_31:
	// 17-31 bytes: overlapping 128-bit head and 128-bit tail compares.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0	// last 16 bytes of separator
	MOVOU (BP), X1		// first 16 bytes of separator
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3	// last 16 bytes of the candidate window
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
fail:
	MOVQ $-1, ret+32(FP)
	RET
sse42:
	MOVL runtime·cpuid_ecx(SB), CX
	ANDL $0x100000, CX	// ECX bit 20 = SSE4.2
	JZ no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	LEAQ 16(BP), SI
	TESTW $0xff0, SI	// would loading 16 bytes of c cross a page?
	JEQ no_sse42		// yes: fall back to the scalar loops
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI	// SI = limit for full 16-byte loads from s
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Final (possibly overlapping) 16-byte window ending at the last byte.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI		// DI = absolute address of the match
success:
	SUBQ s+0(FP), DI	// convert address to index within s
	MOVQ DI, ret+32(FP)
	RET
  1899	
  1900	
// bytes.IndexByte returns the index of the first c in s, or -1.
// func IndexByte(s []byte, c byte) int
// Thin wrapper: loads the slice header and byte, then tail-jumps to
// indexbytebody (SI=data, BX=len, AL=byte, R8=&result).
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL	// c sits after the 24-byte slice header
	LEAQ ret+32(FP), R8
	JMP  runtime·indexbytebody(SB)
  1907	
// strings.IndexByte returns the index of the first c in s, or -1.
// func IndexByte(s string, c byte) int
// Thin wrapper: loads the string header and byte, then tail-jumps to
// indexbytebody (SI=data, BX=len, AL=byte, R8=&result).
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL	// c sits after the 16-byte string header
	LEAQ ret+24(FP), R8
	JMP  runtime·indexbytebody(SB)
  1914	
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// indexbytebody is the shared core of bytes.IndexByte and
// strings.IndexByte. It stores the index of the first occurrence of AL
// in the BX bytes at SI to (R8), or -1 if absent. Strategy: lengths
// < 16 use a single (page-boundary-safe) 16-byte SSE compare; 16..32
// bytes use the SSE loop; > 32 bytes use 32-byte AVX2 chunks when
// runtime·support_avx2 is set, falling back to SSE otherwise.
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI		// DI = current search position

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure		// empty input: not found

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse			// no AVX2 on this CPU: use the SSE loop
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// Y1 = target byte replicated 32 times
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3		// any nonzero byte => a match in this chunk
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Final (possibly overlapping) 32-byte chunk ending at the last byte.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER		// leave AVX state before returning
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX		// index of match within chunk
	SUBQ SI, DI		// offset of chunk within data
	ADDQ DI, DX
	MOVQ DX, (R8)
	VZEROUPPER
	RET
  2041	
// bytes.Equal reports whether a and b have the same length and contents.
// func Equal(a, b []byte) bool
// Lengths are compared first (unequal lengths => false without touching
// the data); equal lengths tail-jump to memeqbody (SI=a, DI=b, BX=len,
// AX=&result).
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret		// different lengths: not equal
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX	// memeqbody writes the bool here
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET
  2054	
// fastrand1 returns a pseudo-random uint32.
// func fastrand1() uint32
// The generator state lives in g.m.fastrand (per-M, so no locking is
// needed). The update is a shift-and-conditionally-XOR step: the state
// is doubled, and $0x88888eef is XORed in only when the pre-shift value
// had its sign bit clear (CMOVLMI keeps the un-XORed value when the
// doubled value is negative). Not cryptographically secure.
TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX		// AX = g
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVL	m_fastrand(AX), DX	// DX = current state
	ADDL	DX, DX			// state <<= 1 (sets SF from old top bit)
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX			// if negative, keep the un-XORed value
	MOVL	DX, m_fastrand(AX)	// store new state
	MOVL	DX, ret+0(FP)
	RET
  2067	
// return0 sets the return register AX to zero.
// Note: MOVL (not XORL) is used so the condition flags are left
// untouched — NOTE(review): callers appear to rely on AX only, but the
// flag-preserving choice is kept as-is.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET
  2071	
  2072	
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
// Only AX and CX are clobbered, both caller-saved under the System V
// AMD64 ABI, so no registers need saving.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX		// AX = g
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVQ	m_curg(AX), AX		// AX = g.m.curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// return value in AX per C ABI
	RET
  2082	
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
// The leading NOP exists so that the return address (goexit+1) still
// lies inside goexit's code range; the trailing NOP keeps the PC pushed
// by the CALL within the range too. Do not remove either byte.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP
  2090	
// prefetcht0 issues a PREFETCHT0 hint (fetch into all cache levels)
// for the given address. Hint only; no architectural effect.
// func prefetcht0(addr uintptr)
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET
  2095	
// prefetcht1 issues a PREFETCHT1 hint (fetch into L2 and up)
// for the given address. Hint only; no architectural effect.
// func prefetcht1(addr uintptr)
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET
  2100	
// prefetcht2 issues a PREFETCHT2 hint (fetch into L3 and up)
// for the given address. Hint only; no architectural effect.
// func prefetcht2(addr uintptr)
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET
  2105	
// prefetchnta issues a PREFETCHNTA hint (non-temporal, minimize cache
// pollution) for the given address. Hint only; no architectural effect.
// func prefetchnta(addr uintptr)
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET
  2110	
// This is called from .init_array and follows the platform, not Go, ABI.
// Appends the moduledata in DI (first C-ABI argument) to the
// runtime·lastmoduledatap linked list. R15 is saved/restored because
// the Go assembler's global-variable accesses below implicitly use it,
// and it is callee-save in the platform ABI.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current list tail
	MOVQ	DI, moduledata_next(AX)		// tail.next = new moduledata
	MOVQ	DI, runtime·lastmoduledatap(SB)	// tail = new moduledata
	POPQ	R15
	RET

View as plain text