...
Run Format

Text file src/runtime/asm_amd64.s

Documentation: runtime

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "go_tls.h"
     7	#include "funcdata.h"
     8	#include "textflag.h"
     9	
    10	TEXT runtime·rt0_go(SB),NOSPLIT,$0
      		// Bootstrap entry. Saves argc (DI) and argv (SI), gives g0 provisional
      		// stack bounds, records CPUID feature flags, sets up TLS (or calls
      		// _cgo_init when present), links g0 and m0, runs check/args/osinit/
      		// schedinit, queues runtime·main through newproc and calls mstart.
    11		// copy arguments forward on an even stack
    12		MOVQ	DI, AX		// argc
    13		MOVQ	SI, BX		// argv
    14		SUBQ	$(4*8+7), SP		// 2args 2auto
    15		ANDQ	$~15, SP		// round SP down to a 16-byte boundary
    16		MOVQ	AX, 16(SP)
    17		MOVQ	BX, 24(SP)
    18		
    19		// create istack out of the given (operating system) stack.
    20		// _cgo_init may update stackguard.
    21		MOVQ	$runtime·g0(SB), DI
    22		LEAQ	(-64*1024+104)(SP), BX		// BX = SP - 64KB + 104, used as guard and stack.lo
    23		MOVQ	BX, g_stackguard0(DI)
    24		MOVQ	BX, g_stackguard1(DI)
    25		MOVQ	BX, (g_stack+stack_lo)(DI)
    26		MOVQ	SP, (g_stack+stack_hi)(DI)
    27	
    28		// find out information about the processor we're on
    29		MOVQ	$0, AX
    30		CPUID
    31		MOVQ	AX, SI		// SI = EAX returned by leaf 0; compared with 7 below
    32		CMPQ	AX, $0
    33		JE	nocpuinfo
    34	
    35		// Figure out how to serialize RDTSC.
    36		// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37		// Don't know about the rest, so let's do MFENCE.
    38		CMPL	BX, $0x756E6547  // "Genu"
    39		JNE	notintel
    40		CMPL	DX, $0x49656E69  // "ineI"
    41		JNE	notintel
    42		CMPL	CX, $0x6C65746E  // "ntel"
    43		JNE	notintel
    44		MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// all three vendor words matched
    45	notintel:
    46	
    47		// Load EAX=1 cpuid flags
    48		MOVQ	$1, AX
    49		CPUID
    50		MOVL	CX, runtime·cpuid_ecx(SB)
    51		MOVL	DX, runtime·cpuid_edx(SB)
    52	
    53		// Load EAX=7/ECX=0 cpuid flags
    54		CMPQ	SI, $7		// skip leaf 7 when the value saved from leaf 0 is below 7
    55		JLT	no7
    56		MOVL	$7, AX
    57		MOVL	$0, CX
    58		CPUID
    59		MOVL	BX, runtime·cpuid_ebx7(SB)
    60	no7:
    61		// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
    62		// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63		// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64		MOVL	runtime·cpuid_ecx(SB), CX
    65		ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66		CMPL    CX, $0x18000000
    67		JNE     noavx
    68		MOVL    $0, CX
    69		// For XGETBV, OSXSAVE bit is required and sufficient
    70		XGETBV
    71		ANDL    $6, AX
    72		CMPL    AX, $6 // Check for OS support of YMM registers
    73		JNE     noavx
    74		MOVB    $1, runtime·support_avx(SB)
    75		TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76		JEQ     noavx2
    77		MOVB    $1, runtime·support_avx2(SB)
    78		JMP     testbmi1
    79	noavx:
    80		MOVB    $0, runtime·support_avx(SB)
      		// falls through: without AVX, AVX2 is cleared as well
    81	noavx2:
    82		MOVB    $0, runtime·support_avx2(SB)
    83	testbmi1:
    84		// Detect BMI1 and BMI2 extensions as per
    85		// 5.1.16.1 Detection of VEX-encoded GPR Instructions,
    86		//   LZCNT and TZCNT, PREFETCHW chapter of [1]
    87		MOVB    $0, runtime·support_bmi1(SB)
    88		TESTL   $(1<<3), runtime·cpuid_ebx7(SB) // check for BMI1 bit
    89		JEQ     testbmi2
    90		MOVB    $1, runtime·support_bmi1(SB)
    91	testbmi2:
    92		MOVB    $0, runtime·support_bmi2(SB)
    93		TESTL   $(1<<8), runtime·cpuid_ebx7(SB) // check for BMI2 bit
    94		JEQ     nocpuinfo
    95		MOVB    $1, runtime·support_bmi2(SB)
    96	nocpuinfo:	
    97		
    98		// if there is an _cgo_init, call it.
    99		MOVQ	_cgo_init(SB), AX
   100		TESTQ	AX, AX
   101		JZ	needtls
   102		// g0 already in DI
   103		MOVQ	DI, CX	// Win64 uses CX for first parameter
   104		MOVQ	$setg_gcc<>(SB), SI	// second argument: address of setg_gcc
   105		CALL	AX
   106	
   107		// update stackguard after _cgo_init
   108		MOVQ	$runtime·g0(SB), CX
   109		MOVQ	(g_stack+stack_lo)(CX), AX
   110		ADDQ	$const__StackGuard, AX
   111		MOVQ	AX, g_stackguard0(CX)
   112		MOVQ	AX, g_stackguard1(CX)
   113	
   114	#ifndef GOOS_windows
      		// everywhere but Windows the settls path below is skipped after _cgo_init
   115		JMP ok
   116	#endif
   117	needtls:
   118	#ifdef GOOS_plan9
   119		// skip TLS setup on Plan 9
   120		JMP ok
   121	#endif
   122	#ifdef GOOS_solaris
   123		// skip TLS setup on Solaris
   124		JMP ok
   125	#endif
   126	
   127		LEAQ	runtime·m0+m_tls(SB), DI
   128		CALL	runtime·settls(SB)
   129	
   130		// store through it, to make sure it works
   131		get_tls(BX)
   132		MOVQ	$0x123, g(BX)
   133		MOVQ	runtime·m0+m_tls(SB), AX
   134		CMPQ	AX, $0x123
   135		JEQ 2(PC)		// value read back through m0.tls: skip the abort
   136		MOVL	AX, 0	// abort
   137	ok:
   138		// set the per-goroutine and per-mach "registers"
   139		get_tls(BX)
   140		LEAQ	runtime·g0(SB), CX
   141		MOVQ	CX, g(BX)
   142		LEAQ	runtime·m0(SB), AX
   143	
   144		// save m->g0 = g0
   145		MOVQ	CX, m_g0(AX)
   146		// save m0 to g0->m
   147		MOVQ	AX, g_m(CX)
   148	
   149		CLD				// convention is D is always left cleared
   150		CALL	runtime·check(SB)
   151	
   152		MOVL	16(SP), AX		// copy argc
   153		MOVL	AX, 0(SP)
   154		MOVQ	24(SP), AX		// copy argv
   155		MOVQ	AX, 8(SP)
   156		CALL	runtime·args(SB)
   157		CALL	runtime·osinit(SB)
   158		CALL	runtime·schedinit(SB)
   159	
   160		// create a new goroutine to start program
   161		MOVQ	$runtime·mainPC(SB), AX		// entry
   162		PUSHQ	AX
   163		PUSHQ	$0			// arg size
   164		CALL	runtime·newproc(SB)
   165		POPQ	AX		// discard the two words pushed for newproc
   166		POPQ	AX
   167	
   168		// start this M
   169		CALL	runtime·mstart(SB)
   170	
      		// only reached if mstart returns: store to address 0xf1 to fault
   171		MOVL	$0xf1, 0xf1  // crash
   172		RET
   173	
   174	DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// 8-byte word holding the address of runtime·main
   175	GLOBL	runtime·mainPC(SB),RODATA,$8	// read-only, 8 bytes; rt0_go pushes its address as newproc's entry argument
   176	
   177	TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   178		BYTE	$0xcc	// raw opcode byte 0xCC (the one-byte x86 INT3 breakpoint instruction)
   179		RET
   180	
   181	TEXT runtime·asminit(SB),NOSPLIT,$0-0
   182		// No per-thread init: the body is a bare RET.
   183		RET
   184	
   185	/*
   186	 *  go-routine
   187	 */
   188	
   189	// void gosave(Gobuf*)
   190	// save state in Gobuf; setjmp
      	// Stores the caller's SP, PC and BP and the current g into *buf and
      	// zeroes buf.ret. buf.ctxt is expected to be zero already; if it is
      	// not, badctxt is called.
   191	TEXT runtime·gosave(SB), NOSPLIT, $0-8
   192		MOVQ	buf+0(FP), AX		// gobuf
   193		LEAQ	buf+0(FP), BX		// caller's SP
   194		MOVQ	BX, gobuf_sp(AX)
   195		MOVQ	0(SP), BX		// caller's PC
   196		MOVQ	BX, gobuf_pc(AX)
   197		MOVQ	$0, gobuf_ret(AX)
   198		MOVQ	BP, gobuf_bp(AX)
   199		// Assert ctxt is zero. See func save.
   200		MOVQ	gobuf_ctxt(AX), BX
   201		TESTQ	BX, BX
   202		JZ	2(PC)		// ctxt == 0: skip the badctxt call
   203		CALL	runtime·badctxt(SB)
   204		get_tls(CX)
   205		MOVQ	g(CX), BX
   206		MOVQ	BX, gobuf_g(AX)		// buf.g = current g
   207		RET
   208	
   209	// void gogo(Gobuf*)
   210	// restore state from Gobuf; longjmp
      	// Makes buf.g the current g, reloads SP and BP from buf, loads buf.ret
      	// into AX and buf.ctxt into DX, zeroes those four buf fields and jumps
      	// to buf.pc. Control leaves through the final JMP, not a RET.
   211	TEXT runtime·gogo(SB), NOSPLIT, $16-8
   212		MOVQ	buf+0(FP), BX		// gobuf
   213	
   214		// If ctxt is not nil, invoke deletion barrier before overwriting.
   215		MOVQ	gobuf_ctxt(BX), AX
   216		TESTQ	AX, AX
   217		JZ	nilctxt
   218		LEAQ	gobuf_ctxt(BX), AX
   219		MOVQ	AX, 0(SP)		// first argument: address of buf.ctxt
   220		MOVQ	$0, 8(SP)		// second argument: 0
   221		CALL	runtime·writebarrierptr_prewrite(SB)
   222		MOVQ	buf+0(FP), BX		// reload gobuf after the call
   223	
   224	nilctxt:
   225		MOVQ	gobuf_g(BX), DX
   226		MOVQ	0(DX), CX		// make sure g != nil
   227		get_tls(CX)
   228		MOVQ	DX, g(CX)
   229		MOVQ	gobuf_sp(BX), SP	// restore SP
   230		MOVQ	gobuf_ret(BX), AX
   231		MOVQ	gobuf_ctxt(BX), DX
   232		MOVQ	gobuf_bp(BX), BP
   233		MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   234		MOVQ	$0, gobuf_ret(BX)
   235		MOVQ	$0, gobuf_ctxt(BX)
   236		MOVQ	$0, gobuf_bp(BX)
   237		MOVQ	gobuf_pc(BX), BX	// BX is reused for the target PC; buf is not needed past here
   238		JMP	BX
   239	
   240	// func mcall(fn func(*g))
   241	// Switch to m->g0's stack, call fn(g).
   242	// Fn must never return. It should gogo(&g->sched)
   243	// to keep running g.
      	// Saves the caller's PC, SP, BP and g into g->sched first. If the
      	// current g is already m->g0, control goes to badmcall; if fn returns,
      	// control goes to badmcall2.
   244	TEXT runtime·mcall(SB), NOSPLIT, $0-8
   245		MOVQ	fn+0(FP), DI
   246		
   247		get_tls(CX)
   248		MOVQ	g(CX), AX	// save state in g->sched
   249		MOVQ	0(SP), BX	// caller's PC
   250		MOVQ	BX, (g_sched+gobuf_pc)(AX)
   251		LEAQ	fn+0(FP), BX	// caller's SP
   252		MOVQ	BX, (g_sched+gobuf_sp)(AX)
   253		MOVQ	AX, (g_sched+gobuf_g)(AX)
   254		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   255	
   256		// switch to m->g0 & its stack, call fn
   257		MOVQ	g(CX), BX
   258		MOVQ	g_m(BX), BX
   259		MOVQ	m_g0(BX), SI
   260		CMPQ	SI, AX	// if g == m->g0 call badmcall
   261		JNE	3(PC)		// g != m->g0: skip the two-instruction badmcall jump
   262		MOVQ	$runtime·badmcall(SB), AX
   263		JMP	AX
   264		MOVQ	SI, g(CX)	// g = m->g0
   265		MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   266		PUSHQ	AX		// argument for fn: the g we switched away from
   267		MOVQ	DI, DX		// DX = fn value
   268		MOVQ	0(DI), DI	// DI = code pointer stored in the first word of fn
   269		CALL	DI
   270		POPQ	AX
   271		MOVQ	$runtime·badmcall2(SB), AX	// reached only if fn returned
   272		JMP	AX
   273		RET
   274	
   275	// systemstack_switch is a dummy routine that systemstack leaves at the bottom
   276	// of the G stack. We need to distinguish the routine that
   277	// lives at the bottom of the G stack from the one that lives
   278	// at the top of the system stack because the one at the top of
   279	// the system stack terminates the stack walk (see topofstack()).
      	// The body is a bare RET; systemstack stores this function's address
      	// into g->sched.pc before switching stacks.
   280	TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   281		RET
   282	
   283	// func systemstack(fn func())
      	// Runs fn on the m->g0 stack.
      	//   - current g is m->gsignal or m->g0: fn is called directly (noswitch).
      	//   - current g is m->curg: state is saved in g->sched, the stack is
      	//     switched to g0, fn is called, then the stack is switched back.
      	//   - any other g: badsystemstack is called and must not return.
   284	TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   285		MOVQ	fn+0(FP), DI	// DI = fn
   286		get_tls(CX)
   287		MOVQ	g(CX), AX	// AX = g
   288		MOVQ	g_m(AX), BX	// BX = m
   289	
   290		MOVQ	m_gsignal(BX), DX	// DX = gsignal
   291		CMPQ	AX, DX
   292		JEQ	noswitch
   293	
   294		MOVQ	m_g0(BX), DX	// DX = g0
   295		CMPQ	AX, DX
   296		JEQ	noswitch
   297	
   298		MOVQ	m_curg(BX), R8
   299		CMPQ	AX, R8
   300		JEQ	switch
   301		
   302		// Bad: g is not gsignal, not g0, not curg. What is it?
   303		MOVQ	$runtime·badsystemstack(SB), AX
   304		CALL	AX
      		// AX no longer holds g here, so falling through into switch would
      		// write g->sched fields relative to a code address. Trap instead,
      		// as morestack does after its bad* calls.
      		INT	$3
   305	
   306	switch:
   307		// save our state in g->sched. Pretend to
   308		// be systemstack_switch if the G stack is scanned.
   309		MOVQ	$runtime·systemstack_switch(SB), SI
   310		MOVQ	SI, (g_sched+gobuf_pc)(AX)
   311		MOVQ	SP, (g_sched+gobuf_sp)(AX)
   312		MOVQ	AX, (g_sched+gobuf_g)(AX)
   313		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   314	
   315		// switch to g0
   316		MOVQ	DX, g(CX)
   317		MOVQ	(g_sched+gobuf_sp)(DX), BX
   318		// make it look like mstart called systemstack on g0, to stop traceback
   319		SUBQ	$8, BX
   320		MOVQ	$runtime·mstart(SB), DX
   321		MOVQ	DX, 0(BX)
   322		MOVQ	BX, SP
   323	
   324		// call target function
   325		MOVQ	DI, DX		// DX = fn value
   326		MOVQ	0(DI), DI	// DI = code pointer stored in the first word of fn
   327		CALL	DI
   328	
   329		// switch back to g
   330		get_tls(CX)
   331		MOVQ	g(CX), AX
   332		MOVQ	g_m(AX), BX
   333		MOVQ	m_curg(BX), AX
   334		MOVQ	AX, g(CX)
   335		MOVQ	(g_sched+gobuf_sp)(AX), SP
   336		MOVQ	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP once it has been restored
   337		RET
   338	
   339	noswitch:
   340		// already on m stack, just call directly
   341		MOVQ	DI, DX
   342		MOVQ	0(DI), DI
   343		CALL	DI
   344		RET
   345	
   346	/*
   347	 * support for morestack
   348	 */
   349	
   350	// Called during function prolog when more stack is needed.
   351	//
   352	// The traceback routines see morestack on a g0 as being
   353	// the top of a stack (for example, morestack calling newstack
   354	// calling the scheduler calling newm calling gc), so we must
   355	// record an argument size. For that purpose, it has no arguments.
      	//
      	// Records f's caller in m->morebuf and f's own context in g->sched,
      	// switches to m->g0 and calls newstack with DX pushed as its ctxt
      	// argument. Traps if the current g is m->g0 or m->gsignal.
   356	TEXT runtime·morestack(SB),NOSPLIT,$0-0
   357		// Cannot grow scheduler stack (m->g0).
   358		get_tls(CX)
   359		MOVQ	g(CX), BX
   360		MOVQ	g_m(BX), BX
   361		MOVQ	m_g0(BX), SI
   362		CMPQ	g(CX), SI
   363		JNE	3(PC)		// not g0: skip the call and trap
   364		CALL	runtime·badmorestackg0(SB)
   365		INT	$3
   366	
   367		// Cannot grow signal stack (m->gsignal).
   368		MOVQ	m_gsignal(BX), SI
   369		CMPQ	g(CX), SI
   370		JNE	3(PC)		// not gsignal: skip the call and trap
   371		CALL	runtime·badmorestackgsignal(SB)
   372		INT	$3
   373	
   374		// Called from f.
   375		// Set m->morebuf to f's caller.
   376		MOVQ	8(SP), AX	// f's caller's PC
   377		MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   378		LEAQ	16(SP), AX	// f's caller's SP
   379		MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   380		get_tls(CX)
   381		MOVQ	g(CX), SI
   382		MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   383	
   384		// Set g->sched to context in f.
   385		MOVQ	0(SP), AX // f's PC
   386		MOVQ	AX, (g_sched+gobuf_pc)(SI)
   387		MOVQ	SI, (g_sched+gobuf_g)(SI)
   388		LEAQ	8(SP), AX // f's SP
   389		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   390		MOVQ	BP, (g_sched+gobuf_bp)(SI)
   391		// newstack will fill gobuf.ctxt.
   392	
   393		// Call newstack on m->g0's stack.
   394		MOVQ	m_g0(BX), BX
   395		MOVQ	BX, g(CX)
   396		MOVQ	(g_sched+gobuf_sp)(BX), SP
   397		PUSHQ	DX	// ctxt argument
   398		CALL	runtime·newstack(SB)
   399		MOVQ	$0, 0x1003	// crash if newstack returns
   400		POPQ	DX	// keep balance check happy
   401		RET
   402	
   403	// morestack but not preserving ctxt.
      	// Zeroes DX (the register morestack pushes as newstack's ctxt argument)
      	// and tail-jumps to morestack.
   404	TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   405		MOVL	$0, DX
   406		JMP	runtime·morestack(SB)
   407	
   408	TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   409		// We came here via a RET to an overwritten return PC.
   410		// AX may be live. Other registers are available.
      		// Looks up entry g.stkbar[g.stkbarPos], checks that its saved slot
      		// address matches the current SP, advances stkbarPos and jumps to
      		// the saved original return PC.
   411	
   412		// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   413		get_tls(CX)
   414		MOVQ	g(CX), CX
   415		MOVQ	(g_stkbar+slice_array)(CX), DX
   416		MOVQ	g_stkbarPos(CX), BX
   417		IMULQ	$stkbar__size, BX	// Too big for SIB.
   418		MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   419		MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   420		// Assert that we're popping the right saved LR.
   421		ADDQ	$8, R8		// savedLRPtr + 8 must equal the current SP
   422		CMPQ	R8, SP
   423		JEQ	2(PC)
   424		MOVL	$0, 0		// mismatch: store to address 0 to fault
   425		// Record that this stack barrier was hit.
   426		ADDQ	$1, g_stkbarPos(CX)
   427		// Jump to the original return PC.
   428		JMP	BX
   429	
   430	// reflectcall: call a function with the given argument list
   431	// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   432	// we don't have variable-sized frames, so we use a small number
   433	// of constant-sized-frame functions to encode a few bits of size in the pc.
   434	// Caution: ugly multiline assembly macros in your future!
   435	
      	// DISPATCH(NAME,MAXSIZE): if CX <= MAXSIZE (unsigned compare), jump to
      	// NAME; otherwise JA 3(PC) skips the two-instruction jump and execution
      	// continues with whatever follows the macro.
   436	#define DISPATCH(NAME,MAXSIZE)		\
   437		CMPQ	CX, $MAXSIZE;		\
   438		JA	3(PC);			\
   439		MOVQ	$NAME(SB), AX;		\
   440		JMP	AX
   441	// Note: can't just "JMP NAME(SB)" - bad inlining results.
   442	
   443	TEXT reflect·call(SB), NOSPLIT, $0-0
   444		JMP	·reflectcall(SB)	// reflect·call is only a jump to ·reflectcall; no frame of its own ($0-0)
   445	
   446	TEXT ·reflectcall(SB), NOSPLIT, $0-32
      		// Loads argsize (zero-extended 32-bit) into CX and jumps to the first
      		// callN whose N is >= argsize. Sizes above 1073741824 fall through
      		// every DISPATCH and reach badreflectcall.
   447		MOVLQZX argsize+24(FP), CX
   448		DISPATCH(runtime·call32, 32)
   449		DISPATCH(runtime·call64, 64)
   450		DISPATCH(runtime·call128, 128)
   451		DISPATCH(runtime·call256, 256)
   452		DISPATCH(runtime·call512, 512)
   453		DISPATCH(runtime·call1024, 1024)
   454		DISPATCH(runtime·call2048, 2048)
   455		DISPATCH(runtime·call4096, 4096)
   456		DISPATCH(runtime·call8192, 8192)
   457		DISPATCH(runtime·call16384, 16384)
   458		DISPATCH(runtime·call32768, 32768)
   459		DISPATCH(runtime·call65536, 65536)
   460		DISPATCH(runtime·call131072, 131072)
   461		DISPATCH(runtime·call262144, 262144)
   462		DISPATCH(runtime·call524288, 524288)
   463		DISPATCH(runtime·call1048576, 1048576)
   464		DISPATCH(runtime·call2097152, 2097152)
   465		DISPATCH(runtime·call4194304, 4194304)
   466		DISPATCH(runtime·call8388608, 8388608)
   467		DISPATCH(runtime·call16777216, 16777216)
   468		DISPATCH(runtime·call33554432, 33554432)
   469		DISPATCH(runtime·call67108864, 67108864)
   470		DISPATCH(runtime·call134217728, 134217728)
   471		DISPATCH(runtime·call268435456, 268435456)
   472		DISPATCH(runtime·call536870912, 536870912)
   473		DISPATCH(runtime·call1073741824, 1073741824)
   474		MOVQ	$runtime·badreflectcall(SB), AX
   475		JMP	AX
   476	
   477	#define CALLFN(NAME,MAXSIZE)			\
   478	TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   479		NO_LOCAL_POINTERS;			\
   480		/* copy argsize bytes from argptr to the bottom of this frame (SP) */		\
   481		MOVQ	argptr+16(FP), SI;		\
   482		MOVLQZX argsize+24(FP), CX;		\
   483		MOVQ	SP, DI;				\
   484		REP;MOVSB;				\
   485		/* call function: DX = f, call through its first word */			\
   486		MOVQ	f+8(FP), DX;			\
   487		PCDATA  $PCDATA_StackMapIndex, $0;	\
   488		CALL	(DX);				\
   489		/* copy return values back: hand callRet argtype in DX, dst = argptr+retoffset in DI, src = SP+retoffset in SI, size = argsize-retoffset in CX */		\
   490		MOVQ	argtype+0(FP), DX;		\
   491		MOVQ	argptr+16(FP), DI;		\
   492		MOVLQZX	argsize+24(FP), CX;		\
   493		MOVLQZX	retoffset+28(FP), BX;		\
   494		MOVQ	SP, SI;				\
   495		ADDQ	BX, DI;				\
   496		ADDQ	BX, SI;				\
   497		SUBQ	BX, CX;				\
   498		CALL	callRet<>(SB);			\
   499		RET
   500	
   501	// callRet copies return values back at the end of call*. This is a
   502	// separate function so it can allocate stack space for the arguments
   503	// to reflectcallmove. It does not follow the Go ABI; it expects its
   504	// arguments in registers.
      	// Register inputs, stored to the outgoing argument slots in this order:
      	// DX -> 0(SP), DI -> 8(SP), SI -> 16(SP), CX -> 24(SP).
   505	TEXT callRet<>(SB), NOSPLIT, $32-0
   506		NO_LOCAL_POINTERS
   507		MOVQ	DX, 0(SP)
   508		MOVQ	DI, 8(SP)
   509		MOVQ	SI, 16(SP)
   510		MOVQ	CX, 24(SP)
   511		CALL	runtime·reflectcallmove(SB)
   512		RET
   513	
   514	CALLFN(·call32, 32)
      	// This run of CALLFN lines defines one fixed-frame callN function per
      	// power-of-two size from 32 to 1073741824 bytes, matching the DISPATCH
      	// chain in reflectcall.
   515	CALLFN(·call64, 64)
   516	CALLFN(·call128, 128)
   517	CALLFN(·call256, 256)
   518	CALLFN(·call512, 512)
   519	CALLFN(·call1024, 1024)
   520	CALLFN(·call2048, 2048)
   521	CALLFN(·call4096, 4096)
   522	CALLFN(·call8192, 8192)
   523	CALLFN(·call16384, 16384)
   524	CALLFN(·call32768, 32768)
   525	CALLFN(·call65536, 65536)
   526	CALLFN(·call131072, 131072)
   527	CALLFN(·call262144, 262144)
   528	CALLFN(·call524288, 524288)
   529	CALLFN(·call1048576, 1048576)
   530	CALLFN(·call2097152, 2097152)
   531	CALLFN(·call4194304, 4194304)
   532	CALLFN(·call8388608, 8388608)
   533	CALLFN(·call16777216, 16777216)
   534	CALLFN(·call33554432, 33554432)
   535	CALLFN(·call67108864, 67108864)
   536	CALLFN(·call134217728, 134217728)
   537	CALLFN(·call268435456, 268435456)
   538	CALLFN(·call536870912, 536870912)
   539	CALLFN(·call1073741824, 1073741824)
   540	
   541	TEXT runtime·procyield(SB),NOSPLIT,$0-0
   542		MOVL	cycles+0(FP), AX	// AX = number of PAUSE iterations still to run
   543	spin:
   544		PAUSE
   545		DECL	AX			// count down; ZF is set when AX reaches 0
   546		JNZ	spin
   547		RET
   548	
   549	
   550	TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   551		// Stores are already ordered on x86, so this is just a
   552		// compile barrier: no instruction other than RET is emitted.
   553		RET
   554	
   555	// void jmpdefer(fn, sp);
   556	// called from deferreturn.
   557	// 1. pop the caller
   558	// 2. sub 5 bytes from the callers return
   559	// 3. jmp to the argument
      	// The 5 bytes are presumably the length of the CALL instruction that
      	// invoked deferreturn, so the adjusted return PC re-executes that CALL
      	// (see the "return to CALL again" note below) — TODO confirm encoding.
   560	TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   561		MOVQ	fv+0(FP), DX	// fn
   562		MOVQ	argp+8(FP), BX	// caller sp
   563		LEAQ	-8(BX), SP	// caller sp after CALL
   564		MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   565		SUBQ	$5, (SP)	// return to CALL again
   566		MOVQ	0(DX), BX	// BX = code pointer stored in the first word of fn
   567		JMP	BX	// but first run the deferred function
   568	
   569	// Save state of caller into g->sched. Smashes R8, R9.
      	// Records the caller's PC, SP and BP in the current g's sched buffer
      	// and zeroes sched.ret. sched.ctxt is expected to be zero already; if
      	// it is not, badctxt is called.
   570	TEXT gosave<>(SB),NOSPLIT,$0
   571		get_tls(R8)
   572		MOVQ	g(R8), R8
   573		MOVQ	0(SP), R9		// return address = caller's PC
   574		MOVQ	R9, (g_sched+gobuf_pc)(R8)
   575		LEAQ	8(SP), R9		// SP as it will be after this function returns
   576		MOVQ	R9, (g_sched+gobuf_sp)(R8)
   577		MOVQ	$0, (g_sched+gobuf_ret)(R8)
   578		MOVQ	BP, (g_sched+gobuf_bp)(R8)
   579		// Assert ctxt is zero. See func save.
   580		MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   581		TESTQ	R9, R9
   582		JZ	2(PC)		// ctxt == 0: skip the badctxt call
   583		CALL	runtime·badctxt(SB)
   584		RET
   585	
   586	// func asmcgocall(fn, arg unsafe.Pointer) int32
   587	// Call fn(arg) on the scheduler stack,
   588	// aligned appropriately for the gcc ABI.
   589	// See cgocall.go for more details.
      	// arg is passed in both DI and CX. The low 32 bits of AX after the call
      	// become the result. When there is no g, or the current g is already
      	// m->g0 or m->gsignal, the call is made on the current stack (nosave).
   590	TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   591		MOVQ	fn+0(FP), AX
   592		MOVQ	arg+8(FP), BX
   593	
   594		MOVQ	SP, DX		// DX = SP on entry, used to restore the stack after the call
   595	
   596		// Figure out if we need to switch to m->g0 stack.
   597		// We get called to create new OS threads too, and those
   598		// come in on the m->g0 stack already.
   599		get_tls(CX)
   600		MOVQ	g(CX), R8
   601		CMPQ	R8, $0
   602		JEQ	nosave
   603		MOVQ	g_m(R8), R8
   604		MOVQ	m_g0(R8), SI
   605		MOVQ	g(CX), DI
   606		CMPQ	SI, DI
   607		JEQ	nosave
   608		MOVQ	m_gsignal(R8), SI
   609		CMPQ	SI, DI
   610		JEQ	nosave
   611		
   612		// Switch to system stack.
   613		MOVQ	m_g0(R8), SI
   614		CALL	gosave<>(SB)
   615		MOVQ	SI, g(CX)
   616		MOVQ	(g_sched+gobuf_sp)(SI), SP
   617	
   618		// Now on a scheduling stack (a pthread-created stack).
   619		// Make sure we have enough room for 4 stack-backed fast-call
   620		// registers as per windows amd64 calling convention.
   621		SUBQ	$64, SP
   622		ANDQ	$~15, SP	// alignment for gcc ABI
   623		MOVQ	DI, 48(SP)	// save g
   624		MOVQ	(g_stack+stack_hi)(DI), DI
   625		SUBQ	DX, DI
   626		MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   627		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   628		MOVQ	BX, CX		// CX = first argument in Win64
   629		CALL	AX
   630	
   631		// Restore registers, g, stack pointer.
   632		get_tls(CX)
   633		MOVQ	48(SP), DI
   634		MOVQ	(g_stack+stack_hi)(DI), SI
   635		SUBQ	40(SP), SI	// SI = stack.hi - saved depth = original SP
   636		MOVQ	DI, g(CX)
   637		MOVQ	SI, SP
   638	
   639		MOVL	AX, ret+16(FP)
   640		RET
   641	
   642	nosave:
   643		// Running on a system stack, perhaps even without a g.
   644		// Having no g can happen during thread creation or thread teardown
   645		// (see needm/dropm on Solaris, for example).
   646		// This code is like the above sequence but without saving/restoring g
   647		// and without worrying about the stack moving out from under us
   648		// (because we're on a system stack, not a goroutine stack).
   649		// The above code could be used directly if already on a system stack,
   650		// but then the only path through this code would be a rare case on Solaris.
   651		// Using this code for all "already on system stack" calls exercises it more,
   652		// which should help keep it correct.
   653		SUBQ	$64, SP
   654		ANDQ	$~15, SP
   655		MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   656		MOVQ	DX, 40(SP)	// save original stack pointer
   657		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   658		MOVQ	BX, CX		// CX = first argument in Win64
   659		CALL	AX
   660		MOVQ	40(SP), SI	// restore original stack pointer
   661		MOVQ	SI, SP
   662		MOVL	AX, ret+16(FP)
   663		RET
   664	
   665	// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   666	// Turn the fn into a Go func (by taking its address) and call
   667	// cgocallback_gofunc.
      	// The other three arguments are copied through unchanged. The call goes
      	// through AX rather than naming cgocallback_gofunc directly.
   668	TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   669		LEAQ	fn+0(FP), AX		// address of the fn argument slot, not fn itself
   670		MOVQ	AX, 0(SP)
   671		MOVQ	frame+8(FP), AX
   672		MOVQ	AX, 8(SP)
   673		MOVQ	framesize+16(FP), AX
   674		MOVQ	AX, 16(SP)
   675		MOVQ	ctxt+24(FP), AX
   676		MOVQ	AX, 24(SP)
   677		MOVQ	$runtime·cgocallback_gofunc(SB), AX
   678		CALL	AX
   679		RET
   680	
   681	// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   682	// See cgocall.go for more details.
      	// Obtains an m via needm when there is no g, switches from m->g0 to
      	// m->curg, calls cgocallbackg with ctxt at 0(SP), switches back to
      	// m->g0 and calls dropm if the m was borrowed. R8 carries the m found
      	// on entry (0 when none) and is kept at 8(SP) across cgocallbackg.
   683	TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   684		NO_LOCAL_POINTERS
   685	
   686		// If g is nil, Go did not create the current thread.
   687		// Call needm to obtain one m for temporary use.
   688		// In this case, we're running on the thread stack, so there's
   689		// lots of space, but the linker doesn't know. Hide the call from
   690		// the linker analysis by using an indirect call through AX.
   691		get_tls(CX)
   692	#ifdef GOOS_windows
      		// On Windows, treat a zero TLS base in CX as "no g": BX = 0 and the
      		// g load below is skipped.
   693		MOVL	$0, BX
   694		CMPQ	CX, $0
   695		JEQ	2(PC)
   696	#endif
   697		MOVQ	g(CX), BX
   698		CMPQ	BX, $0
   699		JEQ	needm
   700		MOVQ	g_m(BX), BX
   701		MOVQ	BX, R8 // holds oldm until end of function
   702		JMP	havem
   703	needm:
   704		MOVQ	$0, 0(SP)
   705		MOVQ	$runtime·needm(SB), AX
   706		CALL	AX
   707		MOVQ	0(SP), R8	// presumably still the 0 stored above, marking "no m on entry" — see the dropm check at the end
   708		get_tls(CX)
   709		MOVQ	g(CX), BX
   710		MOVQ	g_m(BX), BX
   711		
   712		// Set m->sched.sp = SP, so that if a panic happens
   713		// during the function we are about to execute, it will
   714		// have a valid SP to run on the g0 stack.
   715		// The next few lines (after the havem label)
   716		// will save this SP onto the stack and then write
   717		// the same SP back to m->sched.sp. That seems redundant,
   718		// but if an unrecovered panic happens, unwindm will
   719		// restore the g->sched.sp from the stack location
   720		// and then systemstack will try to use it. If we don't set it here,
   721		// that restored SP will be uninitialized (typically 0) and
   722		// will not be usable.
   723		MOVQ	m_g0(BX), SI
   724		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   725	
   726	havem:
   727		// Now there's a valid m, and we're running on its m->g0.
   728		// Save current m->g0->sched.sp on stack and then set it to SP.
   729		// Save current sp in m->g0->sched.sp in preparation for
   730		// switch back to m->curg stack.
   731		// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   732		MOVQ	m_g0(BX), SI
   733		MOVQ	(g_sched+gobuf_sp)(SI), AX
   734		MOVQ	AX, 0(SP)
   735		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   736	
   737		// Switch to m->curg stack and call runtime.cgocallbackg.
   738		// Because we are taking over the execution of m->curg
   739		// but *not* resuming what had been running, we need to
   740		// save that information (m->curg->sched) so we can restore it.
   741		// We can restore m->curg->sched.sp easily, because calling
   742		// runtime.cgocallbackg leaves SP unchanged upon return.
   743		// To save m->curg->sched.pc, we push it onto the stack.
   744		// This has the added benefit that it looks to the traceback
   745		// routine like cgocallbackg is going to return to that
   746		// PC (because the frame we allocate below has the same
   747		// size as cgocallback_gofunc's frame declared above)
   748		// so that the traceback will seamlessly trace back into
   749		// the earlier calls.
   750		//
   751		// In the new goroutine, 8(SP) holds the saved R8.
   752		MOVQ	m_curg(BX), SI
   753		MOVQ	SI, g(CX)
   754		MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   755		MOVQ	(g_sched+gobuf_pc)(SI), BX
   756		MOVQ	BX, -8(DI)
   757		// Compute the size of the frame, including return PC and, if
   758		// GOEXPERIMENT=framepointer, the saved base pointer
   759		MOVQ	ctxt+24(FP), BX
   760		LEAQ	fv+0(FP), AX
   761		SUBQ	SP, AX		// AX = distance from SP up to the first argument
   762		SUBQ	AX, DI		// reserve the same distance below curg's saved SP
   763		MOVQ	DI, SP
   764	
   765		MOVQ	R8, 8(SP)
   766		MOVQ	BX, 0(SP)	// ctxt is the argument at 0(SP) for cgocallbackg
   767		CALL	runtime·cgocallbackg(SB)
   768		MOVQ	8(SP), R8
   769	
   770		// Compute the size of the frame again. FP and SP have
   771		// completely different values here than they did above,
   772		// but only their difference matters.
   773		LEAQ	fv+0(FP), AX
   774		SUBQ	SP, AX
   775	
   776		// Restore g->sched (== m->curg->sched) from saved values.
   777		get_tls(CX)
   778		MOVQ	g(CX), SI
   779		MOVQ	SP, DI
   780		ADDQ	AX, DI
   781		MOVQ	-8(DI), BX
   782		MOVQ	BX, (g_sched+gobuf_pc)(SI)
   783		MOVQ	DI, (g_sched+gobuf_sp)(SI)
   784	
   785		// Switch back to m->g0's stack and restore m->g0->sched.sp.
   786		// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   787		// so we do not have to restore it.)
   788		MOVQ	g(CX), BX
   789		MOVQ	g_m(BX), BX
   790		MOVQ	m_g0(BX), SI
   791		MOVQ	SI, g(CX)
   792		MOVQ	(g_sched+gobuf_sp)(SI), SP
   793		MOVQ	0(SP), AX
   794		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   795		
   796		// If the m on entry was nil, we called needm above to borrow an m
   797		// for the duration of the call. Since the call is over, return it with dropm.
   798		CMPQ	R8, $0
   799		JNE 3(PC)		// there was an m on entry: skip the dropm call
   800		MOVQ	$runtime·dropm(SB), AX
   801		CALL	AX
   802	
   803		// Done!
   804		RET
   805	
   806	// void setg(G*); set g. for use by needm.
      	// On Windows the word at 0x28(GS) is updated first: cleared when gg is
      	// nil (returning without touching g), otherwise set to &gg->m->tls.
   807	TEXT runtime·setg(SB), NOSPLIT, $0-8
   808		MOVQ	gg+0(FP), BX
   809	#ifdef GOOS_windows
   810		CMPQ	BX, $0
   811		JNE	settls
   812		MOVQ	$0, 0x28(GS)
   813		RET
   814	settls:
   815		MOVQ	g_m(BX), AX
   816		LEAQ	m_tls(AX), AX
   817		MOVQ	AX, 0x28(GS)
   818	#endif
   819		get_tls(CX)
   820		MOVQ	BX, g(CX)
   821		RET
   822	
   823	// void setg_gcc(G*); set g called from gcc.
      	// Takes the new g in DI (a register argument, not a Go stack argument)
      	// and stores it into the TLS g slot. Uses AX as scratch.
   824	TEXT setg_gcc<>(SB),NOSPLIT,$0
   825		get_tls(AX)
   826		MOVQ	DI, g(AX)
   827		RET
   828	
   829	// check that SP is strictly inside (g->stack.lo, g->stack.hi); both
      	// comparisons are strict, so SP == stack.lo or SP >= stack.hi traps.
   830	TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   831		get_tls(CX)
   832		MOVQ	g(CX), AX
   833		CMPQ	(g_stack+stack_hi)(AX), SP
   834		JHI	2(PC)		// stack.hi > SP: skip the trap
   835		INT	$3
   836		CMPQ	SP, (g_stack+stack_lo)(AX)
   837		JHI	2(PC)		// SP > stack.lo: skip the trap
   838		INT	$3
   839		RET
   840	
   841	TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
      		// Returns the word stored just below argp. If that word equals
      		// stackBarrierPC, the value produced by nextBarrierPC is returned
      		// instead.
   842		MOVQ	argp+0(FP),AX		// addr of first arg
   843		MOVQ	-8(AX),AX		// get calling pc
   844		CMPQ	AX, runtime·stackBarrierPC(SB)
   845		JNE	nobar
   846		// Get original return PC.
   847		CALL	runtime·nextBarrierPC(SB)
   848		MOVQ	0(SP), AX		// nextBarrierPC's result is read from 0(SP)
   849	nobar:
   850		MOVQ	AX, ret+8(FP)
   851		RET
   852	
   853	TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
      		// Overwrites the word just below argp with pc. If that word currently
      		// equals stackBarrierPC it is left in place and pc is handed to
      		// setNextBarrierPC instead.
   854		MOVQ	argp+0(FP),AX		// addr of first arg
   855		MOVQ	pc+8(FP), BX
   856		MOVQ	-8(AX), CX
   857		CMPQ	CX, runtime·stackBarrierPC(SB)
   858		JEQ	setbar
   859		MOVQ	BX, -8(AX)		// set calling pc
   860		RET
   861	setbar:
   862		// Set the stack barrier return PC.
   863		MOVQ	BX, 0(SP)		// pc is the argument at 0(SP) for setNextBarrierPC
   864		CALL	runtime·setNextBarrierPC(SB)
   865		RET
   866	
   867	// func cputicks() int64
      	// Returns the RDTSC counter as one 64-bit value. The read is preceded
      	// by LFENCE when runtime·lfenceBeforeRdtsc is 1 and by MFENCE otherwise.
   868	TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   869		CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   870		JEQ	lfence
   871		MFENCE
   872		JMP	rdtsc
   873	lfence:
   874		LFENCE
   875	rdtsc:
   876		RDTSC
   877		SHLQ	$32, DX			// move the high half into bits 63..32
   878		ORQ	DX, AX			// halves do not overlap, so OR combines them
   879		MOVQ	AX, ret+0(FP)
   880		RET
   881	
   882	// memhash_varlen(p unsafe.Pointer, h seed) uintptr
   883	// redirects to memhash(p, h, size) using the size
   884	// stored in the closure.
      	// The size is read from offset 8 of the object DX points to on entry.
   885	TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   886		GO_ARGS
   887		NO_LOCAL_POINTERS
   888		MOVQ	p+0(FP), AX
   889		MOVQ	h+8(FP), BX
   890		MOVQ	8(DX), CX		// size, from the closure in DX
   891		MOVQ	AX, 0(SP)
   892		MOVQ	BX, 8(SP)
   893		MOVQ	CX, 16(SP)
   894		CALL	runtime·memhash(SB)
   895		MOVQ	24(SP), AX		// memhash's result slot follows its three arguments
   896		MOVQ	AX, ret+16(FP)
   897		RET
   898	
   899	// hash function using AES hardware instructions
      	// Sets up aeshashbody's register inputs (AX = data pointer, CX = size,
      	// DX = address of the result slot) and jumps to it. aeshashbody reads
      	// the seed from h+8(FP) in this same frame.
   900	TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   901		MOVQ	p+0(FP), AX	// ptr to data
   902		MOVQ	s+16(FP), CX	// size
   903		LEAQ	ret+24(FP), DX
   904		JMP	runtime·aeshashbody(SB)
   905	
   906	TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
      		// String variant of aeshash: p points at a string header, from which
      		// the data pointer (offset 0) and length (offset 8) are loaded before
      		// jumping to aeshashbody with DX = address of the result slot.
   907		MOVQ	p+0(FP), AX	// ptr to string struct
   908		MOVQ	8(AX), CX	// length of string
   909		MOVQ	(AX), AX	// string data
   910		LEAQ	ret+16(FP), DX
   911		JMP	runtime·aeshashbody(SB)
   912	
   913	// AX: data
   914	// CX: length
   915	// DX: address to put return value
      	//
      	// aeshashbody is reached with JMP (not CALL) from the hash entry points.
      	// It reads the seed from h+8(FP), combines it with the length and
      	// runtime·aeskeysched, selects a code path by length, and stores the
      	// low 64 bits of the final XMM register to (DX).
   916	TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   917		// Fill an SSE register with our seeds.
   918		MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   919		PINSRW	$4, CX, X0			// 16 bits of length
   920		PSHUFHW $0, X0, X0			// repeat length 4 times total
   921		MOVO	X0, X1				// save unscrambled seed
   922		PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   923		AESENC	X0, X0				// scramble seed
   924	
      		// dispatch on length: 0-15, 16, 17-32, 33-64, 65-128, 129+
   925		CMPQ	CX, $16
   926		JB	aes0to15
   927		JE	aes16
   928		CMPQ	CX, $32
   929		JBE	aes17to32
   930		CMPQ	CX, $64
   931		JBE	aes33to64
   932		CMPQ	CX, $128
   933		JBE	aes65to128
   934		JMP	aes129plus
   935	
   936	aes0to15:
   937		TESTQ	CX, CX
   938		JE	aes0
   939	
      		// If bits 4-11 of data+16 are all zero, a 16-byte load at data
      		// could run into the next page; take the endofpage path instead.
   940		ADDQ	$16, AX
   941		TESTW	$0xff0, AX
   942		JE	endofpage
   943	
   944		// 16 bytes loaded at this address won't cross
   945		// a page boundary, so we can load it directly.
   946		MOVOU	-16(AX), X1
   947		ADDQ	CX, CX	// CX = 2*length, so (AX)(CX*8) below is entry length*16
   948		MOVQ	$masks<>(SB), AX
   949		PAND	(AX)(CX*8), X1	// keep only the low `length` bytes
   950	final1:
   951		PXOR	X0, X1	// xor data with seed
   952		AESENC	X1, X1	// scramble combo 3 times
   953		AESENC	X1, X1
   954		AESENC	X1, X1
   955		MOVQ	X1, (DX)
   956		RET
   957	
   958	endofpage:
   959		// address ends in 1111xxxx. Might be up against
   960		// a page boundary, so load ending at last byte.
   961		// Then shift bytes down using pshufb.
      		// AX is data+16 here, so -32(AX)(CX*1) is data+length-16.
   962		MOVOU	-32(AX)(CX*1), X1
   963		ADDQ	CX, CX	// CX = 2*length, so (AX)(CX*8) below is entry length*16
   964		MOVQ	$shifts<>(SB), AX
   965		PSHUFB	(AX)(CX*8), X1
   966		JMP	final1
   967	
   968	aes0:
   969		// Return scrambled input seed
   970		AESENC	X0, X0
   971		MOVQ	X0, (DX)
   972		RET
   973	
   974	aes16:
   975		MOVOU	(AX), X1
   976		JMP	final1
   977	
   978	aes17to32:
   979		// make second starting seed
   980		PXOR	runtime·aeskeysched+16(SB), X1
   981		AESENC	X1, X1
   982		
   983		// load data to be hashed
      		// (first 16 and last 16 bytes; they overlap when length < 32)
   984		MOVOU	(AX), X2
   985		MOVOU	-16(AX)(CX*1), X3
   986	
   987		// xor with seed
   988		PXOR	X0, X2
   989		PXOR	X1, X3
   990	
   991		// scramble 3 times
   992		AESENC	X2, X2
   993		AESENC	X3, X3
   994		AESENC	X2, X2
   995		AESENC	X3, X3
   996		AESENC	X2, X2
   997		AESENC	X3, X3
   998	
   999		// combine results
  1000		PXOR	X3, X2
  1001		MOVQ	X2, (DX)
  1002		RET
  1003	
  1004	aes33to64:
  1005		// make 3 more starting seeds
  1006		MOVO	X1, X2
  1007		MOVO	X1, X3
  1008		PXOR	runtime·aeskeysched+16(SB), X1
  1009		PXOR	runtime·aeskeysched+32(SB), X2
  1010		PXOR	runtime·aeskeysched+48(SB), X3
  1011		AESENC	X1, X1
  1012		AESENC	X2, X2
  1013		AESENC	X3, X3
  1014		
      		// load first 32 and last 32 bytes (they overlap when length < 64)
  1015		MOVOU	(AX), X4
  1016		MOVOU	16(AX), X5
  1017		MOVOU	-32(AX)(CX*1), X6
  1018		MOVOU	-16(AX)(CX*1), X7
  1019	
      		// xor with seeds
  1020		PXOR	X0, X4
  1021		PXOR	X1, X5
  1022		PXOR	X2, X6
  1023		PXOR	X3, X7
  1024		
      		// scramble 3 times
  1025		AESENC	X4, X4
  1026		AESENC	X5, X5
  1027		AESENC	X6, X6
  1028		AESENC	X7, X7
  1029		
  1030		AESENC	X4, X4
  1031		AESENC	X5, X5
  1032		AESENC	X6, X6
  1033		AESENC	X7, X7
  1034		
  1035		AESENC	X4, X4
  1036		AESENC	X5, X5
  1037		AESENC	X6, X6
  1038		AESENC	X7, X7
  1039	
      		// combine results
  1040		PXOR	X6, X4
  1041		PXOR	X7, X5
  1042		PXOR	X5, X4
  1043		MOVQ	X4, (DX)
  1044		RET
  1045	
  1046	aes65to128:
  1047		// make 7 more starting seeds
  1048		MOVO	X1, X2
  1049		MOVO	X1, X3
  1050		MOVO	X1, X4
  1051		MOVO	X1, X5
  1052		MOVO	X1, X6
  1053		MOVO	X1, X7
  1054		PXOR	runtime·aeskeysched+16(SB), X1
  1055		PXOR	runtime·aeskeysched+32(SB), X2
  1056		PXOR	runtime·aeskeysched+48(SB), X3
  1057		PXOR	runtime·aeskeysched+64(SB), X4
  1058		PXOR	runtime·aeskeysched+80(SB), X5
  1059		PXOR	runtime·aeskeysched+96(SB), X6
  1060		PXOR	runtime·aeskeysched+112(SB), X7
  1061		AESENC	X1, X1
  1062		AESENC	X2, X2
  1063		AESENC	X3, X3
  1064		AESENC	X4, X4
  1065		AESENC	X5, X5
  1066		AESENC	X6, X6
  1067		AESENC	X7, X7
  1068	
  1069		// load data
      		// (first 64 and last 64 bytes; they overlap when length < 128)
  1070		MOVOU	(AX), X8
  1071		MOVOU	16(AX), X9
  1072		MOVOU	32(AX), X10
  1073		MOVOU	48(AX), X11
  1074		MOVOU	-64(AX)(CX*1), X12
  1075		MOVOU	-48(AX)(CX*1), X13
  1076		MOVOU	-32(AX)(CX*1), X14
  1077		MOVOU	-16(AX)(CX*1), X15
  1078	
  1079		// xor with seed
  1080		PXOR	X0, X8
  1081		PXOR	X1, X9
  1082		PXOR	X2, X10
  1083		PXOR	X3, X11
  1084		PXOR	X4, X12
  1085		PXOR	X5, X13
  1086		PXOR	X6, X14
  1087		PXOR	X7, X15
  1088	
  1089		// scramble 3 times
  1090		AESENC	X8, X8
  1091		AESENC	X9, X9
  1092		AESENC	X10, X10
  1093		AESENC	X11, X11
  1094		AESENC	X12, X12
  1095		AESENC	X13, X13
  1096		AESENC	X14, X14
  1097		AESENC	X15, X15
  1098	
  1099		AESENC	X8, X8
  1100		AESENC	X9, X9
  1101		AESENC	X10, X10
  1102		AESENC	X11, X11
  1103		AESENC	X12, X12
  1104		AESENC	X13, X13
  1105		AESENC	X14, X14
  1106		AESENC	X15, X15
  1107	
  1108		AESENC	X8, X8
  1109		AESENC	X9, X9
  1110		AESENC	X10, X10
  1111		AESENC	X11, X11
  1112		AESENC	X12, X12
  1113		AESENC	X13, X13
  1114		AESENC	X14, X14
  1115		AESENC	X15, X15
  1116	
  1117		// combine results
  1118		PXOR	X12, X8
  1119		PXOR	X13, X9
  1120		PXOR	X14, X10
  1121		PXOR	X15, X11
  1122		PXOR	X10, X8
  1123		PXOR	X11, X9
  1124		PXOR	X9, X8
  1125		MOVQ	X8, (DX)
  1126		RET
  1127	
  1128	aes129plus:
  1129		// make 7 more starting seeds
  1130		MOVO	X1, X2
  1131		MOVO	X1, X3
  1132		MOVO	X1, X4
  1133		MOVO	X1, X5
  1134		MOVO	X1, X6
  1135		MOVO	X1, X7
  1136		PXOR	runtime·aeskeysched+16(SB), X1
  1137		PXOR	runtime·aeskeysched+32(SB), X2
  1138		PXOR	runtime·aeskeysched+48(SB), X3
  1139		PXOR	runtime·aeskeysched+64(SB), X4
  1140		PXOR	runtime·aeskeysched+80(SB), X5
  1141		PXOR	runtime·aeskeysched+96(SB), X6
  1142		PXOR	runtime·aeskeysched+112(SB), X7
  1143		AESENC	X1, X1
  1144		AESENC	X2, X2
  1145		AESENC	X3, X3
  1146		AESENC	X4, X4
  1147		AESENC	X5, X5
  1148		AESENC	X6, X6
  1149		AESENC	X7, X7
  1150		
  1151		// start with last (possibly overlapping) block
  1152		MOVOU	-128(AX)(CX*1), X8
  1153		MOVOU	-112(AX)(CX*1), X9
  1154		MOVOU	-96(AX)(CX*1), X10
  1155		MOVOU	-80(AX)(CX*1), X11
  1156		MOVOU	-64(AX)(CX*1), X12
  1157		MOVOU	-48(AX)(CX*1), X13
  1158		MOVOU	-32(AX)(CX*1), X14
  1159		MOVOU	-16(AX)(CX*1), X15
  1160	
  1161		// xor in seed
  1162		PXOR	X0, X8
  1163		PXOR	X1, X9
  1164		PXOR	X2, X10
  1165		PXOR	X3, X11
  1166		PXOR	X4, X12
  1167		PXOR	X5, X13
  1168		PXOR	X6, X14
  1169		PXOR	X7, X15
  1170		
  1171		// compute number of remaining 128-byte blocks
      		// CX = (length-1)/128; length > 128 on this path, so CX >= 1.
  1172		DECQ	CX
  1173		SHRQ	$7, CX
  1174		
  1175	aesloop:
  1176		// scramble state
  1177		AESENC	X8, X8
  1178		AESENC	X9, X9
  1179		AESENC	X10, X10
  1180		AESENC	X11, X11
  1181		AESENC	X12, X12
  1182		AESENC	X13, X13
  1183		AESENC	X14, X14
  1184		AESENC	X15, X15
  1185	
  1186		// scramble state, xor in a block
      		// (X0-X7 are reused as load temporaries; the seeds are already folded into X8-X15)
  1187		MOVOU	(AX), X0
  1188		MOVOU	16(AX), X1
  1189		MOVOU	32(AX), X2
  1190		MOVOU	48(AX), X3
  1191		AESENC	X0, X8
  1192		AESENC	X1, X9
  1193		AESENC	X2, X10
  1194		AESENC	X3, X11
  1195		MOVOU	64(AX), X4
  1196		MOVOU	80(AX), X5
  1197		MOVOU	96(AX), X6
  1198		MOVOU	112(AX), X7
  1199		AESENC	X4, X12
  1200		AESENC	X5, X13
  1201		AESENC	X6, X14
  1202		AESENC	X7, X15
  1203	
  1204		ADDQ	$128, AX	// advance to the next 128-byte block
  1205		DECQ	CX
  1206		JNE	aesloop
  1207	
  1208		// 3 more scrambles to finish
  1209		AESENC	X8, X8
  1210		AESENC	X9, X9
  1211		AESENC	X10, X10
  1212		AESENC	X11, X11
  1213		AESENC	X12, X12
  1214		AESENC	X13, X13
  1215		AESENC	X14, X14
  1216		AESENC	X15, X15
  1217		AESENC	X8, X8
  1218		AESENC	X9, X9
  1219		AESENC	X10, X10
  1220		AESENC	X11, X11
  1221		AESENC	X12, X12
  1222		AESENC	X13, X13
  1223		AESENC	X14, X14
  1224		AESENC	X15, X15
  1225		AESENC	X8, X8
  1226		AESENC	X9, X9
  1227		AESENC	X10, X10
  1228		AESENC	X11, X11
  1229		AESENC	X12, X12
  1230		AESENC	X13, X13
  1231		AESENC	X14, X14
  1232		AESENC	X15, X15
  1233	
      		// combine the eight lanes into X8
  1234		PXOR	X12, X8
  1235		PXOR	X13, X9
  1236		PXOR	X14, X10
  1237		PXOR	X15, X11
  1238		PXOR	X10, X8
  1239		PXOR	X11, X9
  1240		PXOR	X9, X8
  1241		MOVQ	X8, (DX)
  1242		RET
  1243		
      	// aeshash32 hashes the 4 bytes at p with seed h: the seed goes in the
      	// low 64 bits of X0, the data in dword lane 2, followed by three AESENC
      	// rounds keyed from runtime·aeskeysched. The low 64 bits are returned.
  1244	TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1245		MOVQ	p+0(FP), AX	// ptr to data
  1246		MOVQ	h+8(FP), X0	// seed
  1247		PINSRD	$2, (AX), X0	// data
  1248		AESENC	runtime·aeskeysched+0(SB), X0
  1249		AESENC	runtime·aeskeysched+16(SB), X0
  1250		AESENC	runtime·aeskeysched+32(SB), X0
  1251		MOVQ	X0, ret+16(FP)
  1252		RET
  1253	
      	// aeshash64 hashes the 8 bytes at p with seed h: the seed goes in the
      	// low 64 bits of X0, the data in the high 64 bits, followed by three
      	// AESENC rounds keyed from runtime·aeskeysched. The low 64 bits are
      	// returned.
  1254	TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1255		MOVQ	p+0(FP), AX	// ptr to data
  1256		MOVQ	h+8(FP), X0	// seed
  1257		PINSRQ	$1, (AX), X0	// data
  1258		AESENC	runtime·aeskeysched+0(SB), X0
  1259		AESENC	runtime·aeskeysched+16(SB), X0
  1260		AESENC	runtime·aeskeysched+32(SB), X0
  1261		MOVQ	X0, ret+16(FP)
  1262		RET
  1263	
  1264	// simple mask to get rid of data in the high part of the register.
      	// The table holds sixteen 16-byte entries; entry i (at offset i*16) has
      	// its low i bytes set to 0xff and the remaining bytes zero.
      	// aeshashbody indexes it by length on its aes0to15 path.
  1265	DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1266	DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1267	DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1268	DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1269	DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1270	DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1271	DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1272	DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1273	DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1274	DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1275	DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1276	DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1277	DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1278	DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1279	DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1280	DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1281	DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1282	DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1283	DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1284	DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1285	DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1286	DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1287	DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1288	DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1289	DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1290	DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1291	DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1292	DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1293	DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1294	DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1295	DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1296	DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1297	GLOBL masks<>(SB),RODATA,$256
  1298	
      	// checkASM reports (as a bool in ret) whether both lookup tables are
      	// 16-byte aligned. The two addresses are ORed together so that a
      	// single test of the low 4 bits covers both.
  1299	TEXT ·checkASM(SB),NOSPLIT,$0-1
  1300		// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
  1301		MOVQ	$masks<>(SB), AX
  1302		MOVQ	$shifts<>(SB), BX
  1303		ORQ	BX, AX
  1304		TESTQ	$15, AX
  1305		SETEQ	ret+0(FP)	// 1 iff the low 4 bits of both addresses are zero
  1306		RET
  1307	
  1308	// these are arguments to pshufb. They move data down from
  1309	// the high bytes of the register to the low bytes of the register.
  1310	// index is how many bytes to move.
      	// The table holds sixteen 16-byte entries; in entry i the low i control
      	// bytes select source bytes 16-i..15 and the rest are 0xff, which
      	// PSHUFB turns into zero bytes.
  1311	DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1312	DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1313	DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1314	DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1315	DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1316	DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1317	DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1318	DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1319	DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1320	DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1321	DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1322	DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1323	DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1324	DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1325	DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1326	DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1327	DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1328	DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1329	DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1330	DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1331	DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1332	DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1333	DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1334	DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1335	DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1336	DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1337	DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1338	DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1339	DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1340	DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1341	DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1342	DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1343	GLOBL shifts<>(SB),RODATA,$256
  1344	
  1345	// memequal(p, q unsafe.Pointer, size uintptr) bool
      	// Returns true immediately when both pointers are identical; otherwise
      	// tail-jumps to memeqbody with SI = a, DI = b, BX = size, AX = &ret.
  1346	TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1347		MOVQ	a+0(FP), SI
  1348		MOVQ	b+8(FP), DI
  1349		CMPQ	SI, DI
  1350		JEQ	eq	// same address: equal without reading memory
  1351		MOVQ	size+16(FP), BX
  1352		LEAQ	ret+24(FP), AX
  1353		JMP	runtime·memeqbody(SB)
  1354	eq:
  1355		MOVB	$1, ret+24(FP)
  1356		RET
  1357	
  1358	// memequal_varlen(a, b unsafe.Pointer) bool
      	// Like memequal, but the size is read from 8(DX) instead of from an
      	// argument. Tail-jumps to memeqbody with SI = a, DI = b, BX = size,
      	// AX = &ret.
  1359	TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1360		MOVQ	a+0(FP), SI
  1361		MOVQ	b+8(FP), DI
  1362		CMPQ	SI, DI
  1363		JEQ	eq	// same address: equal without reading memory
  1364		MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1365		LEAQ	ret+16(FP), AX
  1366		JMP	runtime·memeqbody(SB)
  1367	eq:
  1368		MOVB	$1, ret+16(FP)
  1369		RET
  1370	
  1371	// eqstring tests whether two strings are equal.
  1372	// The compiler guarantees that strings passed
  1373	// to eqstring have equal length.
  1374	// See runtime_test.go:eqstring_generic for
  1375	// equivalent Go code.
      	// Only s1's length is read; it is used as the byte count for memeqbody.
  1376	TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1377		MOVQ	s1_base+0(FP), SI
  1378		MOVQ	s2_base+16(FP), DI
  1379		CMPQ	SI, DI
  1380		JEQ	eq	// same base pointer (and equal length): equal
  1381		MOVQ	s1_len+8(FP), BX
  1382		LEAQ	ret+32(FP), AX
  1383		JMP	runtime·memeqbody(SB)
  1384	eq:
  1385		MOVB	$1, ret+32(FP)
  1386		RET
  1387	
  1388	// a in SI
  1389	// b in DI
  1390	// count in BX
  1391	// address of result byte in AX
      	//
      	// memeqbody compares count bytes at a and b and stores 1 (equal) or
      	// 0 (different) to the byte at AX. Strategy by size: fewer than 8 bytes
      	// use one masked 8-byte load per side; 8-63 bytes use 8-byte words;
      	// 64 or more use 64-byte SSE or AVX2 chunks, then the word loop.
  1392	TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1393		CMPQ	BX, $8
  1394		JB	small
  1395		CMPQ	BX, $64
  1396		JB	bigloop
  1397		CMPB    runtime·support_avx2(SB), $1
  1398		JE	hugeloop_avx2
  1399		
  1400		// 64 bytes at a time using xmm registers
  1401	hugeloop:
  1402		CMPQ	BX, $64
  1403		JB	bigloop
  1404		MOVOU	(SI), X0
  1405		MOVOU	(DI), X1
  1406		MOVOU	16(SI), X2
  1407		MOVOU	16(DI), X3
  1408		MOVOU	32(SI), X4
  1409		MOVOU	32(DI), X5
  1410		MOVOU	48(SI), X6
  1411		MOVOU	48(DI), X7
  1412		PCMPEQB	X1, X0
  1413		PCMPEQB	X3, X2
  1414		PCMPEQB	X5, X4
  1415		PCMPEQB	X7, X6
  1416		PAND	X2, X0
  1417		PAND	X6, X4
  1418		PAND	X4, X0
  1419		PMOVMSKB X0, DX	// one bit per byte; all 16 set iff all 64 bytes matched
  1420		ADDQ	$64, SI
  1421		ADDQ	$64, DI
  1422		SUBQ	$64, BX
  1423		CMPL	DX, $0xffff
  1424		JEQ	hugeloop
  1425		MOVB	$0, (AX)
  1426		RET
  1427	
  1428		// 64 bytes at a time using ymm registers
  1429	hugeloop_avx2:
  1430		CMPQ	BX, $64
  1431		JB	bigloop_avx2
  1432		VMOVDQU	(SI), Y0
  1433		VMOVDQU	(DI), Y1
  1434		VMOVDQU	32(SI), Y2
  1435		VMOVDQU	32(DI), Y3
  1436		VPCMPEQB	Y1, Y0, Y4
  1437		VPCMPEQB	Y2, Y3, Y5
  1438		VPAND	Y4, Y5, Y6
  1439		VPMOVMSKB Y6, DX	// one bit per byte; all 32 set iff all 64 bytes matched
  1440		ADDQ	$64, SI
  1441		ADDQ	$64, DI
  1442		SUBQ	$64, BX
  1443		CMPL	DX, $0xffffffff
  1444		JEQ	hugeloop_avx2
  1445		VZEROUPPER
  1446		MOVB	$0, (AX)
  1447		RET
  1448	
  1449	bigloop_avx2:
      		// clear upper YMM state, then fall through to the 8-byte loop
  1450		VZEROUPPER
  1451	
  1452		// 8 bytes at a time using 64-bit register
  1453	bigloop:
  1454		CMPQ	BX, $8
  1455		JBE	leftover
  1456		MOVQ	(SI), CX
  1457		MOVQ	(DI), DX
  1458		ADDQ	$8, SI
  1459		ADDQ	$8, DI
  1460		SUBQ	$8, BX
  1461		CMPQ	CX, DX
  1462		JEQ	bigloop
  1463		MOVB	$0, (AX)
  1464		RET
  1465	
  1466		// remaining 0-8 bytes
      		// Compare the 8 bytes ending at the last byte; this may re-read
      		// bytes that were already compared.
  1467	leftover:
  1468		MOVQ	-8(SI)(BX*1), CX
  1469		MOVQ	-8(DI)(BX*1), DX
  1470		CMPQ	CX, DX
  1471		SETEQ	(AX)
  1472		RET
  1473	
  1474	small:
  1475		CMPQ	BX, $0
  1476		JEQ	equal	// ZF is still set at equal, so SETEQ stores 1
  1477	
      		// CX = -8*count; used below as a shift count
  1478		LEAQ	0(BX*8), CX
  1479		NEGQ	CX
  1480	
      		// low address byte above 0xf8: take the si_high path
  1481		CMPB	SI, $0xf8
  1482		JA	si_high
  1483	
  1484		// load at SI won't cross a page boundary.
  1485		MOVQ	(SI), SI
  1486		JMP	si_finish
  1487	si_high:
  1488		// address ends in 11111xxx. Load up to bytes we want, move to correct position.
  1489		MOVQ	-8(SI)(BX*1), SI
  1490		SHRQ	CX, SI
  1491	si_finish:
  1492	
  1493		// same for DI.
  1494		CMPB	DI, $0xf8
  1495		JA	di_high
  1496		MOVQ	(DI), DI
  1497		JMP	di_finish
  1498	di_high:
  1499		MOVQ	-8(DI)(BX*1), DI
  1500		SHRQ	CX, DI
  1501	di_finish:
  1502	
      		// Subtract, then shift the bytes beyond count out of the top.
      		// The final SHLQ leaves ZF set iff the low count bytes were equal.
  1503		SUBQ	SI, DI
  1504		SHLQ	CX, DI
  1505	equal:
  1506		SETEQ	(AX)
  1507		RET
  1508	
      	// cmpstring loads both string headers and tail-jumps to cmpbody with
      	// SI/BX = s1 base/len, DI/DX = s2 base/len, R9 = &ret.
  1509	TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1510		MOVQ	s1_base+0(FP), SI
  1511		MOVQ	s1_len+8(FP), BX
  1512		MOVQ	s2_base+16(FP), DI
  1513		MOVQ	s2_len+24(FP), DX
  1514		LEAQ	ret+32(FP), R9
  1515		JMP	runtime·cmpbody(SB)
  1516	
      	// bytes·Compare loads the pointer and length words of both arguments
      	// (offsets 0/8 and 24/32) and tail-jumps to cmpbody with
      	// SI/BX = first ptr/len, DI/DX = second ptr/len, R9 = &res.
  1517	TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1518		MOVQ	s1+0(FP), SI
  1519		MOVQ	s1+8(FP), BX
  1520		MOVQ	s2+24(FP), DI
  1521		MOVQ	s2+32(FP), DX
  1522		LEAQ	res+48(FP), R9
  1523		JMP	runtime·cmpbody(SB)
  1524	
  1525	// input:
  1526	//   SI = a
  1527	//   DI = b
  1528	//   BX = alen
  1529	//   DX = blen
  1530	//   R9 = address of output word (stores -1/0/1 here)
      	//
      	// cmpbody compares the first min(alen, blen) bytes of a and b. At the
      	// first differing byte it stores +1 if a's byte is larger (unsigned)
      	// and -1 otherwise. If those bytes all match, the result is decided by
      	// the lengths (see allsame).
  1531	TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1532		CMPQ	SI, DI
  1533		JEQ	allsame	// same base pointer: only the lengths can differ
  1534		CMPQ	BX, DX
  1535		MOVQ	DX, R8
  1536		CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1537		CMPQ	R8, $8
  1538		JB	small
  1539	
  1540		CMPQ	R8, $63
  1541		JBE	loop
  1542		CMPB    runtime·support_avx2(SB), $1
  1543		JEQ     big_loop_avx2
  1544		JMP	big_loop
      		// 16 bytes per iteration while more than 16 bytes remain
  1545	loop:
  1546		CMPQ	R8, $16
  1547		JBE	_0through16
  1548		MOVOU	(SI), X0
  1549		MOVOU	(DI), X1
  1550		PCMPEQB X0, X1
  1551		PMOVMSKB X1, AX
  1552		XORQ	$0xffff, AX	// convert EQ to NE
  1553		JNE	diff16	// branch if at least one byte is not equal
  1554		ADDQ	$16, SI
  1555		ADDQ	$16, DI
  1556		SUBQ	$16, R8
  1557		JMP	loop
  1558		
      		// diff64/diff48/diff32 advance SI and DI to the 16-byte block that
      		// holds the difference, then share the diff16 code.
  1559	diff64:
  1560		ADDQ	$48, SI
  1561		ADDQ	$48, DI
  1562		JMP	diff16
  1563	diff48:
  1564		ADDQ	$32, SI
  1565		ADDQ	$32, DI
  1566		JMP	diff16
  1567	diff32:
  1568		ADDQ	$16, SI
  1569		ADDQ	$16, DI
  1570		// AX = bit mask of differences
  1571	diff16:
  1572		BSFQ	AX, BX	// index of first byte that differs
  1573		XORQ	AX, AX
  1574		MOVB	(SI)(BX*1), CX
  1575		CMPB	CX, (DI)(BX*1)
  1576		SETHI	AX	// 1 if a's byte is above b's (unsigned)
  1577		LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1578		MOVQ	AX, (R9)
  1579		RET
  1580	
  1581		// 0 through 16 bytes left, alen>=8, blen>=8
  1582	_0through16:
  1583		CMPQ	R8, $8
  1584		JBE	_0through8
  1585		MOVQ	(SI), AX
  1586		MOVQ	(DI), CX
  1587		CMPQ	AX, CX
  1588		JNE	diff8
      		// compare the 8 bytes ending at the last byte to compare
  1589	_0through8:
  1590		MOVQ	-8(SI)(R8*1), AX
  1591		MOVQ	-8(DI)(R8*1), CX
  1592		CMPQ	AX, CX
  1593		JEQ	allsame
  1594	
  1595		// AX and CX contain parts of a and b that differ.
  1596	diff8:
  1597		BSWAPQ	AX	// reverse order of bytes
  1598		BSWAPQ	CX
  1599		XORQ	AX, CX
  1600		BSRQ	CX, CX	// index of highest bit difference
  1601		SHRQ	CX, AX	// move a's bit to bottom
  1602		ANDQ	$1, AX	// mask bit
  1603		LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1604		MOVQ	AX, (R9)
  1605		RET
  1606	
  1607		// 0-7 bytes in common
  1608	small:
  1609		LEAQ	(R8*8), CX	// bytes left -> bits left
  1610		NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1611		JEQ	allsame
  1612	
  1613		// load bytes of a into high bytes of SI
      		// (a low address byte above 0xf8 takes the si_high path, which
      		// loads the 8 bytes ending at the last byte instead)
  1614		CMPB	SI, $0xf8
  1615		JA	si_high
  1616		MOVQ	(SI), SI
  1617		JMP	si_finish
  1618	si_high:
  1619		MOVQ	-8(SI)(R8*1), SI
  1620		SHRQ	CX, SI
  1621	si_finish:
  1622		SHLQ	CX, SI
  1623	
  1624		// load bytes of b into high bytes of DI
  1625		CMPB	DI, $0xf8
  1626		JA	di_high
  1627		MOVQ	(DI), DI
  1628		JMP	di_finish
  1629	di_high:
  1630		MOVQ	-8(DI)(R8*1), DI
  1631		SHRQ	CX, DI
  1632	di_finish:
  1633		SHLQ	CX, DI
  1634	
  1635		BSWAPQ	SI	// reverse order of bytes
  1636		BSWAPQ	DI
  1637		XORQ	SI, DI	// find bit differences
  1638		JEQ	allsame
  1639		BSRQ	DI, CX	// index of highest bit difference
  1640		SHRQ	CX, SI	// move a's bit to bottom
  1641		ANDQ	$1, SI	// mask bit
  1642		LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1643		MOVQ	AX, (R9)
  1644		RET
  1645	
      		// All compared bytes match: the result depends only on the lengths.
  1646	allsame:
  1647		XORQ	AX, AX
  1648		XORQ	CX, CX
  1649		CMPQ	BX, DX
  1650		SETGT	AX	// 1 if alen > blen
  1651		SETEQ	CX	// 1 if alen == blen
  1652		LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1653		MOVQ	AX, (R9)
  1654		RET
  1655	
  1656		// this works for >= 64 bytes of data.
  1657	big_loop:
  1658		MOVOU	(SI), X0
  1659		MOVOU	(DI), X1
  1660		PCMPEQB X0, X1
  1661		PMOVMSKB X1, AX
  1662		XORQ	$0xffff, AX
  1663		JNE	diff16
  1664	
  1665		MOVOU	16(SI), X0
  1666		MOVOU	16(DI), X1
  1667		PCMPEQB X0, X1
  1668		PMOVMSKB X1, AX
  1669		XORQ	$0xffff, AX
  1670		JNE	diff32
  1671	
  1672		MOVOU	32(SI), X0
  1673		MOVOU	32(DI), X1
  1674		PCMPEQB X0, X1
  1675		PMOVMSKB X1, AX
  1676		XORQ	$0xffff, AX
  1677		JNE	diff48
  1678	
  1679		MOVOU	48(SI), X0
  1680		MOVOU	48(DI), X1
  1681		PCMPEQB X0, X1
  1682		PMOVMSKB X1, AX
  1683		XORQ	$0xffff, AX
  1684		JNE	diff64
  1685	
  1686		ADDQ	$64, SI
  1687		ADDQ	$64, DI
  1688		SUBQ	$64, R8
  1689		CMPQ	R8, $64
  1690		JBE	loop
  1691		JMP	big_loop
  1692	
  1693		// Compare 64-bytes per loop iteration.
  1694		// Loop is unrolled and uses AVX2.
  1695	big_loop_avx2:
  1696		VMOVDQU	(SI), Y2
  1697		VMOVDQU	(DI), Y3
  1698		VMOVDQU	32(SI), Y4
  1699		VMOVDQU	32(DI), Y5
  1700		VPCMPEQB Y2, Y3, Y0
  1701		VPMOVMSKB Y0, AX
  1702		XORL	$0xffffffff, AX
  1703		JNE	diff32_avx2
  1704		VPCMPEQB Y4, Y5, Y6
  1705		VPMOVMSKB Y6, AX
  1706		XORL	$0xffffffff, AX
  1707		JNE	diff64_avx2
  1708	
  1709		ADDQ	$64, SI
  1710		ADDQ	$64, DI
  1711		SUBQ	$64, R8
  1712		CMPQ	R8, $64
  1713		JB	big_loop_avx2_exit
  1714		JMP	big_loop_avx2
  1715	
  1716		// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
      		// AX holds a 32-bit difference mask; diff16's BSFQ indexes it from SI/DI.
  1717	diff32_avx2:
  1718		VZEROUPPER
  1719		JMP diff16
  1720	
  1721		// Same as diff32_avx2, but for last 32 bytes.
      		// diff48 first advances SI and DI by 32.
  1722	diff64_avx2:
  1723		VZEROUPPER
  1724		JMP diff48
  1725	
  1726		// For <64 bytes remainder jump to normal loop.
  1727	big_loop_avx2_exit:
  1728		VZEROUPPER
  1729		JMP loop
  1730	
  1731	
  1732	TEXT strings·supportAVX2(SB),NOSPLIT,$0-1
  1733		MOVBLZX runtime·support_avx2(SB), AX
  1734		MOVB AX, ret+0(FP)
  1735		RET
  1736	
  1737	TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
  1738		MOVBLZX runtime·support_avx2(SB), AX
  1739		MOVB AX, ret+0(FP)
  1740		RET
  1741	
      	// strings·indexShortStr sets up the register arguments of
      	// runtime·indexShortStr from (s, c string) and tail-jumps to it:
      	// DI/DX = s ptr/len, BP/AX = c ptr/len, R10 = start of s, R11 = &ret.
  1742	TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1743		MOVQ s+0(FP), DI
  1744		// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1745		MOVQ s_len+8(FP), DX
  1746		MOVQ c+16(FP), BP
  1747		MOVQ c_len+24(FP), AX
  1748		MOVQ DI, R10	// keep the start of s; the body advances DI
  1749		LEAQ ret+32(FP), R11
  1750		JMP  runtime·indexShortStr(SB)
  1751	
      	// bytes·indexShortStr sets up the register arguments of
      	// runtime·indexShortStr from its two arguments (pointer/length words
      	// at offsets 0/8 and 24/32) and tail-jumps to it:
      	// DI/DX = s ptr/len, BP/AX = c ptr/len, R10 = start of s, R11 = &ret.
  1752	TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1753		MOVQ s+0(FP), DI
  1754		MOVQ s_len+8(FP), DX
  1755		MOVQ c+24(FP), BP
  1756		MOVQ c_len+32(FP), AX
  1757		MOVQ DI, R10	// keep the start of s; the body advances DI
  1758		LEAQ ret+48(FP), R11
  1759		JMP  runtime·indexShortStr(SB)
  1760	
  1761	// AX: length of string, that we are searching for
  1762	// DX: length of string, in which we are searching
  1763	// DI: pointer to string, in which we are searching
  1764	// BP: pointer to string, that we are searching for
  1765	// R11: address, where to put return value
      	// R10: start of the string in which we are searching (set by the
      	//      wrappers; subtracted from the match address at success)
      	//
      	// Stores the byte offset of the first match to (R11), or -1 if there is
      	// none. The search loop is chosen by the needle length in AX. Lengths
      	// that are not an exact load width compare the first and the last
      	// load-width bytes of the needle, which may overlap.
  1766	TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1767		CMPQ AX, DX
  1768		JA fail	// needle longer than haystack
  1769		CMPQ DX, $16
  1770		JAE sse42
  1771	no_sse42:
      		// needle length <= 2: compare one 16-bit word per position
  1772		CMPQ AX, $2
  1773		JA   _3_or_more
  1774		MOVW (BP), BP
  1775		LEAQ -1(DI)(DX*1), DX	// DX = end of the candidate start positions
  1776	loop2:
  1777		MOVW (DI), SI
  1778		CMPW SI,BP
  1779		JZ success
  1780		ADDQ $1,DI
  1781		CMPQ DI,DX
  1782		JB loop2
  1783		JMP fail
  1784	_3_or_more:
      		// needle length 3: compare bytes 0-1, then bytes 1-2
  1785		CMPQ AX, $3
  1786		JA   _4_or_more
  1787		MOVW 1(BP), BX
  1788		MOVW (BP), BP
  1789		LEAQ -2(DI)(DX*1), DX
  1790	loop3:
  1791		MOVW (DI), SI
  1792		CMPW SI,BP
  1793		JZ   partial_success3
  1794		ADDQ $1,DI
  1795		CMPQ DI,DX
  1796		JB loop3
  1797		JMP fail
  1798	partial_success3:
  1799		MOVW 1(DI), SI
  1800		CMPW SI,BX
  1801		JZ success
  1802		ADDQ $1,DI
  1803		CMPQ DI,DX
  1804		JB loop3
  1805		JMP fail
  1806	_4_or_more:
      		// needle length 4: one 32-bit compare per position
  1807		CMPQ AX, $4
  1808		JA   _5_or_more
  1809		MOVL (BP), BP
  1810		LEAQ -3(DI)(DX*1), DX
  1811	loop4:
  1812		MOVL (DI), SI
  1813		CMPL SI,BP
  1814		JZ   success
  1815		ADDQ $1,DI
  1816		CMPQ DI,DX
  1817		JB loop4
  1818		JMP fail
  1819	_5_or_more:
      		// needle length 5-7: compare first 4 bytes, then last 4 bytes
  1820		CMPQ AX, $7
  1821		JA   _8_or_more
  1822		LEAQ 1(DI)(DX*1), DX
  1823		SUBQ AX, DX	// DX = haystack end - needle length + 1
  1824		MOVL -4(BP)(AX*1), BX	// last 4 bytes of the needle
  1825		MOVL (BP), BP	// first 4 bytes of the needle
  1826	loop5to7:
  1827		MOVL (DI), SI
  1828		CMPL SI,BP
  1829		JZ   partial_success5to7
  1830		ADDQ $1,DI
  1831		CMPQ DI,DX
  1832		JB loop5to7
  1833		JMP fail
  1834	partial_success5to7:
  1835		MOVL -4(AX)(DI*1), SI
  1836		CMPL SI,BX
  1837		JZ success
  1838		ADDQ $1,DI
  1839		CMPQ DI,DX
  1840		JB loop5to7
  1841		JMP fail
  1842	_8_or_more:
      		// needle length 8: one 64-bit compare per position
  1843		CMPQ AX, $8
  1844		JA   _9_or_more
  1845		MOVQ (BP), BP
  1846		LEAQ -7(DI)(DX*1), DX
  1847	loop8:
  1848		MOVQ (DI), SI
  1849		CMPQ SI,BP
  1850		JZ   success
  1851		ADDQ $1,DI
  1852		CMPQ DI,DX
  1853		JB loop8
  1854		JMP fail
  1855	_9_or_more:
      		// needle length 9-15: compare first 8 bytes, then last 8 bytes
  1856		CMPQ AX, $15
  1857		JA   _16_or_more
  1858		LEAQ 1(DI)(DX*1), DX
  1859		SUBQ AX, DX
  1860		MOVQ -8(BP)(AX*1), BX
  1861		MOVQ (BP), BP
  1862	loop9to15:
  1863		MOVQ (DI), SI
  1864		CMPQ SI,BP
  1865		JZ   partial_success9to15
  1866		ADDQ $1,DI
  1867		CMPQ DI,DX
  1868		JB loop9to15
  1869		JMP fail
  1870	partial_success9to15:
  1871		MOVQ -8(AX)(DI*1), SI
  1872		CMPQ SI,BX
  1873		JZ success
  1874		ADDQ $1,DI
  1875		CMPQ DI,DX
  1876		JB loop9to15
  1877		JMP fail
  1878	_16_or_more:
      		// needle length 16: one 16-byte SSE compare per position
  1879		CMPQ AX, $16
  1880		JA   _17_or_more
  1881		MOVOU (BP), X1
  1882		LEAQ -15(DI)(DX*1), DX
  1883	loop16:
  1884		MOVOU (DI), X2
  1885		PCMPEQB X1, X2
  1886		PMOVMSKB X2, SI
  1887		CMPQ  SI, $0xffff
  1888		JE   success
  1889		ADDQ $1,DI
  1890		CMPQ DI,DX
  1891		JB loop16
  1892		JMP fail
  1893	_17_or_more:
      		// needle length 17-31: compare first 16 bytes, then last 16 bytes
  1894		CMPQ AX, $31
  1895		JA   _32_or_more
  1896		LEAQ 1(DI)(DX*1), DX
  1897		SUBQ AX, DX
  1898		MOVOU -16(BP)(AX*1), X0
  1899		MOVOU (BP), X1
  1900	loop17to31:
  1901		MOVOU (DI), X2
  1902		PCMPEQB X1,X2
  1903		PMOVMSKB X2, SI
  1904		CMPQ  SI, $0xffff
  1905		JE   partial_success17to31
  1906		ADDQ $1,DI
  1907		CMPQ DI,DX
  1908		JB loop17to31
  1909		JMP fail
  1910	partial_success17to31:
  1911		MOVOU -16(AX)(DI*1), X3
  1912		PCMPEQB X0, X3
  1913		PMOVMSKB X3, SI
  1914		CMPQ  SI, $0xffff
  1915		JE success
  1916		ADDQ $1,DI
  1917		CMPQ DI,DX
  1918		JB loop17to31
  1919		JMP fail
  1920	// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
  1921	// So no need to check cpuid
  1922	_32_or_more:
      		// needle length 32: one 32-byte AVX2 compare per position
  1923		CMPQ AX, $32
  1924		JA   _33_to_63
  1925		VMOVDQU (BP), Y1
  1926		LEAQ -31(DI)(DX*1), DX
  1927	loop32:
  1928		VMOVDQU (DI), Y2
  1929		VPCMPEQB Y1, Y2, Y3
  1930		VPMOVMSKB Y3, SI
  1931		CMPL  SI, $0xffffffff
  1932		JE   success_avx2
  1933		ADDQ $1,DI
  1934		CMPQ DI,DX
  1935		JB loop32
  1936		JMP fail_avx2
  1937	_33_to_63:
      		// needle length above 32: compare first 32 bytes, then last 32 bytes
  1938		LEAQ 1(DI)(DX*1), DX
  1939		SUBQ AX, DX
  1940		VMOVDQU -32(BP)(AX*1), Y0
  1941		VMOVDQU (BP), Y1
  1942	loop33to63:
  1943		VMOVDQU (DI), Y2
  1944		VPCMPEQB Y1, Y2, Y3
  1945		VPMOVMSKB Y3, SI
  1946		CMPL  SI, $0xffffffff
  1947		JE   partial_success33to63
  1948		ADDQ $1,DI
  1949		CMPQ DI,DX
  1950		JB loop33to63
  1951		JMP fail_avx2
  1952	partial_success33to63:
  1953		VMOVDQU -32(AX)(DI*1), Y3
  1954		VPCMPEQB Y0, Y3, Y4
  1955		VPMOVMSKB Y4, SI
  1956		CMPL  SI, $0xffffffff
  1957		JE success_avx2
  1958		ADDQ $1,DI
  1959		CMPQ DI,DX
  1960		JB loop33to63
      		// falls through to fail_avx2 when the positions are exhausted
  1961	fail_avx2:
  1962		VZEROUPPER
  1963	fail:
  1964		MOVQ $-1, (R11)
  1965		RET
  1966	success_avx2:
  1967		VZEROUPPER
  1968		JMP success
      		// Reached when the haystack is at least 16 bytes long.
  1969	sse42:
  1970		MOVL runtime·cpuid_ecx(SB), CX
  1971		ANDL $0x100000, CX	// bit 20 of the saved cpuid ECX word (SSE4.2)
  1972		JZ no_sse42
  1973		CMPQ AX, $12
  1974		// PCMPESTRI is slower than normal compare,
  1975		// so using it makes sense only if we advance 4+ bytes per compare
  1976		// This value was determined experimentally and is the ~same
  1977		// on Nehalem (first with SSE42) and Haswell.
  1978		JAE _9_or_more
      		// Fall back when bits 4-11 of BP+16 are all zero, i.e. the needle
      		// starts within the last 16 bytes of a 4KB page, where the 16-byte
      		// load at BP below could read past the end of the page.
  1979		LEAQ 16(BP), SI
  1980		TESTW $0xff0, SI
  1981		JEQ no_sse42
  1982		MOVOU (BP), X1
  1983		LEAQ -15(DI)(DX*1), SI	// SI = haystack end - 15
  1984		MOVQ $16, R9
  1985		SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1986	loop_sse42:
  1987		// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1988		// for equality (bits 2,3 are 11)
  1989		// result is not masked or inverted (bits 4,5 are 00)
  1990		// and corresponds to first matching byte (bit 6 is 0)
  1991		PCMPESTRI $0x0c, (DI), X1
  1992		// CX == 16 means no match,
  1993		// CX > R9 means partial match at the end of the string,
  1994		// otherwise sep is at offset CX from X1 start
  1995		CMPQ CX, R9
  1996		JBE sse42_success
  1997		ADDQ R9, DI
  1998		CMPQ DI, SI
  1999		JB loop_sse42
      		// final attempt on the last 16 bytes of the haystack (SI-1 = end-16)
  2000		PCMPESTRI $0x0c, -1(SI), X1
  2001		CMPQ CX, R9
  2002		JA fail
  2003		LEAQ -1(SI), DI
  2004	sse42_success:
  2005		ADDQ CX, DI	// DI = address of the match
  2006	success:
  2007		SUBQ R10, DI	// convert the match address to an offset from the start
  2008		MOVQ DI, (R11)
  2009		RET
  2010	
  2011	
      	// bytes·IndexByte loads its arguments into the registers expected by
      	// indexbytebody (SI = data, BX = length, AL = byte, R8 = &ret) and
      	// tail-jumps to it.
  2012	TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  2013		MOVQ s+0(FP), SI
  2014		MOVQ s_len+8(FP), BX
  2015		MOVB c+24(FP), AL
  2016		LEAQ ret+32(FP), R8
  2017		JMP  runtime·indexbytebody(SB)
  2018	
      	// strings·IndexByte loads its arguments into the registers expected by
      	// indexbytebody (SI = data, BX = length, AL = byte, R8 = &ret) and
      	// tail-jumps to it.
  2019	TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  2020		MOVQ s+0(FP), SI
  2021		MOVQ s_len+8(FP), BX
  2022		MOVB c+16(FP), AL
  2023		LEAQ ret+24(FP), R8
  2024		JMP  runtime·indexbytebody(SB)
  2025	
  2026	// input:
  2027	//   SI: data
  2028	//   BX: data len
  2029	//   AL: byte sought
  2030	//   R8: address to put result
      	//
      	// indexbytebody stores the index of the first occurrence of the byte
      	// to (R8), or -1 if it is absent. Lengths below 16 use one guarded
      	// 16-byte load; lengths above 32 use the AVX2 loop when support_avx2
      	// is 1; everything else uses the SSE loop. The SSE and AVX2 loops
      	// finish with a load of the final chunk, which may overlap bytes that
      	// were already searched.
  2031	TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  2032		// Shuffle X0 around so that each byte contains
  2033		// the character we're looking for.
  2034		MOVD AX, X0
  2035		PUNPCKLBW X0, X0
  2036		PUNPCKLBW X0, X0
  2037		PSHUFL $0, X0, X0
  2038		
  2039		CMPQ BX, $16
  2040		JLT small
  2041	
  2042		MOVQ SI, DI	// DI walks the data; SI keeps the start
  2043	
  2044		CMPQ BX, $32
  2045		JA avx2
  2046	sse:
  2047		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2048		JMP	sseloopentry
  2049		
  2050	sseloop:
  2051		// Move the next 16-byte chunk of the data into X1.
  2052		MOVOU	(DI), X1
  2053		// Compare bytes in X0 to X1.
  2054		PCMPEQB	X0, X1
  2055		// Take the top bit of each byte in X1 and put the result in DX.
  2056		PMOVMSKB X1, DX
  2057		// Find first set bit, if any.
  2058		BSFL	DX, DX
  2059		JNZ	ssesuccess
  2060		// Advance to next block.
  2061		ADDQ	$16, DI
  2062	sseloopentry:
  2063		CMPQ	DI, AX
  2064		JB	sseloop
  2065	
  2066		// Search the last 16-byte chunk. This chunk may overlap with the
  2067		// chunks we've already searched, but that's ok.
  2068		MOVQ	AX, DI
  2069		MOVOU	(AX), X1
  2070		PCMPEQB	X0, X1
  2071		PMOVMSKB X1, DX
  2072		BSFL	DX, DX
  2073		JNZ	ssesuccess
  2074	
  2075	failure:
  2076		MOVQ $-1, (R8)
  2077		RET
  2078	
  2079	// We've found a chunk containing the byte.
  2080	// The chunk was loaded from DI.
  2081	// The index of the matching byte in the chunk is DX.
  2082	// The start of the data is SI.
  2083	ssesuccess:
  2084		SUBQ SI, DI	// Compute offset of chunk within data.
  2085		ADDQ DX, DI	// Add offset of byte within chunk.
  2086		MOVQ DI, (R8)
  2087		RET
  2088	
  2089	// handle for lengths < 16
  2090	small:
  2091		TESTQ	BX, BX
  2092		JEQ	failure
  2093	
  2094		// Check if we'll load across a page boundary.
  2095		LEAQ	16(SI), AX
  2096		TESTW	$0xff0, AX
  2097		JEQ	endofpage
  2098	
  2099		MOVOU	(SI), X1 // Load data
  2100		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2101		PMOVMSKB X1, DX	// Move result bits to integer register.
  2102		BSFL	DX, DX	// Find first set bit.
  2103		JZ	failure	// No set bit, failure.
  2104		CMPL	DX, BX
  2105		JAE	failure	// Match is past end of data.
  2106		MOVQ	DX, (R8)
  2107		RET
  2108	
  2109	endofpage:
  2110		MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2111		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2112		PMOVMSKB X1, DX	// Move result bits to integer register.
  2113		MOVL	BX, CX
  2114		SHLL	CX, DX	// Line the data bits up against bit 16.
  2115		SHRL	$16, DX	// Shift desired bits down to bottom of register.
  2116		BSFL	DX, DX	// Find first set bit.
  2117		JZ	failure	// No set bit, failure.
  2118		MOVQ	DX, (R8)
  2119		RET
  2120	
  2121	avx2:
  2122		CMPB   runtime·support_avx2(SB), $1
  2123		JNE sse
  2124		MOVD AX, X0
  2125		LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
  2126		VPBROADCASTB  X0, Y1
  2127	avx2_loop:
  2128		VMOVDQU (DI), Y2
  2129		VPCMPEQB Y1, Y2, Y3
  2130		VPTEST Y3, Y3
  2131		JNZ avx2success
  2132		ADDQ $32, DI
  2133		CMPQ DI, R11
  2134		JB avx2_loop	// unsigned: DI and R11 are addresses (matches sseloopentry)
      		// Search the last 32-byte chunk; it may overlap chunks already searched.
  2135		MOVQ R11, DI
  2136		VMOVDQU (DI), Y2
  2137		VPCMPEQB Y1, Y2, Y3
  2138		VPTEST Y3, Y3
  2139		JNZ avx2success
  2140		VZEROUPPER
  2141		MOVQ $-1, (R8)
  2142		RET
  2143	
      	// The chunk loaded from DI contains the byte; Y3 holds the compare result.
  2144	avx2success:
  2145		VPMOVMSKB Y3, DX
  2146		BSFL DX, DX	// index of the first match within the chunk
  2147		SUBQ SI, DI	// offset of the chunk within the data
  2148		ADDQ DI, DX
  2149		MOVQ DX, (R8)
  2150		VZEROUPPER
  2151		RET
  2152	
      	// bytes·Equal returns false immediately when the two lengths differ;
      	// otherwise it tail-jumps to memeqbody with SI = a, DI = b,
      	// BX = length, AX = &ret.
  2153	TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2154		MOVQ	a_len+8(FP), BX
  2155		MOVQ	b_len+32(FP), CX
  2156		CMPQ	BX, CX
  2157		JNE	eqret	// different lengths: not equal
  2158		MOVQ	a+0(FP), SI
  2159		MOVQ	b+24(FP), DI
  2160		LEAQ	ret+48(FP), AX
  2161		JMP	runtime·memeqbody(SB)
  2162	eqret:
  2163		MOVB	$0, ret+48(FP)
  2164		RET
  2165	
      	// fastrand advances the 32-bit state in m.fastrand of the current g's m
      	// and returns the new state. Step: x += x; the result is x ^ 0x88888eef
      	// unless that xor comes out negative, in which case the un-xored x is
      	// kept.
  2166	TEXT runtime·fastrand(SB), NOSPLIT, $0-4
  2167		get_tls(CX)
  2168		MOVQ	g(CX), AX
  2169		MOVQ	g_m(AX), AX
  2170		MOVL	m_fastrand(AX), DX
  2171		ADDL	DX, DX
  2172		MOVL	DX, BX	// BX = doubled value, before the xor
  2173		XORL	$0x88888eef, DX	// sets the sign flag from the xored value
  2174		CMOVLMI	BX, DX	// xored value negative: use the un-xored value
  2175		MOVL	DX, m_fastrand(AX)
  2176		MOVL	DX, ret+0(FP)
  2177		RET
  2178	
      	// return0 sets AX to 0 and returns.
  2179	TEXT runtime·return0(SB), NOSPLIT, $0
  2180		MOVL	$0, AX
  2181		RET
  2182	
  2183	
  2184	// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2185	// Must obey the gcc calling convention.
      	// The result is left in AX.
  2186	TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2187		get_tls(CX)
  2188		MOVQ	g(CX), AX	// AX = g
  2189		MOVQ	g_m(AX), AX	// AX = g.m
  2190		MOVQ	m_curg(AX), AX	// AX = g.m.curg
  2191		MOVQ	(g_stack+stack_hi)(AX), AX	// AX = g.m.curg.stack.hi
  2192		RET
  2193	
  2194	// The top-most function running on a goroutine
  2195	// returns to goexit+PCQuantum.
      	// The leading NOP keeps that return address inside goexit, just before
      	// the CALL; there is no RET because goexit1 does not return.
  2196	TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2197		BYTE	$0x90	// NOP
  2198		CALL	runtime·goexit1(SB)	// does not return
  2199		// traceback from goexit1 must hit code range of goexit
  2200		BYTE	$0x90	// NOP
  2201	
  2202	TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2203		MOVQ	addr+0(FP), AX
  2204		PREFETCHT0	(AX)
  2205		RET
  2206	
  2207	TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2208		MOVQ	addr+0(FP), AX
  2209		PREFETCHT1	(AX)
  2210		RET
  2211	
  2212	TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2213		MOVQ	addr+0(FP), AX
  2214		PREFETCHT2	(AX)
  2215		RET
  2216	
  2217	TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2218		MOVQ	addr+0(FP), AX
  2219		PREFETCHNTA	(AX)
  2220		RET
  2221	
  2222	// This is called from .init_array and follows the platform, not Go, ABI.
      	// It appends the value in DI to the list ending at
      	// runtime·lastmoduledatap: it is stored as the old tail's next pointer
      	// and becomes the new tail.
      	// NOTE(review): DI is presumably the new moduledata pointer, passed as
      	// the first platform-ABI argument — confirm against the caller.
  2223	TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2224		PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2225		MOVQ	runtime·lastmoduledatap(SB), AX
  2226		MOVQ	DI, moduledata_next(AX)	// old tail's next = DI
  2227		MOVQ	DI, runtime·lastmoduledatap(SB)	// DI becomes the new tail
  2228		POPQ	R15
  2229		RET

View as plain text