...

Text file src/runtime/asm_amd64.s

Documentation: runtime

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "go_tls.h"
     7	#include "funcdata.h"
     8	#include "textflag.h"
     9	
    10	TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11		// copy arguments forward on an even stack
    12		MOVQ	DI, AX		// argc
    13		MOVQ	SI, BX		// argv
    14		SUBQ	$(4*8+7), SP		// 2args 2auto
    15		ANDQ	$~15, SP
    16		MOVQ	AX, 16(SP)
    17		MOVQ	BX, 24(SP)
    18		
    19		// create istack out of the given (operating system) stack.
    20		// _cgo_init may update stackguard.
    21		MOVQ	$runtime·g0(SB), DI
    22		LEAQ	(-64*1024+104)(SP), BX
    23		MOVQ	BX, g_stackguard0(DI)
    24		MOVQ	BX, g_stackguard1(DI)
    25		MOVQ	BX, (g_stack+stack_lo)(DI)
    26		MOVQ	SP, (g_stack+stack_hi)(DI)
    27	
    28		// find out information about the processor we're on
    29		MOVL	$0, AX
    30		CPUID
    31		MOVL	AX, SI
    32		CMPL	AX, $0
    33		JE	nocpuinfo
    34	
    35		// Figure out how to serialize RDTSC.
    36		// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37		// Don't know about the rest, so let's do MFENCE.
    38		CMPL	BX, $0x756E6547  // "Genu"
    39		JNE	notintel
    40		CMPL	DX, $0x49656E69  // "ineI"
    41		JNE	notintel
    42		CMPL	CX, $0x6C65746E  // "ntel"
    43		JNE	notintel
    44		MOVB	$1, runtime·isIntel(SB)
    45		MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    46	notintel:
    47	
    48		// Load EAX=1 cpuid flags
    49		MOVL	$1, AX
    50		CPUID
    51		MOVL	AX, runtime·processorVersionInfo(SB)
    52	
    53		TESTL	$(1<<26), DX // SSE2
    54		SETNE	runtime·support_sse2(SB)
    55	
    56		TESTL	$(1<<9), CX // SSSE3
    57		SETNE	runtime·support_ssse3(SB)
    58	
    59		TESTL	$(1<<19), CX // SSE4.1
    60		SETNE	runtime·support_sse41(SB)
    61	
    62		TESTL	$(1<<20), CX // SSE4.2
    63		SETNE	runtime·support_sse42(SB)
    64	
    65		TESTL	$(1<<23), CX // POPCNT
    66		SETNE	runtime·support_popcnt(SB)
    67	
    68		TESTL	$(1<<25), CX // AES
    69		SETNE	runtime·support_aes(SB)
    70	
    71		TESTL	$(1<<27), CX // OSXSAVE
    72		SETNE	runtime·support_osxsave(SB)
    73	
    74		// If OS support for XMM and YMM is not present
    75		// support_avx will be set back to false later.
    76		TESTL	$(1<<28), CX // AVX
    77		SETNE	runtime·support_avx(SB)
    78	
    79	eax7:
    80		// Load EAX=7/ECX=0 cpuid flags
    81		CMPL	SI, $7
    82		JLT	osavx
    83		MOVL	$7, AX
    84		MOVL	$0, CX
    85		CPUID
    86	
    87		TESTL	$(1<<3), BX // BMI1
    88		SETNE	runtime·support_bmi1(SB)
    89	
    90		// If OS support for XMM and YMM is not present
    91		// support_avx2 will be set back to false later.
    92		TESTL	$(1<<5), BX
    93		SETNE	runtime·support_avx2(SB)
    94	
    95		TESTL	$(1<<8), BX // BMI2
    96		SETNE	runtime·support_bmi2(SB)
    97	
    98		TESTL	$(1<<9), BX // ERMS
    99		SETNE	runtime·support_erms(SB)
   100	
   101	osavx:
   102		CMPB	runtime·support_osxsave(SB), $1
   103		JNE	noavx
   104		MOVL	$0, CX
   105		// For XGETBV, OSXSAVE bit is required and sufficient
   106		XGETBV
   107		ANDL	$6, AX
   108		CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   109		JE nocpuinfo
   110	noavx:
   111		MOVB $0, runtime·support_avx(SB)
   112		MOVB $0, runtime·support_avx2(SB)
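// XGETBV with ECX=0 reads XCR0. Bit 1 of XCR0 covers SSE (XMM) state and bit 2
// covers AVX (YMM) state, so ANDing the result with 6 and comparing against 6
// verifies that the OS saves and restores both register sets on context
// switches; only then do the AVX/AVX2 flags set above remain true.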
   113	
   114	nocpuinfo:
   115		// if there is an _cgo_init, call it.
   116		MOVQ	_cgo_init(SB), AX
   117		TESTQ	AX, AX
   118		JZ	needtls
   119		// g0 already in DI
   120		MOVQ	DI, CX	// Win64 uses CX for first parameter
   121		MOVQ	$setg_gcc<>(SB), SI
   122		CALL	AX
   123	
   124		// update stackguard after _cgo_init
   125		MOVQ	$runtime·g0(SB), CX
   126		MOVQ	(g_stack+stack_lo)(CX), AX
   127		ADDQ	$const__StackGuard, AX
   128		MOVQ	AX, g_stackguard0(CX)
   129		MOVQ	AX, g_stackguard1(CX)
   130	
   131	#ifndef GOOS_windows
   132		JMP ok
   133	#endif
   134	needtls:
   135	#ifdef GOOS_plan9
   136		// skip TLS setup on Plan 9
   137		JMP ok
   138	#endif
   139	#ifdef GOOS_solaris
   140		// skip TLS setup on Solaris
   141		JMP ok
   142	#endif
   143	
   144		LEAQ	runtime·m0+m_tls(SB), DI
   145		CALL	runtime·settls(SB)
   146	
   147		// store through it, to make sure it works
   148		get_tls(BX)
   149		MOVQ	$0x123, g(BX)
   150		MOVQ	runtime·m0+m_tls(SB), AX
   151		CMPQ	AX, $0x123
   152		JEQ 2(PC)
   153		MOVL	AX, 0	// abort
   154	ok:
   155		// set the per-goroutine and per-mach "registers"
   156		get_tls(BX)
   157		LEAQ	runtime·g0(SB), CX
   158		MOVQ	CX, g(BX)
   159		LEAQ	runtime·m0(SB), AX
   160	
   161		// save m->g0 = g0
   162		MOVQ	CX, m_g0(AX)
   163		// save m0 to g0->m
   164		MOVQ	AX, g_m(CX)
   165	
   166		CLD				// convention is D is always left cleared
   167		CALL	runtime·check(SB)
   168	
   169		MOVL	16(SP), AX		// copy argc
   170		MOVL	AX, 0(SP)
   171		MOVQ	24(SP), AX		// copy argv
   172		MOVQ	AX, 8(SP)
   173		CALL	runtime·args(SB)
   174		CALL	runtime·osinit(SB)
   175		CALL	runtime·schedinit(SB)
   176	
   177		// create a new goroutine to start program
   178		MOVQ	$runtime·mainPC(SB), AX		// entry
   179		PUSHQ	AX
   180		PUSHQ	$0			// arg size
   181		CALL	runtime·newproc(SB)
   182		POPQ	AX
   183		POPQ	AX
   184	
   185		// start this M
   186		CALL	runtime·mstart(SB)
   187	
   188		MOVL	$0xf1, 0xf1  // crash
   189		RET
   190	
   191	DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   192	GLOBL	runtime·mainPC(SB),RODATA,$8
   193	
   194	TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   195		BYTE	$0xcc
   196		RET
   197	
   198	TEXT runtime·asminit(SB),NOSPLIT,$0-0
   199		// No per-thread init.
   200		RET
   201	
   202	/*
   203	 *  go-routine
   204	 */
   205	
   206	// void gosave(Gobuf*)
   207	// save state in Gobuf; setjmp
   208	TEXT runtime·gosave(SB), NOSPLIT, $0-8
   209		MOVQ	buf+0(FP), AX		// gobuf
   210		LEAQ	buf+0(FP), BX		// caller's SP
   211		MOVQ	BX, gobuf_sp(AX)
   212		MOVQ	0(SP), BX		// caller's PC
   213		MOVQ	BX, gobuf_pc(AX)
   214		MOVQ	$0, gobuf_ret(AX)
   215		MOVQ	BP, gobuf_bp(AX)
   216		// Assert ctxt is zero. See func save.
   217		MOVQ	gobuf_ctxt(AX), BX
   218		TESTQ	BX, BX
   219		JZ	2(PC)
   220		CALL	runtime·badctxt(SB)
   221		get_tls(CX)
   222		MOVQ	g(CX), BX
   223		MOVQ	BX, gobuf_g(AX)
   224		RET
   225	
   226	// void gogo(Gobuf*)
   227	// restore state from Gobuf; longjmp
   228	TEXT runtime·gogo(SB), NOSPLIT, $16-8
   229		MOVQ	buf+0(FP), BX		// gobuf
   230	
   231		// If ctxt is not nil, invoke deletion barrier before overwriting.
   232		MOVQ	gobuf_ctxt(BX), AX
   233		TESTQ	AX, AX
   234		JZ	nilctxt
   235		LEAQ	gobuf_ctxt(BX), AX
   236		MOVQ	AX, 0(SP)
   237		MOVQ	$0, 8(SP)
   238		CALL	runtime·writebarrierptr_prewrite(SB)
   239		MOVQ	buf+0(FP), BX
   240	
   241	nilctxt:
   242		MOVQ	gobuf_g(BX), DX
   243		MOVQ	0(DX), CX		// make sure g != nil
   244		get_tls(CX)
   245		MOVQ	DX, g(CX)
   246		MOVQ	gobuf_sp(BX), SP	// restore SP
   247		MOVQ	gobuf_ret(BX), AX
   248		MOVQ	gobuf_ctxt(BX), DX
   249		MOVQ	gobuf_bp(BX), BP
   250		MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   251		MOVQ	$0, gobuf_ret(BX)
   252		MOVQ	$0, gobuf_ctxt(BX)
   253		MOVQ	$0, gobuf_bp(BX)
   254		MOVQ	gobuf_pc(BX), BX
   255		JMP	BX
   256	
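// For reference, gosave and gogo above manipulate the Gobuf fields by their
// assembly offsets; on the Go side the buffer looks roughly like the following
// sketch (see runtime2.go for the authoritative definition):
//
//	type gobuf struct {
//		sp   uintptr        // saved stack pointer
//		pc   uintptr        // saved program counter
//		g    guintptr       // goroutine the buffer belongs to
//		ctxt unsafe.Pointer // closure context; asserted zero in gosave
//		ret  uintptr        // return value slot
//		lr   uintptr        // link register, unused on amd64
//		bp   uintptr        // saved frame pointer
//	}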
   257	// func mcall(fn func(*g))
   258	// Switch to m->g0's stack, call fn(g).
   259	// Fn must never return. It should gogo(&g->sched)
   260	// to keep running g.
   261	TEXT runtime·mcall(SB), NOSPLIT, $0-8
   262		MOVQ	fn+0(FP), DI
   263		
   264		get_tls(CX)
   265		MOVQ	g(CX), AX	// save state in g->sched
   266		MOVQ	0(SP), BX	// caller's PC
   267		MOVQ	BX, (g_sched+gobuf_pc)(AX)
   268		LEAQ	fn+0(FP), BX	// caller's SP
   269		MOVQ	BX, (g_sched+gobuf_sp)(AX)
   270		MOVQ	AX, (g_sched+gobuf_g)(AX)
   271		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   272	
   273		// switch to m->g0 & its stack, call fn
   274		MOVQ	g(CX), BX
   275		MOVQ	g_m(BX), BX
   276		MOVQ	m_g0(BX), SI
   277		CMPQ	SI, AX	// if g == m->g0 call badmcall
   278		JNE	3(PC)
   279		MOVQ	$runtime·badmcall(SB), AX
   280		JMP	AX
   281		MOVQ	SI, g(CX)	// g = m->g0
   282		MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   283		PUSHQ	AX
   284		MOVQ	DI, DX
   285		MOVQ	0(DI), DI
   286		CALL	DI
   287		POPQ	AX
   288		MOVQ	$runtime·badmcall2(SB), AX
   289		JMP	AX
   290		RET
   291	
   292	// systemstack_switch is a dummy routine that systemstack leaves at the bottom
   293	// of the G stack. We need to distinguish the routine that
   294	// lives at the bottom of the G stack from the one that lives
   295	// at the top of the system stack because the one at the top of
   296	// the system stack terminates the stack walk (see topofstack()).
   297	TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   298		RET
   299	
   300	// func systemstack(fn func())
   301	TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   302		MOVQ	fn+0(FP), DI	// DI = fn
   303		get_tls(CX)
   304		MOVQ	g(CX), AX	// AX = g
   305		MOVQ	g_m(AX), BX	// BX = m
   306	
   307		MOVQ	m_gsignal(BX), DX	// DX = gsignal
   308		CMPQ	AX, DX
   309		JEQ	noswitch
   310	
   311		MOVQ	m_g0(BX), DX	// DX = g0
   312		CMPQ	AX, DX
   313		JEQ	noswitch
   314	
   315		MOVQ	m_curg(BX), R8
   316		CMPQ	AX, R8
   317		JEQ	switch
   318		
   319		// Bad: g is not gsignal, not g0, not curg. What is it?
   320		MOVQ	$runtime·badsystemstack(SB), AX
   321		CALL	AX
   322	
   323	switch:
   324		// save our state in g->sched. Pretend to
   325		// be systemstack_switch if the G stack is scanned.
   326		MOVQ	$runtime·systemstack_switch(SB), SI
   327		MOVQ	SI, (g_sched+gobuf_pc)(AX)
   328		MOVQ	SP, (g_sched+gobuf_sp)(AX)
   329		MOVQ	AX, (g_sched+gobuf_g)(AX)
   330		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   331	
   332		// switch to g0
   333		MOVQ	DX, g(CX)
   334		MOVQ	(g_sched+gobuf_sp)(DX), BX
   335		// make it look like mstart called systemstack on g0, to stop traceback
   336		SUBQ	$8, BX
   337		MOVQ	$runtime·mstart(SB), DX
   338		MOVQ	DX, 0(BX)
   339		MOVQ	BX, SP
   340	
   341		// call target function
   342		MOVQ	DI, DX
   343		MOVQ	0(DI), DI
   344		CALL	DI
   345	
   346		// switch back to g
   347		get_tls(CX)
   348		MOVQ	g(CX), AX
   349		MOVQ	g_m(AX), BX
   350		MOVQ	m_curg(BX), AX
   351		MOVQ	AX, g(CX)
   352		MOVQ	(g_sched+gobuf_sp)(AX), SP
   353		MOVQ	$0, (g_sched+gobuf_sp)(AX)
   354		RET
   355	
   356	noswitch:
   357		// already on m stack, just call directly
   358		MOVQ	DI, DX
   359		MOVQ	0(DI), DI
   360		CALL	DI
   361		RET
   362	
   363	/*
   364	 * support for morestack
   365	 */
   366	
   367	// Called during function prolog when more stack is needed.
   368	//
   369	// The traceback routines see morestack on a g0 as being
   370	// the top of a stack (for example, morestack calling newstack
   371	// calling the scheduler calling newm calling gc), so we must
   372	// record an argument size. For that purpose, it has no arguments.
   373	TEXT runtime·morestack(SB),NOSPLIT,$0-0
   374		// Cannot grow scheduler stack (m->g0).
   375		get_tls(CX)
   376		MOVQ	g(CX), BX
   377		MOVQ	g_m(BX), BX
   378		MOVQ	m_g0(BX), SI
   379		CMPQ	g(CX), SI
   380		JNE	3(PC)
   381		CALL	runtime·badmorestackg0(SB)
   382		INT	$3
   383	
   384		// Cannot grow signal stack (m->gsignal).
   385		MOVQ	m_gsignal(BX), SI
   386		CMPQ	g(CX), SI
   387		JNE	3(PC)
   388		CALL	runtime·badmorestackgsignal(SB)
   389		INT	$3
   390	
   391		// Called from f.
   392		// Set m->morebuf to f's caller.
   393		MOVQ	8(SP), AX	// f's caller's PC
   394		MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   395		LEAQ	16(SP), AX	// f's caller's SP
   396		MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   397		get_tls(CX)
   398		MOVQ	g(CX), SI
   399		MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   400	
   401		// Set g->sched to context in f.
   402		MOVQ	0(SP), AX // f's PC
   403		MOVQ	AX, (g_sched+gobuf_pc)(SI)
   404		MOVQ	SI, (g_sched+gobuf_g)(SI)
   405		LEAQ	8(SP), AX // f's SP
   406		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   407		MOVQ	BP, (g_sched+gobuf_bp)(SI)
   408		// newstack will fill gobuf.ctxt.
   409	
   410		// Call newstack on m->g0's stack.
   411		MOVQ	m_g0(BX), BX
   412		MOVQ	BX, g(CX)
   413		MOVQ	(g_sched+gobuf_sp)(BX), SP
   414		PUSHQ	DX	// ctxt argument
   415		CALL	runtime·newstack(SB)
   416		MOVQ	$0, 0x1003	// crash if newstack returns
   417		POPQ	DX	// keep balance check happy
   418		RET
   419	
   420	// morestack but not preserving ctxt.
   421	TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   422		MOVL	$0, DX
   423		JMP	runtime·morestack(SB)
   424	
   425	// reflectcall: call a function with the given argument list
   426	// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    427	// We don't have variable-sized frames, so we use a small number
   428	// of constant-sized-frame functions to encode a few bits of size in the pc.
   429	// Caution: ugly multiline assembly macros in your future!
   430	
   431	#define DISPATCH(NAME,MAXSIZE)		\
   432		CMPQ	CX, $MAXSIZE;		\
   433		JA	3(PC);			\
   434		MOVQ	$NAME(SB), AX;		\
   435		JMP	AX
   436	// Note: can't just "JMP NAME(SB)" - bad inlining results.
   437	
   438	TEXT reflect·call(SB), NOSPLIT, $0-0
   439		JMP	·reflectcall(SB)
   440	
   441	TEXT ·reflectcall(SB), NOSPLIT, $0-32
   442		MOVLQZX argsize+24(FP), CX
   443		DISPATCH(runtime·call32, 32)
   444		DISPATCH(runtime·call64, 64)
   445		DISPATCH(runtime·call128, 128)
   446		DISPATCH(runtime·call256, 256)
   447		DISPATCH(runtime·call512, 512)
   448		DISPATCH(runtime·call1024, 1024)
   449		DISPATCH(runtime·call2048, 2048)
   450		DISPATCH(runtime·call4096, 4096)
   451		DISPATCH(runtime·call8192, 8192)
   452		DISPATCH(runtime·call16384, 16384)
   453		DISPATCH(runtime·call32768, 32768)
   454		DISPATCH(runtime·call65536, 65536)
   455		DISPATCH(runtime·call131072, 131072)
   456		DISPATCH(runtime·call262144, 262144)
   457		DISPATCH(runtime·call524288, 524288)
   458		DISPATCH(runtime·call1048576, 1048576)
   459		DISPATCH(runtime·call2097152, 2097152)
   460		DISPATCH(runtime·call4194304, 4194304)
   461		DISPATCH(runtime·call8388608, 8388608)
   462		DISPATCH(runtime·call16777216, 16777216)
   463		DISPATCH(runtime·call33554432, 33554432)
   464		DISPATCH(runtime·call67108864, 67108864)
   465		DISPATCH(runtime·call134217728, 134217728)
   466		DISPATCH(runtime·call268435456, 268435456)
   467		DISPATCH(runtime·call536870912, 536870912)
   468		DISPATCH(runtime·call1073741824, 1073741824)
   469		MOVQ	$runtime·badreflectcall(SB), AX
   470		JMP	AX
   471	
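// Conceptually, the DISPATCH chain above behaves like the Go-level sketch
// below (illustrative only; the real dispatch stays in assembly so that the
// chosen callNN, with its fixed frame size, is what appears in the PC):
//
//	func reflectcall(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32) {
//		switch {
//		case argsize <= 32:
//			call32(argtype, f, arg, argsize, retoffset)
//		case argsize <= 64:
//			call64(argtype, f, arg, argsize, retoffset)
//		// ... doubling each step, up to 1<<30 ...
//		default:
//			badreflectcall()
//		}
//	}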
   472	#define CALLFN(NAME,MAXSIZE)			\
   473	TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   474		NO_LOCAL_POINTERS;			\
   475		/* copy arguments to stack */		\
   476		MOVQ	argptr+16(FP), SI;		\
   477		MOVLQZX argsize+24(FP), CX;		\
   478		MOVQ	SP, DI;				\
   479		REP;MOVSB;				\
   480		/* call function */			\
   481		MOVQ	f+8(FP), DX;			\
   482		PCDATA  $PCDATA_StackMapIndex, $0;	\
   483		CALL	(DX);				\
   484		/* copy return values back */		\
   485		MOVQ	argtype+0(FP), DX;		\
   486		MOVQ	argptr+16(FP), DI;		\
   487		MOVLQZX	argsize+24(FP), CX;		\
   488		MOVLQZX	retoffset+28(FP), BX;		\
   489		MOVQ	SP, SI;				\
   490		ADDQ	BX, DI;				\
   491		ADDQ	BX, SI;				\
   492		SUBQ	BX, CX;				\
   493		CALL	callRet<>(SB);			\
   494		RET
   495	
   496	// callRet copies return values back at the end of call*. This is a
   497	// separate function so it can allocate stack space for the arguments
   498	// to reflectcallmove. It does not follow the Go ABI; it expects its
   499	// arguments in registers.
   500	TEXT callRet<>(SB), NOSPLIT, $32-0
   501		NO_LOCAL_POINTERS
   502		MOVQ	DX, 0(SP)
   503		MOVQ	DI, 8(SP)
   504		MOVQ	SI, 16(SP)
   505		MOVQ	CX, 24(SP)
   506		CALL	runtime·reflectcallmove(SB)
   507		RET
   508	
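// The four words stored above (DX, DI, SI, CX) line up with the arguments of
// the Go-side helper, whose signature is approximately
//
//	func reflectcallmove(typ *_type, dst, src unsafe.Pointer, size uintptr)
//
// i.e. it copies the results from the callee frame back to argptr+retoffset
// with the write barriers the garbage collector requires.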
   509	CALLFN(·call32, 32)
   510	CALLFN(·call64, 64)
   511	CALLFN(·call128, 128)
   512	CALLFN(·call256, 256)
   513	CALLFN(·call512, 512)
   514	CALLFN(·call1024, 1024)
   515	CALLFN(·call2048, 2048)
   516	CALLFN(·call4096, 4096)
   517	CALLFN(·call8192, 8192)
   518	CALLFN(·call16384, 16384)
   519	CALLFN(·call32768, 32768)
   520	CALLFN(·call65536, 65536)
   521	CALLFN(·call131072, 131072)
   522	CALLFN(·call262144, 262144)
   523	CALLFN(·call524288, 524288)
   524	CALLFN(·call1048576, 1048576)
   525	CALLFN(·call2097152, 2097152)
   526	CALLFN(·call4194304, 4194304)
   527	CALLFN(·call8388608, 8388608)
   528	CALLFN(·call16777216, 16777216)
   529	CALLFN(·call33554432, 33554432)
   530	CALLFN(·call67108864, 67108864)
   531	CALLFN(·call134217728, 134217728)
   532	CALLFN(·call268435456, 268435456)
   533	CALLFN(·call536870912, 536870912)
   534	CALLFN(·call1073741824, 1073741824)
   535	
   536	TEXT runtime·procyield(SB),NOSPLIT,$0-0
   537		MOVL	cycles+0(FP), AX
   538	again:
   539		PAUSE
   540		SUBL	$1, AX
   541		JNZ	again
   542		RET
   543	
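// On the Go side procyield is declared simply as
//
//	func procyield(cycles uint32)
//
// Each loop iteration above issues one PAUSE hint, so callers (for example
// spinning lock paths) ask for a small fixed number of PAUSEs before
// re-checking a condition. A sketch of typical use, with illustrative names
// only:
//
//	for i := 0; i < spins; i++ {
//		if tryAcquire() {
//			return
//		}
//		procyield(activeSpinCount)
//	}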
   544	
   545	TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   546		// Stores are already ordered on x86, so this is just a
   547		// compile barrier.
   548		RET
   549	
   550	// void jmpdefer(fn, sp);
   551	// called from deferreturn.
   552	// 1. pop the caller
    553	// 2. sub 5 bytes from the caller's return address
   554	// 3. jmp to the argument
   555	TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   556		MOVQ	fv+0(FP), DX	// fn
   557		MOVQ	argp+8(FP), BX	// caller sp
   558		LEAQ	-8(BX), SP	// caller sp after CALL
   559		MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   560		SUBQ	$5, (SP)	// return to CALL again
   561		MOVQ	0(DX), BX
   562		JMP	BX	// but first run the deferred function
   563	
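// Why 5 bytes: the compiler emits the call to deferreturn as a direct
// CALL rel32 (opcode 0xE8 plus a 4-byte displacement), which is 5 bytes long.
// Rewinding the saved return address by 5 therefore points it back at that
// CALL, so when the deferred function returns, deferreturn runs again and
// processes the next deferred call. Roughly:
//
//	e8 xx xx xx xx    CALL runtime·deferreturn(SB)   // 5-byte call
//	                  // the original return address pointed just past the CALL;
//	                  // after SUBQ $5, (SP) it points at the CALL again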
   564	// Save state of caller into g->sched. Smashes R8, R9.
   565	TEXT gosave<>(SB),NOSPLIT,$0
   566		get_tls(R8)
   567		MOVQ	g(R8), R8
   568		MOVQ	0(SP), R9
   569		MOVQ	R9, (g_sched+gobuf_pc)(R8)
   570		LEAQ	8(SP), R9
   571		MOVQ	R9, (g_sched+gobuf_sp)(R8)
   572		MOVQ	$0, (g_sched+gobuf_ret)(R8)
   573		MOVQ	BP, (g_sched+gobuf_bp)(R8)
   574		// Assert ctxt is zero. See func save.
   575		MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   576		TESTQ	R9, R9
   577		JZ	2(PC)
   578		CALL	runtime·badctxt(SB)
   579		RET
   580	
   581	// func asmcgocall(fn, arg unsafe.Pointer) int32
   582	// Call fn(arg) on the scheduler stack,
   583	// aligned appropriately for the gcc ABI.
   584	// See cgocall.go for more details.
   585	TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   586		MOVQ	fn+0(FP), AX
   587		MOVQ	arg+8(FP), BX
   588	
   589		MOVQ	SP, DX
   590	
   591		// Figure out if we need to switch to m->g0 stack.
   592		// We get called to create new OS threads too, and those
   593		// come in on the m->g0 stack already.
   594		get_tls(CX)
   595		MOVQ	g(CX), R8
   596		CMPQ	R8, $0
   597		JEQ	nosave
   598		MOVQ	g_m(R8), R8
   599		MOVQ	m_g0(R8), SI
   600		MOVQ	g(CX), DI
   601		CMPQ	SI, DI
   602		JEQ	nosave
   603		MOVQ	m_gsignal(R8), SI
   604		CMPQ	SI, DI
   605		JEQ	nosave
   606		
   607		// Switch to system stack.
   608		MOVQ	m_g0(R8), SI
   609		CALL	gosave<>(SB)
   610		MOVQ	SI, g(CX)
   611		MOVQ	(g_sched+gobuf_sp)(SI), SP
   612	
   613		// Now on a scheduling stack (a pthread-created stack).
   614		// Make sure we have enough room for 4 stack-backed fast-call
   615		// registers as per windows amd64 calling convention.
   616		SUBQ	$64, SP
   617		ANDQ	$~15, SP	// alignment for gcc ABI
   618		MOVQ	DI, 48(SP)	// save g
   619		MOVQ	(g_stack+stack_hi)(DI), DI
   620		SUBQ	DX, DI
   621		MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   622		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   623		MOVQ	BX, CX		// CX = first argument in Win64
   624		CALL	AX
   625	
   626		// Restore registers, g, stack pointer.
   627		get_tls(CX)
   628		MOVQ	48(SP), DI
   629		MOVQ	(g_stack+stack_hi)(DI), SI
   630		SUBQ	40(SP), SI
   631		MOVQ	DI, g(CX)
   632		MOVQ	SI, SP
   633	
   634		MOVL	AX, ret+16(FP)
   635		RET
   636	
   637	nosave:
   638		// Running on a system stack, perhaps even without a g.
   639		// Having no g can happen during thread creation or thread teardown
   640		// (see needm/dropm on Solaris, for example).
   641		// This code is like the above sequence but without saving/restoring g
   642		// and without worrying about the stack moving out from under us
   643		// (because we're on a system stack, not a goroutine stack).
   644		// The above code could be used directly if already on a system stack,
   645		// but then the only path through this code would be a rare case on Solaris.
   646		// Using this code for all "already on system stack" calls exercises it more,
   647		// which should help keep it correct.
   648		SUBQ	$64, SP
   649		ANDQ	$~15, SP
   650		MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   651		MOVQ	DX, 40(SP)	// save original stack pointer
   652		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   653		MOVQ	BX, CX		// CX = first argument in Win64
   654		CALL	AX
   655		MOVQ	40(SP), SI	// restore original stack pointer
   656		MOVQ	SI, SP
   657		MOVL	AX, ret+16(FP)
   658		RET
   659	
   660	// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   661	// Turn the fn into a Go func (by taking its address) and call
   662	// cgocallback_gofunc.
   663	TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   664		LEAQ	fn+0(FP), AX
   665		MOVQ	AX, 0(SP)
   666		MOVQ	frame+8(FP), AX
   667		MOVQ	AX, 8(SP)
   668		MOVQ	framesize+16(FP), AX
   669		MOVQ	AX, 16(SP)
   670		MOVQ	ctxt+24(FP), AX
   671		MOVQ	AX, 24(SP)
   672		MOVQ	$runtime·cgocallback_gofunc(SB), AX
   673		CALL	AX
   674		RET
   675	
   676	// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   677	// See cgocall.go for more details.
   678	TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   679		NO_LOCAL_POINTERS
   680	
   681		// If g is nil, Go did not create the current thread.
   682		// Call needm to obtain one m for temporary use.
   683		// In this case, we're running on the thread stack, so there's
   684		// lots of space, but the linker doesn't know. Hide the call from
   685		// the linker analysis by using an indirect call through AX.
   686		get_tls(CX)
   687	#ifdef GOOS_windows
   688		MOVL	$0, BX
   689		CMPQ	CX, $0
   690		JEQ	2(PC)
   691	#endif
   692		MOVQ	g(CX), BX
   693		CMPQ	BX, $0
   694		JEQ	needm
   695		MOVQ	g_m(BX), BX
   696		MOVQ	BX, R8 // holds oldm until end of function
   697		JMP	havem
   698	needm:
   699		MOVQ	$0, 0(SP)
   700		MOVQ	$runtime·needm(SB), AX
   701		CALL	AX
   702		MOVQ	0(SP), R8
   703		get_tls(CX)
   704		MOVQ	g(CX), BX
   705		MOVQ	g_m(BX), BX
   706		
   707		// Set m->sched.sp = SP, so that if a panic happens
   708		// during the function we are about to execute, it will
   709		// have a valid SP to run on the g0 stack.
   710		// The next few lines (after the havem label)
   711		// will save this SP onto the stack and then write
   712		// the same SP back to m->sched.sp. That seems redundant,
   713		// but if an unrecovered panic happens, unwindm will
   714		// restore the g->sched.sp from the stack location
   715		// and then systemstack will try to use it. If we don't set it here,
   716		// that restored SP will be uninitialized (typically 0) and
   717		// will not be usable.
   718		MOVQ	m_g0(BX), SI
   719		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   720	
   721	havem:
   722		// Now there's a valid m, and we're running on its m->g0.
   723		// Save current m->g0->sched.sp on stack and then set it to SP.
   724		// Save current sp in m->g0->sched.sp in preparation for
   725		// switch back to m->curg stack.
   726		// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   727		MOVQ	m_g0(BX), SI
   728		MOVQ	(g_sched+gobuf_sp)(SI), AX
   729		MOVQ	AX, 0(SP)
   730		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   731	
   732		// Switch to m->curg stack and call runtime.cgocallbackg.
   733		// Because we are taking over the execution of m->curg
   734		// but *not* resuming what had been running, we need to
   735		// save that information (m->curg->sched) so we can restore it.
   736		// We can restore m->curg->sched.sp easily, because calling
   737		// runtime.cgocallbackg leaves SP unchanged upon return.
   738		// To save m->curg->sched.pc, we push it onto the stack.
   739		// This has the added benefit that it looks to the traceback
   740		// routine like cgocallbackg is going to return to that
   741		// PC (because the frame we allocate below has the same
   742		// size as cgocallback_gofunc's frame declared above)
   743		// so that the traceback will seamlessly trace back into
   744		// the earlier calls.
   745		//
   746		// In the new goroutine, 8(SP) holds the saved R8.
   747		MOVQ	m_curg(BX), SI
   748		MOVQ	SI, g(CX)
   749		MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   750		MOVQ	(g_sched+gobuf_pc)(SI), BX
   751		MOVQ	BX, -8(DI)
   752		// Compute the size of the frame, including return PC and, if
   753		// GOEXPERIMENT=framepointer, the saved base pointer
   754		MOVQ	ctxt+24(FP), BX
   755		LEAQ	fv+0(FP), AX
   756		SUBQ	SP, AX
   757		SUBQ	AX, DI
   758		MOVQ	DI, SP
   759	
   760		MOVQ	R8, 8(SP)
   761		MOVQ	BX, 0(SP)
   762		CALL	runtime·cgocallbackg(SB)
   763		MOVQ	8(SP), R8
   764	
   765		// Compute the size of the frame again. FP and SP have
   766		// completely different values here than they did above,
   767		// but only their difference matters.
   768		LEAQ	fv+0(FP), AX
   769		SUBQ	SP, AX
   770	
   771		// Restore g->sched (== m->curg->sched) from saved values.
   772		get_tls(CX)
   773		MOVQ	g(CX), SI
   774		MOVQ	SP, DI
   775		ADDQ	AX, DI
   776		MOVQ	-8(DI), BX
   777		MOVQ	BX, (g_sched+gobuf_pc)(SI)
   778		MOVQ	DI, (g_sched+gobuf_sp)(SI)
   779	
   780		// Switch back to m->g0's stack and restore m->g0->sched.sp.
   781		// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   782		// so we do not have to restore it.)
   783		MOVQ	g(CX), BX
   784		MOVQ	g_m(BX), BX
   785		MOVQ	m_g0(BX), SI
   786		MOVQ	SI, g(CX)
   787		MOVQ	(g_sched+gobuf_sp)(SI), SP
   788		MOVQ	0(SP), AX
   789		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   790		
   791		// If the m on entry was nil, we called needm above to borrow an m
   792		// for the duration of the call. Since the call is over, return it with dropm.
   793		CMPQ	R8, $0
   794		JNE 3(PC)
   795		MOVQ	$runtime·dropm(SB), AX
   796		CALL	AX
   797	
   798		// Done!
   799		RET
   800	
   801	// void setg(G*); set g. for use by needm.
   802	TEXT runtime·setg(SB), NOSPLIT, $0-8
   803		MOVQ	gg+0(FP), BX
   804	#ifdef GOOS_windows
   805		CMPQ	BX, $0
   806		JNE	settls
   807		MOVQ	$0, 0x28(GS)
   808		RET
   809	settls:
   810		MOVQ	g_m(BX), AX
   811		LEAQ	m_tls(AX), AX
   812		MOVQ	AX, 0x28(GS)
   813	#endif
   814		get_tls(CX)
   815		MOVQ	BX, g(CX)
   816		RET
   817	
   818	// void setg_gcc(G*); set g called from gcc.
   819	TEXT setg_gcc<>(SB),NOSPLIT,$0
   820		get_tls(AX)
   821		MOVQ	DI, g(AX)
   822		RET
   823	
   824	// check that SP is in range [g->stack.lo, g->stack.hi)
   825	TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   826		get_tls(CX)
   827		MOVQ	g(CX), AX
   828		CMPQ	(g_stack+stack_hi)(AX), SP
   829		JHI	2(PC)
   830		INT	$3
   831		CMPQ	SP, (g_stack+stack_lo)(AX)
   832		JHI	2(PC)
   833		INT	$3
   834		RET
   835	
   836	TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   837		MOVQ	argp+0(FP),AX		// addr of first arg
   838		MOVQ	-8(AX),AX		// get calling pc
   839		MOVQ	AX, ret+8(FP)
   840		RET
   841	
   842	// func cputicks() int64
   843	TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   844		CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   845		JNE	mfence
   846		LFENCE
   847		JMP	done
   848	mfence:
   849		MFENCE
   850	done:
   851		RDTSC
   852		SHLQ	$32, DX
   853		ADDQ	DX, AX
   854		MOVQ	AX, ret+0(FP)
   855		RET
   856	
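// RDTSC returns the 64-bit timestamp counter split across DX:AX (high 32 bits
// in DX, low 32 bits in AX). The SHLQ/ADDQ pair above is the assembly form of,
// roughly,
//
//	ticks := int64(uint64(hi)<<32 | uint64(lo))
//
// which is then stored as the int64 result of cputicks.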
   857	// memhash_varlen(p unsafe.Pointer, h seed) uintptr
   858	// redirects to memhash(p, h, size) using the size
   859	// stored in the closure.
   860	TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   861		GO_ARGS
   862		NO_LOCAL_POINTERS
   863		MOVQ	p+0(FP), AX
   864		MOVQ	h+8(FP), BX
   865		MOVQ	8(DX), CX
   866		MOVQ	AX, 0(SP)
   867		MOVQ	BX, 8(SP)
   868		MOVQ	CX, 16(SP)
   869		CALL	runtime·memhash(SB)
   870		MOVQ	24(SP), AX
   871		MOVQ	AX, ret+16(FP)
   872		RET
   873	
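// The size comes from the closure: the compiler calls memhash_varlen through a
// funcval whose pointer arrives in DX, and for these size-specialized hash
// closures the element size sits in the word after the code pointer.
// Conceptually (a sketch, not the literal compiler-generated layout):
//
//	type varlenClosure struct {
//		fn   uintptr // code pointer
//		size uintptr // element size, read by MOVQ 8(DX), CX above
//	}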
   874	// hash function using AES hardware instructions
   875	TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   876		MOVQ	p+0(FP), AX	// ptr to data
   877		MOVQ	s+16(FP), CX	// size
   878		LEAQ	ret+24(FP), DX
   879		JMP	runtime·aeshashbody(SB)
   880	
   881	TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   882		MOVQ	p+0(FP), AX	// ptr to string struct
   883		MOVQ	8(AX), CX	// length of string
   884		MOVQ	(AX), AX	// string data
   885		LEAQ	ret+16(FP), DX
   886		JMP	runtime·aeshashbody(SB)
   887	
   888	// AX: data
   889	// CX: length
   890	// DX: address to put return value
   891	TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   892		// Fill an SSE register with our seeds.
   893		MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   894		PINSRW	$4, CX, X0			// 16 bits of length
   895		PSHUFHW $0, X0, X0			// repeat length 4 times total
   896		MOVO	X0, X1				// save unscrambled seed
   897		PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   898		AESENC	X0, X0				// scramble seed
   899	
   900		CMPQ	CX, $16
   901		JB	aes0to15
   902		JE	aes16
   903		CMPQ	CX, $32
   904		JBE	aes17to32
   905		CMPQ	CX, $64
   906		JBE	aes33to64
   907		CMPQ	CX, $128
   908		JBE	aes65to128
   909		JMP	aes129plus
   910	
   911	aes0to15:
   912		TESTQ	CX, CX
   913		JE	aes0
   914	
   915		ADDQ	$16, AX
   916		TESTW	$0xff0, AX
   917		JE	endofpage
   918	
   919		// 16 bytes loaded at this address won't cross
   920		// a page boundary, so we can load it directly.
   921		MOVOU	-16(AX), X1
   922		ADDQ	CX, CX
   923		MOVQ	$masks<>(SB), AX
   924		PAND	(AX)(CX*8), X1
   925	final1:
   926		PXOR	X0, X1	// xor data with seed
   927		AESENC	X1, X1	// scramble combo 3 times
   928		AESENC	X1, X1
   929		AESENC	X1, X1
   930		MOVQ	X1, (DX)
   931		RET
   932	
   933	endofpage:
   934		// address ends in 1111xxxx. Might be up against
   935		// a page boundary, so load ending at last byte.
   936		// Then shift bytes down using pshufb.
   937		MOVOU	-32(AX)(CX*1), X1
   938		ADDQ	CX, CX
   939		MOVQ	$shifts<>(SB), AX
   940		PSHUFB	(AX)(CX*8), X1
   941		JMP	final1
   942	
   943	aes0:
   944		// Return scrambled input seed
   945		AESENC	X0, X0
   946		MOVQ	X0, (DX)
   947		RET
   948	
   949	aes16:
   950		MOVOU	(AX), X1
   951		JMP	final1
   952	
   953	aes17to32:
   954		// make second starting seed
   955		PXOR	runtime·aeskeysched+16(SB), X1
   956		AESENC	X1, X1
   957		
   958		// load data to be hashed
   959		MOVOU	(AX), X2
   960		MOVOU	-16(AX)(CX*1), X3
   961	
   962		// xor with seed
   963		PXOR	X0, X2
   964		PXOR	X1, X3
   965	
   966		// scramble 3 times
   967		AESENC	X2, X2
   968		AESENC	X3, X3
   969		AESENC	X2, X2
   970		AESENC	X3, X3
   971		AESENC	X2, X2
   972		AESENC	X3, X3
   973	
   974		// combine results
   975		PXOR	X3, X2
   976		MOVQ	X2, (DX)
   977		RET
   978	
   979	aes33to64:
   980		// make 3 more starting seeds
   981		MOVO	X1, X2
   982		MOVO	X1, X3
   983		PXOR	runtime·aeskeysched+16(SB), X1
   984		PXOR	runtime·aeskeysched+32(SB), X2
   985		PXOR	runtime·aeskeysched+48(SB), X3
   986		AESENC	X1, X1
   987		AESENC	X2, X2
   988		AESENC	X3, X3
   989		
   990		MOVOU	(AX), X4
   991		MOVOU	16(AX), X5
   992		MOVOU	-32(AX)(CX*1), X6
   993		MOVOU	-16(AX)(CX*1), X7
   994	
   995		PXOR	X0, X4
   996		PXOR	X1, X5
   997		PXOR	X2, X6
   998		PXOR	X3, X7
   999		
  1000		AESENC	X4, X4
  1001		AESENC	X5, X5
  1002		AESENC	X6, X6
  1003		AESENC	X7, X7
  1004		
  1005		AESENC	X4, X4
  1006		AESENC	X5, X5
  1007		AESENC	X6, X6
  1008		AESENC	X7, X7
  1009		
  1010		AESENC	X4, X4
  1011		AESENC	X5, X5
  1012		AESENC	X6, X6
  1013		AESENC	X7, X7
  1014	
  1015		PXOR	X6, X4
  1016		PXOR	X7, X5
  1017		PXOR	X5, X4
  1018		MOVQ	X4, (DX)
  1019		RET
  1020	
  1021	aes65to128:
  1022		// make 7 more starting seeds
  1023		MOVO	X1, X2
  1024		MOVO	X1, X3
  1025		MOVO	X1, X4
  1026		MOVO	X1, X5
  1027		MOVO	X1, X6
  1028		MOVO	X1, X7
  1029		PXOR	runtime·aeskeysched+16(SB), X1
  1030		PXOR	runtime·aeskeysched+32(SB), X2
  1031		PXOR	runtime·aeskeysched+48(SB), X3
  1032		PXOR	runtime·aeskeysched+64(SB), X4
  1033		PXOR	runtime·aeskeysched+80(SB), X5
  1034		PXOR	runtime·aeskeysched+96(SB), X6
  1035		PXOR	runtime·aeskeysched+112(SB), X7
  1036		AESENC	X1, X1
  1037		AESENC	X2, X2
  1038		AESENC	X3, X3
  1039		AESENC	X4, X4
  1040		AESENC	X5, X5
  1041		AESENC	X6, X6
  1042		AESENC	X7, X7
  1043	
  1044		// load data
  1045		MOVOU	(AX), X8
  1046		MOVOU	16(AX), X9
  1047		MOVOU	32(AX), X10
  1048		MOVOU	48(AX), X11
  1049		MOVOU	-64(AX)(CX*1), X12
  1050		MOVOU	-48(AX)(CX*1), X13
  1051		MOVOU	-32(AX)(CX*1), X14
  1052		MOVOU	-16(AX)(CX*1), X15
  1053	
  1054		// xor with seed
  1055		PXOR	X0, X8
  1056		PXOR	X1, X9
  1057		PXOR	X2, X10
  1058		PXOR	X3, X11
  1059		PXOR	X4, X12
  1060		PXOR	X5, X13
  1061		PXOR	X6, X14
  1062		PXOR	X7, X15
  1063	
  1064		// scramble 3 times
  1065		AESENC	X8, X8
  1066		AESENC	X9, X9
  1067		AESENC	X10, X10
  1068		AESENC	X11, X11
  1069		AESENC	X12, X12
  1070		AESENC	X13, X13
  1071		AESENC	X14, X14
  1072		AESENC	X15, X15
  1073	
  1074		AESENC	X8, X8
  1075		AESENC	X9, X9
  1076		AESENC	X10, X10
  1077		AESENC	X11, X11
  1078		AESENC	X12, X12
  1079		AESENC	X13, X13
  1080		AESENC	X14, X14
  1081		AESENC	X15, X15
  1082	
  1083		AESENC	X8, X8
  1084		AESENC	X9, X9
  1085		AESENC	X10, X10
  1086		AESENC	X11, X11
  1087		AESENC	X12, X12
  1088		AESENC	X13, X13
  1089		AESENC	X14, X14
  1090		AESENC	X15, X15
  1091	
  1092		// combine results
  1093		PXOR	X12, X8
  1094		PXOR	X13, X9
  1095		PXOR	X14, X10
  1096		PXOR	X15, X11
  1097		PXOR	X10, X8
  1098		PXOR	X11, X9
  1099		PXOR	X9, X8
  1100		MOVQ	X8, (DX)
  1101		RET
  1102	
  1103	aes129plus:
  1104		// make 7 more starting seeds
  1105		MOVO	X1, X2
  1106		MOVO	X1, X3
  1107		MOVO	X1, X4
  1108		MOVO	X1, X5
  1109		MOVO	X1, X6
  1110		MOVO	X1, X7
  1111		PXOR	runtime·aeskeysched+16(SB), X1
  1112		PXOR	runtime·aeskeysched+32(SB), X2
  1113		PXOR	runtime·aeskeysched+48(SB), X3
  1114		PXOR	runtime·aeskeysched+64(SB), X4
  1115		PXOR	runtime·aeskeysched+80(SB), X5
  1116		PXOR	runtime·aeskeysched+96(SB), X6
  1117		PXOR	runtime·aeskeysched+112(SB), X7
  1118		AESENC	X1, X1
  1119		AESENC	X2, X2
  1120		AESENC	X3, X3
  1121		AESENC	X4, X4
  1122		AESENC	X5, X5
  1123		AESENC	X6, X6
  1124		AESENC	X7, X7
  1125		
  1126		// start with last (possibly overlapping) block
  1127		MOVOU	-128(AX)(CX*1), X8
  1128		MOVOU	-112(AX)(CX*1), X9
  1129		MOVOU	-96(AX)(CX*1), X10
  1130		MOVOU	-80(AX)(CX*1), X11
  1131		MOVOU	-64(AX)(CX*1), X12
  1132		MOVOU	-48(AX)(CX*1), X13
  1133		MOVOU	-32(AX)(CX*1), X14
  1134		MOVOU	-16(AX)(CX*1), X15
  1135	
  1136		// xor in seed
  1137		PXOR	X0, X8
  1138		PXOR	X1, X9
  1139		PXOR	X2, X10
  1140		PXOR	X3, X11
  1141		PXOR	X4, X12
  1142		PXOR	X5, X13
  1143		PXOR	X6, X14
  1144		PXOR	X7, X15
  1145		
  1146		// compute number of remaining 128-byte blocks
  1147		DECQ	CX
  1148		SHRQ	$7, CX
  1149		
  1150	aesloop:
  1151		// scramble state
  1152		AESENC	X8, X8
  1153		AESENC	X9, X9
  1154		AESENC	X10, X10
  1155		AESENC	X11, X11
  1156		AESENC	X12, X12
  1157		AESENC	X13, X13
  1158		AESENC	X14, X14
  1159		AESENC	X15, X15
  1160	
  1161		// scramble state, xor in a block
  1162		MOVOU	(AX), X0
  1163		MOVOU	16(AX), X1
  1164		MOVOU	32(AX), X2
  1165		MOVOU	48(AX), X3
  1166		AESENC	X0, X8
  1167		AESENC	X1, X9
  1168		AESENC	X2, X10
  1169		AESENC	X3, X11
  1170		MOVOU	64(AX), X4
  1171		MOVOU	80(AX), X5
  1172		MOVOU	96(AX), X6
  1173		MOVOU	112(AX), X7
  1174		AESENC	X4, X12
  1175		AESENC	X5, X13
  1176		AESENC	X6, X14
  1177		AESENC	X7, X15
  1178	
  1179		ADDQ	$128, AX
  1180		DECQ	CX
  1181		JNE	aesloop
  1182	
  1183		// 3 more scrambles to finish
  1184		AESENC	X8, X8
  1185		AESENC	X9, X9
  1186		AESENC	X10, X10
  1187		AESENC	X11, X11
  1188		AESENC	X12, X12
  1189		AESENC	X13, X13
  1190		AESENC	X14, X14
  1191		AESENC	X15, X15
  1192		AESENC	X8, X8
  1193		AESENC	X9, X9
  1194		AESENC	X10, X10
  1195		AESENC	X11, X11
  1196		AESENC	X12, X12
  1197		AESENC	X13, X13
  1198		AESENC	X14, X14
  1199		AESENC	X15, X15
  1200		AESENC	X8, X8
  1201		AESENC	X9, X9
  1202		AESENC	X10, X10
  1203		AESENC	X11, X11
  1204		AESENC	X12, X12
  1205		AESENC	X13, X13
  1206		AESENC	X14, X14
  1207		AESENC	X15, X15
  1208	
  1209		PXOR	X12, X8
  1210		PXOR	X13, X9
  1211		PXOR	X14, X10
  1212		PXOR	X15, X11
  1213		PXOR	X10, X8
  1214		PXOR	X11, X9
  1215		PXOR	X9, X8
  1216		MOVQ	X8, (DX)
  1217		RET
  1218		
  1219	TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1220		MOVQ	p+0(FP), AX	// ptr to data
  1221		MOVQ	h+8(FP), X0	// seed
  1222		PINSRD	$2, (AX), X0	// data
  1223		AESENC	runtime·aeskeysched+0(SB), X0
  1224		AESENC	runtime·aeskeysched+16(SB), X0
  1225		AESENC	runtime·aeskeysched+32(SB), X0
  1226		MOVQ	X0, ret+16(FP)
  1227		RET
  1228	
  1229	TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1230		MOVQ	p+0(FP), AX	// ptr to data
  1231		MOVQ	h+8(FP), X0	// seed
  1232		PINSRQ	$1, (AX), X0	// data
  1233		AESENC	runtime·aeskeysched+0(SB), X0
  1234		AESENC	runtime·aeskeysched+16(SB), X0
  1235		AESENC	runtime·aeskeysched+32(SB), X0
  1236		MOVQ	X0, ret+16(FP)
  1237		RET
  1238	
  1239	// simple mask to get rid of data in the high part of the register.
  1240	DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1241	DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1242	DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1243	DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1244	DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1245	DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1246	DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1247	DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1248	DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1249	DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1250	DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1251	DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1252	DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1253	DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1254	DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1255	DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1256	DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1257	DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1258	DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1259	DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1260	DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1261	DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1262	DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1263	DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1264	DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1265	DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1266	DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1267	DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1268	DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1269	DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1270	DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1271	DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1272	GLOBL masks<>(SB),RODATA,$256
  1273	
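// Worked example: for a partial load of n bytes (0 < n < 16), aeshashbody
// doubles n (ADDQ CX, CX) and indexes masks<> with (CX*8), i.e. byte offset
// 16*n. That entry has its low n bytes set to 0xff and the rest zero, so PAND
// keeps exactly the n valid bytes. For n = 3 the entry at masks<>+0x30 is
//
//	ff ff ff 00 00 00 00 00  00 00 00 00 00 00 00 00
//
// which zeroes everything past the first 3 bytes of the loaded block.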
  1274	TEXT ·checkASM(SB),NOSPLIT,$0-1
  1275		// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
  1276		MOVQ	$masks<>(SB), AX
  1277		MOVQ	$shifts<>(SB), BX
  1278		ORQ	BX, AX
  1279		TESTQ	$15, AX
  1280		SETEQ	ret+0(FP)
  1281		RET
  1282	
  1283	// these are arguments to pshufb. They move data down from
  1284	// the high bytes of the register to the low bytes of the register.
  1285	// index is how many bytes to move.
  1286	DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1287	DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1288	DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1289	DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1290	DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1291	DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1292	DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1293	DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1294	DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1295	DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1296	DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1297	DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1298	DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1299	DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1300	DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1301	DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1302	DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1303	DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1304	DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1305	DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1306	DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1307	DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1308	DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1309	DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1310	DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1311	DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1312	DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1313	DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1314	DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1315	DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1316	DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1317	DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1318	GLOBL shifts<>(SB),RODATA,$256
  1319	
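// Worked example: the same 16*n indexing selects a PSHUFB control that moves
// the last n bytes of the loaded block down to the bottom of the register and
// zeroes the rest (a selector byte with the high bit set, 0xff, produces 0).
// For n = 3 the entry at shifts<>+0x30 is
//
//	0d 0e 0f ff ff ff ff ff  ff ff ff ff ff ff ff ff
//
// so source bytes 13, 14, 15 land in result bytes 0, 1, 2.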
  1320	// memequal(p, q unsafe.Pointer, size uintptr) bool
  1321	TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1322		MOVQ	a+0(FP), SI
  1323		MOVQ	b+8(FP), DI
  1324		CMPQ	SI, DI
  1325		JEQ	eq
  1326		MOVQ	size+16(FP), BX
  1327		LEAQ	ret+24(FP), AX
  1328		JMP	runtime·memeqbody(SB)
  1329	eq:
  1330		MOVB	$1, ret+24(FP)
  1331		RET
  1332	
  1333	// memequal_varlen(a, b unsafe.Pointer) bool
  1334	TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1335		MOVQ	a+0(FP), SI
  1336		MOVQ	b+8(FP), DI
  1337		CMPQ	SI, DI
  1338		JEQ	eq
  1339		MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1340		LEAQ	ret+16(FP), AX
  1341		JMP	runtime·memeqbody(SB)
  1342	eq:
  1343		MOVB	$1, ret+16(FP)
  1344		RET
  1345	
  1346	// eqstring tests whether two strings are equal.
  1347	// The compiler guarantees that strings passed
  1348	// to eqstring have equal length.
  1349	// See runtime_test.go:eqstring_generic for
  1350	// equivalent Go code.
  1351	TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1352		MOVQ	s1_base+0(FP), SI
  1353		MOVQ	s2_base+16(FP), DI
  1354		CMPQ	SI, DI
  1355		JEQ	eq
  1356		MOVQ	s1_len+8(FP), BX
  1357		LEAQ	ret+32(FP), AX
  1358		JMP	runtime·memeqbody(SB)
  1359	eq:
  1360		MOVB	$1, ret+32(FP)
  1361		RET
  1362	
  1363	// a in SI
  1364	// b in DI
  1365	// count in BX
  1366	// address of result byte in AX
  1367	TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1368		CMPQ	BX, $8
  1369		JB	small
  1370		CMPQ	BX, $64
  1371		JB	bigloop
  1372		CMPB    runtime·support_avx2(SB), $1
  1373		JE	hugeloop_avx2
  1374		
  1375		// 64 bytes at a time using xmm registers
  1376	hugeloop:
  1377		CMPQ	BX, $64
  1378		JB	bigloop
  1379		MOVOU	(SI), X0
  1380		MOVOU	(DI), X1
  1381		MOVOU	16(SI), X2
  1382		MOVOU	16(DI), X3
  1383		MOVOU	32(SI), X4
  1384		MOVOU	32(DI), X5
  1385		MOVOU	48(SI), X6
  1386		MOVOU	48(DI), X7
  1387		PCMPEQB	X1, X0
  1388		PCMPEQB	X3, X2
  1389		PCMPEQB	X5, X4
  1390		PCMPEQB	X7, X6
  1391		PAND	X2, X0
  1392		PAND	X6, X4
  1393		PAND	X4, X0
  1394		PMOVMSKB X0, DX
  1395		ADDQ	$64, SI
  1396		ADDQ	$64, DI
  1397		SUBQ	$64, BX
  1398		CMPL	DX, $0xffff
  1399		JEQ	hugeloop
  1400		MOVB	$0, (AX)
  1401		RET
  1402	
  1403		// 64 bytes at a time using ymm registers
  1404	hugeloop_avx2:
  1405		CMPQ	BX, $64
  1406		JB	bigloop_avx2
  1407		VMOVDQU	(SI), Y0
  1408		VMOVDQU	(DI), Y1
  1409		VMOVDQU	32(SI), Y2
  1410		VMOVDQU	32(DI), Y3
  1411		VPCMPEQB	Y1, Y0, Y4
  1412		VPCMPEQB	Y2, Y3, Y5
  1413		VPAND	Y4, Y5, Y6
  1414		VPMOVMSKB Y6, DX
  1415		ADDQ	$64, SI
  1416		ADDQ	$64, DI
  1417		SUBQ	$64, BX
  1418		CMPL	DX, $0xffffffff
  1419		JEQ	hugeloop_avx2
  1420		VZEROUPPER
  1421		MOVB	$0, (AX)
  1422		RET
  1423	
  1424	bigloop_avx2:
  1425		VZEROUPPER
  1426	
  1427		// 8 bytes at a time using 64-bit register
  1428	bigloop:
  1429		CMPQ	BX, $8
  1430		JBE	leftover
  1431		MOVQ	(SI), CX
  1432		MOVQ	(DI), DX
  1433		ADDQ	$8, SI
  1434		ADDQ	$8, DI
  1435		SUBQ	$8, BX
  1436		CMPQ	CX, DX
  1437		JEQ	bigloop
  1438		MOVB	$0, (AX)
  1439		RET
  1440	
  1441		// remaining 0-8 bytes
  1442	leftover:
  1443		MOVQ	-8(SI)(BX*1), CX
  1444		MOVQ	-8(DI)(BX*1), DX
  1445		CMPQ	CX, DX
  1446		SETEQ	(AX)
  1447		RET
  1448	
  1449	small:
  1450		CMPQ	BX, $0
  1451		JEQ	equal
  1452	
  1453		LEAQ	0(BX*8), CX
  1454		NEGQ	CX
  1455	
  1456		CMPB	SI, $0xf8
  1457		JA	si_high
  1458	
  1459		// load at SI won't cross a page boundary.
  1460		MOVQ	(SI), SI
  1461		JMP	si_finish
  1462	si_high:
   1463		// address ends in 11111xxx. Load ending at the last byte we want, then shift into position.
  1464		MOVQ	-8(SI)(BX*1), SI
  1465		SHRQ	CX, SI
  1466	si_finish:
  1467	
  1468		// same for DI.
  1469		CMPB	DI, $0xf8
  1470		JA	di_high
  1471		MOVQ	(DI), DI
  1472		JMP	di_finish
  1473	di_high:
  1474		MOVQ	-8(DI)(BX*1), DI
  1475		SHRQ	CX, DI
  1476	di_finish:
  1477	
  1478		SUBQ	SI, DI
  1479		SHLQ	CX, DI
  1480	equal:
  1481		SETEQ	(AX)
  1482		RET
  1483	
  1484	TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1485		MOVQ	s1_base+0(FP), SI
  1486		MOVQ	s1_len+8(FP), BX
  1487		MOVQ	s2_base+16(FP), DI
  1488		MOVQ	s2_len+24(FP), DX
  1489		LEAQ	ret+32(FP), R9
  1490		JMP	runtime·cmpbody(SB)
  1491	
  1492	TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1493		MOVQ	s1+0(FP), SI
  1494		MOVQ	s1+8(FP), BX
  1495		MOVQ	s2+24(FP), DI
  1496		MOVQ	s2+32(FP), DX
  1497		LEAQ	res+48(FP), R9
  1498		JMP	runtime·cmpbody(SB)
  1499	
  1500	// input:
  1501	//   SI = a
  1502	//   DI = b
  1503	//   BX = alen
  1504	//   DX = blen
  1505	//   R9 = address of output word (stores -1/0/1 here)
  1506	TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1507		CMPQ	SI, DI
  1508		JEQ	allsame
  1509		CMPQ	BX, DX
  1510		MOVQ	DX, R8
  1511		CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1512		CMPQ	R8, $8
  1513		JB	small
  1514	
  1515		CMPQ	R8, $63
  1516		JBE	loop
  1517		CMPB    runtime·support_avx2(SB), $1
  1518		JEQ     big_loop_avx2
  1519		JMP	big_loop
  1520	loop:
  1521		CMPQ	R8, $16
  1522		JBE	_0through16
  1523		MOVOU	(SI), X0
  1524		MOVOU	(DI), X1
  1525		PCMPEQB X0, X1
  1526		PMOVMSKB X1, AX
  1527		XORQ	$0xffff, AX	// convert EQ to NE
  1528		JNE	diff16	// branch if at least one byte is not equal
  1529		ADDQ	$16, SI
  1530		ADDQ	$16, DI
  1531		SUBQ	$16, R8
  1532		JMP	loop
  1533		
  1534	diff64:
  1535		ADDQ	$48, SI
  1536		ADDQ	$48, DI
  1537		JMP	diff16
  1538	diff48:
  1539		ADDQ	$32, SI
  1540		ADDQ	$32, DI
  1541		JMP	diff16
  1542	diff32:
  1543		ADDQ	$16, SI
  1544		ADDQ	$16, DI
  1545		// AX = bit mask of differences
  1546	diff16:
  1547		BSFQ	AX, BX	// index of first byte that differs
  1548		XORQ	AX, AX
  1549		MOVB	(SI)(BX*1), CX
  1550		CMPB	CX, (DI)(BX*1)
  1551		SETHI	AX
  1552		LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1553		MOVQ	AX, (R9)
  1554		RET
  1555	
  1556		// 0 through 16 bytes left, alen>=8, blen>=8
  1557	_0through16:
  1558		CMPQ	R8, $8
  1559		JBE	_0through8
  1560		MOVQ	(SI), AX
  1561		MOVQ	(DI), CX
  1562		CMPQ	AX, CX
  1563		JNE	diff8
  1564	_0through8:
  1565		MOVQ	-8(SI)(R8*1), AX
  1566		MOVQ	-8(DI)(R8*1), CX
  1567		CMPQ	AX, CX
  1568		JEQ	allsame
  1569	
  1570		// AX and CX contain parts of a and b that differ.
  1571	diff8:
  1572		BSWAPQ	AX	// reverse order of bytes
  1573		BSWAPQ	CX
  1574		XORQ	AX, CX
  1575		BSRQ	CX, CX	// index of highest bit difference
  1576		SHRQ	CX, AX	// move a's bit to bottom
  1577		ANDQ	$1, AX	// mask bit
  1578		LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1579		MOVQ	AX, (R9)
  1580		RET
  1581	
  1582		// 0-7 bytes in common
  1583	small:
  1584		LEAQ	(R8*8), CX	// bytes left -> bits left
   1585		NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1586		JEQ	allsame
  1587	
   1588		// load bytes of a into high bytes of SI
  1589		CMPB	SI, $0xf8
  1590		JA	si_high
  1591		MOVQ	(SI), SI
  1592		JMP	si_finish
  1593	si_high:
  1594		MOVQ	-8(SI)(R8*1), SI
  1595		SHRQ	CX, SI
  1596	si_finish:
  1597		SHLQ	CX, SI
  1598	
   1599		// load bytes of b into high bytes of DI
  1600		CMPB	DI, $0xf8
  1601		JA	di_high
  1602		MOVQ	(DI), DI
  1603		JMP	di_finish
  1604	di_high:
  1605		MOVQ	-8(DI)(R8*1), DI
  1606		SHRQ	CX, DI
  1607	di_finish:
  1608		SHLQ	CX, DI
  1609	
  1610		BSWAPQ	SI	// reverse order of bytes
  1611		BSWAPQ	DI
  1612		XORQ	SI, DI	// find bit differences
  1613		JEQ	allsame
  1614		BSRQ	DI, CX	// index of highest bit difference
  1615		SHRQ	CX, SI	// move a's bit to bottom
  1616		ANDQ	$1, SI	// mask bit
  1617		LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1618		MOVQ	AX, (R9)
  1619		RET
  1620	
  1621	allsame:
  1622		XORQ	AX, AX
  1623		XORQ	CX, CX
  1624		CMPQ	BX, DX
  1625		SETGT	AX	// 1 if alen > blen
  1626		SETEQ	CX	// 1 if alen == blen
  1627		LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1628		MOVQ	AX, (R9)
  1629		RET
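// The LEAQ above computes AX = 2*AX + CX - 1, folding the two SETcc results
// into the -1/0/+1 answer: alen > blen gives (AX,CX) = (1,0) -> +1,
// alen == blen gives (0,1) -> 0, and alen < blen gives (0,0) -> -1.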
  1630	
  1631		// this works for >= 64 bytes of data.
  1632	big_loop:
  1633		MOVOU	(SI), X0
  1634		MOVOU	(DI), X1
  1635		PCMPEQB X0, X1
  1636		PMOVMSKB X1, AX
  1637		XORQ	$0xffff, AX
  1638		JNE	diff16
  1639	
  1640		MOVOU	16(SI), X0
  1641		MOVOU	16(DI), X1
  1642		PCMPEQB X0, X1
  1643		PMOVMSKB X1, AX
  1644		XORQ	$0xffff, AX
  1645		JNE	diff32
  1646	
  1647		MOVOU	32(SI), X0
  1648		MOVOU	32(DI), X1
  1649		PCMPEQB X0, X1
  1650		PMOVMSKB X1, AX
  1651		XORQ	$0xffff, AX
  1652		JNE	diff48
  1653	
  1654		MOVOU	48(SI), X0
  1655		MOVOU	48(DI), X1
  1656		PCMPEQB X0, X1
  1657		PMOVMSKB X1, AX
  1658		XORQ	$0xffff, AX
  1659		JNE	diff64
  1660	
  1661		ADDQ	$64, SI
  1662		ADDQ	$64, DI
  1663		SUBQ	$64, R8
  1664		CMPQ	R8, $64
  1665		JBE	loop
  1666		JMP	big_loop
  1667	
  1668		// Compare 64-bytes per loop iteration.
  1669		// Loop is unrolled and uses AVX2.
  1670	big_loop_avx2:
  1671		VMOVDQU	(SI), Y2
  1672		VMOVDQU	(DI), Y3
  1673		VMOVDQU	32(SI), Y4
  1674		VMOVDQU	32(DI), Y5
  1675		VPCMPEQB Y2, Y3, Y0
  1676		VPMOVMSKB Y0, AX
  1677		XORL	$0xffffffff, AX
  1678		JNE	diff32_avx2
  1679		VPCMPEQB Y4, Y5, Y6
  1680		VPMOVMSKB Y6, AX
  1681		XORL	$0xffffffff, AX
  1682		JNE	diff64_avx2
  1683	
  1684		ADDQ	$64, SI
  1685		ADDQ	$64, DI
  1686		SUBQ	$64, R8
  1687		CMPQ	R8, $64
  1688		JB	big_loop_avx2_exit
  1689		JMP	big_loop_avx2
  1690	
  1691		// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
  1692	diff32_avx2:
  1693		VZEROUPPER
  1694		JMP diff16
  1695	
  1696		// Same as diff32_avx2, but for last 32 bytes.
  1697	diff64_avx2:
  1698		VZEROUPPER
  1699		JMP diff48
  1700	
  1701		// For <64 bytes remainder jump to normal loop.
  1702	big_loop_avx2_exit:
  1703		VZEROUPPER
  1704		JMP loop
  1705	
  1706	TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1707		MOVQ s+0(FP), DI
  1708		// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1709		MOVQ s_len+8(FP), DX
  1710		MOVQ c+16(FP), BP
  1711		MOVQ c_len+24(FP), AX
  1712		MOVQ DI, R10
  1713		LEAQ ret+32(FP), R11
  1714		JMP  runtime·indexShortStr(SB)
  1715	
  1716	TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1717		MOVQ s+0(FP), DI
  1718		MOVQ s_len+8(FP), DX
  1719		MOVQ c+24(FP), BP
  1720		MOVQ c_len+32(FP), AX
  1721		MOVQ DI, R10
  1722		LEAQ ret+48(FP), R11
  1723		JMP  runtime·indexShortStr(SB)
  1724	
  1725	// AX: length of string, that we are searching for
  1726	// DX: length of string, in which we are searching
  1727	// DI: pointer to string, in which we are searching
  1728	// BP: pointer to string, that we are searching for
  1729	// R11: address, where to put return value
  1730	TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1731		CMPQ AX, DX
  1732		JA fail
  1733		CMPQ DX, $16
  1734		JAE sse42
  1735	no_sse42:
  1736		CMPQ AX, $2
  1737		JA   _3_or_more
  1738		MOVW (BP), BP
  1739		LEAQ -1(DI)(DX*1), DX
  1740	loop2:
  1741		MOVW (DI), SI
  1742		CMPW SI,BP
  1743		JZ success
  1744		ADDQ $1,DI
  1745		CMPQ DI,DX
  1746		JB loop2
  1747		JMP fail
  1748	_3_or_more:
  1749		CMPQ AX, $3
  1750		JA   _4_or_more
  1751		MOVW 1(BP), BX
  1752		MOVW (BP), BP
  1753		LEAQ -2(DI)(DX*1), DX
  1754	loop3:
  1755		MOVW (DI), SI
  1756		CMPW SI,BP
  1757		JZ   partial_success3
  1758		ADDQ $1,DI
  1759		CMPQ DI,DX
  1760		JB loop3
  1761		JMP fail
  1762	partial_success3:
  1763		MOVW 1(DI), SI
  1764		CMPW SI,BX
  1765		JZ success
  1766		ADDQ $1,DI
  1767		CMPQ DI,DX
  1768		JB loop3
  1769		JMP fail
  1770	_4_or_more:
  1771		CMPQ AX, $4
  1772		JA   _5_or_more
  1773		MOVL (BP), BP
  1774		LEAQ -3(DI)(DX*1), DX
  1775	loop4:
  1776		MOVL (DI), SI
  1777		CMPL SI,BP
  1778		JZ   success
  1779		ADDQ $1,DI
  1780		CMPQ DI,DX
  1781		JB loop4
  1782		JMP fail
  1783	_5_or_more:
  1784		CMPQ AX, $7
  1785		JA   _8_or_more
  1786		LEAQ 1(DI)(DX*1), DX
  1787		SUBQ AX, DX
  1788		MOVL -4(BP)(AX*1), BX
  1789		MOVL (BP), BP
  1790	loop5to7:
  1791		MOVL (DI), SI
  1792		CMPL SI,BP
  1793		JZ   partial_success5to7
  1794		ADDQ $1,DI
  1795		CMPQ DI,DX
  1796		JB loop5to7
  1797		JMP fail
  1798	partial_success5to7:
  1799		MOVL -4(AX)(DI*1), SI
  1800		CMPL SI,BX
  1801		JZ success
  1802		ADDQ $1,DI
  1803		CMPQ DI,DX
  1804		JB loop5to7
  1805		JMP fail
  1806	_8_or_more:
  1807		CMPQ AX, $8
  1808		JA   _9_or_more
  1809		MOVQ (BP), BP
  1810		LEAQ -7(DI)(DX*1), DX
  1811	loop8:
  1812		MOVQ (DI), SI
  1813		CMPQ SI,BP
  1814		JZ   success
  1815		ADDQ $1,DI
  1816		CMPQ DI,DX
  1817		JB loop8
  1818		JMP fail
  1819	_9_or_more:
  1820		CMPQ AX, $15
  1821		JA   _16_or_more
  1822		LEAQ 1(DI)(DX*1), DX
  1823		SUBQ AX, DX
  1824		MOVQ -8(BP)(AX*1), BX
  1825		MOVQ (BP), BP
  1826	loop9to15:
  1827		MOVQ (DI), SI
  1828		CMPQ SI,BP
  1829		JZ   partial_success9to15
  1830		ADDQ $1,DI
  1831		CMPQ DI,DX
  1832		JB loop9to15
  1833		JMP fail
  1834	partial_success9to15:
  1835		MOVQ -8(AX)(DI*1), SI
  1836		CMPQ SI,BX
  1837		JZ success
  1838		ADDQ $1,DI
  1839		CMPQ DI,DX
  1840		JB loop9to15
  1841		JMP fail
  1842	_16_or_more:
  1843		CMPQ AX, $16
  1844		JA   _17_or_more
  1845		MOVOU (BP), X1
  1846		LEAQ -15(DI)(DX*1), DX
  1847	loop16:
  1848		MOVOU (DI), X2
  1849		PCMPEQB X1, X2
  1850		PMOVMSKB X2, SI
  1851		CMPQ  SI, $0xffff
  1852		JE   success
  1853		ADDQ $1,DI
  1854		CMPQ DI,DX
  1855		JB loop16
  1856		JMP fail
  1857	_17_or_more:
  1858		CMPQ AX, $31
  1859		JA   _32_or_more
  1860		LEAQ 1(DI)(DX*1), DX
  1861		SUBQ AX, DX
  1862		MOVOU -16(BP)(AX*1), X0
  1863		MOVOU (BP), X1
  1864	loop17to31:
  1865		MOVOU (DI), X2
  1866		PCMPEQB X1,X2
  1867		PMOVMSKB X2, SI
  1868		CMPQ  SI, $0xffff
  1869		JE   partial_success17to31
  1870		ADDQ $1,DI
  1871		CMPQ DI,DX
  1872		JB loop17to31
  1873		JMP fail
  1874	partial_success17to31:
  1875		MOVOU -16(AX)(DI*1), X3
  1876		PCMPEQB X0, X3
  1877		PMOVMSKB X3, SI
  1878		CMPQ  SI, $0xffff
  1879		JE success
  1880		ADDQ $1,DI
  1881		CMPQ DI,DX
  1882		JB loop17to31
  1883		JMP fail
  1884	// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
  1885	// So no need to check cpuid
  1886	_32_or_more:
  1887		CMPQ AX, $32
  1888		JA   _33_to_63
  1889		VMOVDQU (BP), Y1
  1890		LEAQ -31(DI)(DX*1), DX
  1891	loop32:
  1892		VMOVDQU (DI), Y2
  1893		VPCMPEQB Y1, Y2, Y3
  1894		VPMOVMSKB Y3, SI
  1895		CMPL  SI, $0xffffffff
  1896		JE   success_avx2
  1897		ADDQ $1,DI
  1898		CMPQ DI,DX
  1899		JB loop32
  1900		JMP fail_avx2
  1901	_33_to_63:
  1902		LEAQ 1(DI)(DX*1), DX
  1903		SUBQ AX, DX
  1904		VMOVDQU -32(BP)(AX*1), Y0
  1905		VMOVDQU (BP), Y1
  1906	loop33to63:
  1907		VMOVDQU (DI), Y2
  1908		VPCMPEQB Y1, Y2, Y3
  1909		VPMOVMSKB Y3, SI
  1910		CMPL  SI, $0xffffffff
  1911		JE   partial_success33to63
  1912		ADDQ $1,DI
  1913		CMPQ DI,DX
  1914		JB loop33to63
  1915		JMP fail_avx2
  1916	partial_success33to63:
  1917		VMOVDQU -32(AX)(DI*1), Y3
  1918		VPCMPEQB Y0, Y3, Y4
  1919		VPMOVMSKB Y4, SI
  1920		CMPL  SI, $0xffffffff
  1921		JE success_avx2
  1922		ADDQ $1,DI
  1923		CMPQ DI,DX
  1924		JB loop33to63
  1925	fail_avx2:
  1926		VZEROUPPER
  1927	fail:
  1928		MOVQ $-1, (R11)
  1929		RET
  1930	success_avx2:
  1931		VZEROUPPER
  1932		JMP success
  1933	sse42:
  1934		CMPB runtime·support_sse42(SB), $1
  1935		JNE no_sse42
  1936		CMPQ AX, $12
  1937		// PCMPESTRI is slower than normal compare,
   1938		// so using it makes sense only if we advance 4+ bytes per compare.
  1939		// This value was determined experimentally and is the ~same
  1940		// on Nehalem (first with SSE42) and Haswell.
  1941		JAE _9_or_more
  1942		LEAQ 16(BP), SI
  1943		TESTW $0xff0, SI
  1944		JEQ no_sse42
  1945		MOVOU (BP), X1
  1946		LEAQ -15(DI)(DX*1), SI
  1947		MOVQ $16, R9
  1948		SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1949	loop_sse42:
  1950		// The immediate 0x0c means: unsigned byte comparison (bits 0,1 are 00),
  1951		// "equal ordered" aggregation, i.e. substring search (bits 2,3 are 11),
  1952		// the result is not masked or inverted (bits 4,5 are 00),
  1953		// and the reported index is that of the first matching byte (bit 6 is 0).
  1954		PCMPESTRI $0x0c, (DI), X1
  1955		// CX == 16 means no match,
  1956		// CX > R9 means a partial match at the end of the string,
  1957		// otherwise sep starts at offset CX within the 16 bytes loaded from (DI).
  1958		CMPQ CX, R9
  1959		JBE sse42_success
  1960		ADDQ R9, DI
  1961		CMPQ DI, SI
  1962		JB loop_sse42
  1963		PCMPESTRI $0x0c, -1(SI), X1
  1964		CMPQ CX, R9
  1965		JA fail
  1966		LEAQ -1(SI), DI
  1967	sse42_success:
  1968		ADDQ CX, DI
  1969	success:
  1970		SUBQ R10, DI
  1971		MOVQ DI, (R11)
  1972		RET
  1973	
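// All of the fixed-size cases above use the same trick: when len(sep) is not
// exactly the chunk size (8, 16 or 32 bytes), compare the first chunk and the
// last chunk of sep at each candidate offset; together the two overlapping
// windows cover every byte of sep. A rough Go sketch of the idea (illustrative
// only, not part of the runtime; assumes k <= len(sep) < 2*k):
//
//	func indexChunked(s, sep []byte, k int) int {
//		head, tail := sep[:k], sep[len(sep)-k:]
//		for i := 0; i+len(sep) <= len(s); i++ {
//			if bytes.Equal(s[i:i+k], head) &&
//				bytes.Equal(s[i+len(sep)-k:i+len(sep)], tail) {
//				return i
//			}
//		}
//		return -1
//	}
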
  1974	
  1975	TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1976		MOVQ s+0(FP), SI
  1977		MOVQ s_len+8(FP), BX
  1978		MOVB c+24(FP), AL
  1979		LEAQ ret+32(FP), R8
  1980		JMP  runtime·indexbytebody(SB)
  1981	
  1982	TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1983		MOVQ s+0(FP), SI
  1984		MOVQ s_len+8(FP), BX
  1985		MOVB c+16(FP), AL
  1986		LEAQ ret+24(FP), R8
  1987		JMP  runtime·indexbytebody(SB)
  1988	
  1989	// input:
  1990	//   SI: data
  1991	//   BX: data len
  1992	//   AL: byte sought
  1993	//   R8: address to put result
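//
// The result is the index of the first occurrence of AL in the data, or -1 if
// it is not present. A rough Go sketch of the semantics (illustrative only;
// the code below scans 16 or 32 bytes per step with SSE/AVX2):
//
//	func indexByte(s []byte, c byte) int {
//		for i := 0; i < len(s); i++ {
//			if s[i] == c {
//				return i
//			}
//		}
//		return -1
//	}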
  1994	TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1995		// Shuffle X0 around so that each byte contains
  1996		// the character we're looking for.
  1997		MOVD AX, X0
  1998		PUNPCKLBW X0, X0
  1999		PUNPCKLBW X0, X0
  2000		PSHUFL $0, X0, X0
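	// X0 now holds 16 copies of the byte in AL: the two PUNPCKLBW steps
	// duplicate the low byte across the low 32-bit word, and PSHUFL $0
	// broadcasts that word to all four lanes of the register.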
  2001		
  2002		CMPQ BX, $16
  2003		JLT small
  2004	
  2005		MOVQ SI, DI
  2006	
  2007		CMPQ BX, $32
  2008		JA avx2
  2009	sse:
  2010		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2011		JMP	sseloopentry
  2012		
  2013	sseloop:
  2014		// Move the next 16-byte chunk of the data into X1.
  2015		MOVOU	(DI), X1
  2016		// Compare bytes in X0 to X1.
  2017		PCMPEQB	X0, X1
  2018		// Take the top bit of each byte in X1 and put the result in DX.
  2019		PMOVMSKB X1, DX
  2020		// Find first set bit, if any.
  2021		BSFL	DX, DX
  2022		JNZ	ssesuccess
  2023		// Advance to next block.
  2024		ADDQ	$16, DI
  2025	sseloopentry:
  2026		CMPQ	DI, AX
  2027		JB	sseloop
  2028	
  2029		// Search the last 16-byte chunk. This chunk may overlap with the
  2030		// chunks we've already searched, but that's ok.
  2031		MOVQ	AX, DI
  2032		MOVOU	(AX), X1
  2033		PCMPEQB	X0, X1
  2034		PMOVMSKB X1, DX
  2035		BSFL	DX, DX
  2036		JNZ	ssesuccess
  2037	
  2038	failure:
  2039		MOVQ $-1, (R8)
  2040		RET
  2041	
  2042	// We've found a chunk containing the byte.
  2043	// The chunk was loaded from DI.
  2044	// The index of the matching byte in the chunk is DX.
  2045	// The start of the data is SI.
  2046	ssesuccess:
  2047		SUBQ SI, DI	// Compute offset of chunk within data.
  2048		ADDQ DX, DI	// Add offset of byte within chunk.
  2049		MOVQ DI, (R8)
  2050		RET
  2051	
  2052	// Handle lengths < 16.
  2053	small:
  2054		TESTQ	BX, BX
  2055		JEQ	failure
  2056	
  2057		// Check if we'll load across a page boundary.
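	// (Pages are at least 4KB: if (SI+16) has none of bits 4-11 set, SI is
	// within 16 bytes of the end of a page and a 16-byte load could fault.)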
  2058		LEAQ	16(SI), AX
  2059		TESTW	$0xff0, AX
  2060		JEQ	endofpage
  2061	
  2062		MOVOU	(SI), X1 // Load data
  2063		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2064		PMOVMSKB X1, DX	// Move result bits to integer register.
  2065		BSFL	DX, DX	// Find first set bit.
  2066		JZ	failure	// No set bit, failure.
  2067		CMPL	DX, BX
  2068		JAE	failure	// Match is past end of data.
  2069		MOVQ	DX, (R8)
  2070		RET
  2071	
  2072	endofpage:
  2073		MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2074		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2075		PMOVMSKB X1, DX	// Move result bits to integer register.
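	// The BX valid bytes sit in the high end of X1, so their match bits are
	// the top BX bits of DX; the two shifts below move them down to bit 0.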
  2076		MOVL	BX, CX
  2077		SHLL	CX, DX
  2078		SHRL	$16, DX	// Shift desired bits down to bottom of register.
  2079		BSFL	DX, DX	// Find first set bit.
  2080		JZ	failure	// No set bit, failure.
  2081		MOVQ	DX, (R8)
  2082		RET
  2083	
  2084	avx2:
  2085		CMPB   runtime·support_avx2(SB), $1
  2086		JNE sse
  2087		MOVD AX, X0
  2088		LEAQ -32(SI)(BX*1), R11
  2089		VPBROADCASTB  X0, Y1
  2090	avx2_loop:
  2091		VMOVDQU (DI), Y2
  2092		VPCMPEQB Y1, Y2, Y3
  2093		VPTEST Y3, Y3
  2094		JNZ avx2success
  2095		ADDQ $32, DI
  2096		CMPQ DI, R11
  2097		JLT avx2_loop
  2098		MOVQ R11, DI
  2099		VMOVDQU (DI), Y2
  2100		VPCMPEQB Y1, Y2, Y3
  2101		VPTEST Y3, Y3
  2102		JNZ avx2success
  2103		VZEROUPPER
  2104		MOVQ $-1, (R8)
  2105		RET
  2106	
  2107	avx2success:
  2108		VPMOVMSKB Y3, DX
  2109		BSFL DX, DX
  2110		SUBQ SI, DI
  2111		ADDQ DI, DX
  2112		MOVQ DX, (R8)
  2113		VZEROUPPER
  2114		RET
  2115	
  2116	TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2117		MOVQ	a_len+8(FP), BX
  2118		MOVQ	b_len+32(FP), CX
  2119		CMPQ	BX, CX
  2120		JNE	eqret
  2121		MOVQ	a+0(FP), SI
  2122		MOVQ	b+24(FP), DI
  2123		LEAQ	ret+48(FP), AX
  2124		JMP	runtime·memeqbody(SB)
  2125	eqret:
  2126		MOVB	$0, ret+48(FP)
  2127		RET
  2128	
  2129	
  2130	TEXT bytes·countByte(SB),NOSPLIT,$0-40
  2131		MOVQ s+0(FP), SI
  2132		MOVQ s_len+8(FP), BX
  2133		MOVB c+24(FP), AL
  2134		LEAQ ret+32(FP), R8
  2135		JMP  runtime·countByte(SB)
  2136	
  2137	TEXT strings·countByte(SB),NOSPLIT,$0-32
  2138		MOVQ s+0(FP), SI
  2139		MOVQ s_len+8(FP), BX
  2140		MOVB c+16(FP), AL
  2141		LEAQ ret+24(FP), R8
  2142		JMP  runtime·countByte(SB)
  2143	
  2144	// input:
  2145	//   SI: data
  2146	//   BX: data len
  2147	//   AL: byte sought
  2148	//   R8: address to put result
  2149	// This requires the POPCNT instruction
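//
// The result is the number of occurrences of AL in the data. A rough Go
// sketch of the semantics (illustrative only; the code below counts matches
// 16 or 32 bytes at a time with PCMPEQB/VPCMPEQB and POPCNT):
//
//	func countByte(s []byte, c byte) int {
//		n := 0
//		for i := 0; i < len(s); i++ {
//			if s[i] == c {
//				n++
//			}
//		}
//		return n
//	}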
  2150	TEXT runtime·countByte(SB),NOSPLIT,$0
  2151		// Shuffle X0 around so that each byte contains
  2152		// the character we're looking for.
  2153		MOVD AX, X0
  2154		PUNPCKLBW X0, X0
  2155		PUNPCKLBW X0, X0
  2156		PSHUFL $0, X0, X0
  2157	
  2158		CMPQ BX, $16
  2159		JLT small
  2160	
  2161		MOVQ $0, R12 // Accumulator
  2162	
  2163		MOVQ SI, DI
  2164	
  2165		CMPQ BX, $32
  2166		JA avx2
  2167	sse:
  2168		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2169		JMP	sseloopentry
  2170	
  2171	sseloop:
  2172		// Move the next 16-byte chunk of the data into X1.
  2173		MOVOU	(DI), X1
  2174		// Compare bytes in X0 to X1.
  2175		PCMPEQB	X0, X1
  2176		// Take the top bit of each byte in X1 and put the result in DX.
  2177		PMOVMSKB X1, DX
  2178		// Count number of matching bytes
  2179		POPCNTL DX, DX
  2180		// Accumulate into R12
  2181		ADDQ DX, R12
  2182		// Advance to next block.
  2183		ADDQ	$16, DI
  2184	sseloopentry:
  2185		CMPQ	DI, AX
  2186		JBE	sseloop
  2187	
  2188		// Get the number of bytes to consider in the last 16 bytes
  2189		ANDQ $15, BX
  2190		JZ end
  2191	
  2192		// Create mask to ignore overlap between previous 16 byte block
  2193		// and the next.
  2194		MOVQ $16,CX
  2195		SUBQ BX, CX
  2196		MOVQ $0xFFFF, R10
  2197		SARQ CL, R10
  2198		SALQ CL, R10
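	// R10 now keeps only the high BX bits of the 16-bit match mask, i.e. the
	// trailing bytes of the final chunk that the loop above has not counted.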
  2199	
  2200		// Process the last 16-byte chunk. This chunk may overlap with the
  2201		// chunks we've already searched so we need to mask part of it.
  2202		MOVOU	(AX), X1
  2203		PCMPEQB	X0, X1
  2204		PMOVMSKB X1, DX
  2205		// Apply mask
  2206		ANDQ R10, DX
  2207		POPCNTL DX, DX
  2208		ADDQ DX, R12
  2209	end:
  2210		MOVQ R12, (R8)
  2211		RET
  2212	
  2213	// Handle lengths < 16.
  2214	small:
  2215		TESTQ	BX, BX
  2216		JEQ	endzero
  2217	
  2218		// Check if we'll load across a page boundary.
  2219		LEAQ	16(SI), AX
  2220		TESTW	$0xff0, AX
  2221		JEQ	endofpage
  2222	
  2223		// We must ignore high bytes as they aren't part of our slice.
  2224		// Create mask.
  2225		MOVB BX, CX
  2226		MOVQ $1, R10
  2227		SALQ CL, R10
  2228		SUBQ $1, R10
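	// R10 = (1<<BX)-1: only the low BX bits survive the AND below, i.e. the
	// match bits for bytes that are actually inside the slice.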
  2229	
  2230		// Load data
  2231		MOVOU	(SI), X1
  2232		// Compare target byte with each byte in data.
  2233		PCMPEQB	X0, X1
  2234		// Move result bits to integer register.
  2235		PMOVMSKB X1, DX
  2236		// Apply mask
  2237		ANDQ R10, DX
  2238		POPCNTL DX, DX
  2239		// Return DX directly; we don't need to accumulate
  2240		// since we have < 16 bytes.
  2241		MOVQ	DX, (R8)
  2242		RET
  2243	endzero:
  2244		MOVQ $0, (R8)
  2245		RET
  2246	
  2247	endofpage:
  2248		// We must ignore low bytes as they aren't part of our slice.
  2249		MOVQ $16,CX
  2250		SUBQ BX, CX
  2251		MOVQ $0xFFFF, R10
  2252		SARQ CL, R10
  2253		SALQ CL, R10
  2254	
  2255		// Load data into the high end of X1.
  2256		MOVOU	-16(SI)(BX*1), X1
  2257		// Compare target byte with each byte in data.
  2258		PCMPEQB	X0, X1
  2259		// Move result bits to integer register.
  2260		PMOVMSKB X1, DX
  2261		// Apply mask
  2262		ANDQ R10, DX
  2263		// Return DX directly; we don't need to accumulate
  2264		// since we have < 16 bytes.
  2265		POPCNTL DX, DX
  2266		MOVQ	DX, (R8)
  2267		RET
  2268	
  2269	avx2:
  2270		CMPB   runtime·support_avx2(SB), $1
  2271		JNE sse
  2272		MOVD AX, X0
  2273		LEAQ -32(SI)(BX*1), R11
  2274		VPBROADCASTB  X0, Y1
  2275	avx2_loop:
  2276		VMOVDQU (DI), Y2
  2277		VPCMPEQB Y1, Y2, Y3
  2278		VPMOVMSKB Y3, DX
  2279		POPCNTL DX, DX
  2280		ADDQ DX, R12
  2281		ADDQ $32, DI
  2282		CMPQ DI, R11
  2283		JLE avx2_loop
  2284	
  2285		// If last block is already processed,
  2286		// skip to the end.
  2287		CMPQ DI, R11
  2288		JEQ endavx
  2289	
  2290		// Load address of the last 32 bytes.
  2291		// There is an overlap with the previous block.
  2292		MOVQ R11, DI
  2293		VMOVDQU (DI), Y2
  2294		VPCMPEQB Y1, Y2, Y3
  2295		VPMOVMSKB Y3, DX
  2296		// Exit AVX mode.
  2297		VZEROUPPER
  2298	
  2299		// Create mask to ignore overlap between previous 32 byte block
  2300		// and the next.
  2301		ANDQ $31, BX
  2302		MOVQ $32,CX
  2303		SUBQ BX, CX
  2304		MOVQ $0xFFFFFFFF, R10
  2305		SARQ CL, R10
  2306		SALQ CL, R10
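	// As in the SSE path: R10 keeps only the high BX bits of the 32-bit match
	// mask, i.e. the bytes of the final chunk not already counted by the loop.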
  2307		// Apply mask
  2308		ANDQ R10, DX
  2309		POPCNTL DX, DX
  2310		ADDQ DX, R12
  2311		MOVQ R12, (R8)
  2312		RET
  2313	endavx:
  2314		// Exit AVX mode.
  2315		VZEROUPPER
  2316		MOVQ R12, (R8)
  2317		RET
  2318	
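// return0 clears AX. The runtime uses it (for example at the end of
// deferproc) to hand a zero result back to compiler-generated code.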
  2319	TEXT runtime·return0(SB), NOSPLIT, $0
  2320		MOVL	$0, AX
  2321		RET
  2322	
  2323	
  2324	// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2325	// Must obey the gcc calling convention.
  2326	TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2327		get_tls(CX)
  2328		MOVQ	g(CX), AX
  2329		MOVQ	g_m(AX), AX
  2330		MOVQ	m_curg(AX), AX
  2331		MOVQ	(g_stack+stack_hi)(AX), AX
  2332		RET
  2333	
  2334	// The top-most function running on a goroutine
  2335	// returns to goexit+PCQuantum.
  2336	TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2337		BYTE	$0x90	// NOP
  2338		CALL	runtime·goexit1(SB)	// does not return
  2339		// traceback from goexit1 must hit code range of goexit
  2340		BYTE	$0x90	// NOP
  2341	
  2342	TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2343		MOVQ	addr+0(FP), AX
  2344		PREFETCHT0	(AX)
  2345		RET
  2346	
  2347	TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2348		MOVQ	addr+0(FP), AX
  2349		PREFETCHT1	(AX)
  2350		RET
  2351	
  2352	TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2353		MOVQ	addr+0(FP), AX
  2354		PREFETCHT2	(AX)
  2355		RET
  2356	
  2357	TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2358		MOVQ	addr+0(FP), AX
  2359		PREFETCHNTA	(AX)
  2360		RET
  2361	
  2362	// This is called from .init_array and follows the platform, not Go, ABI.
  2363	TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2364		PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2365		MOVQ	runtime·lastmoduledatap(SB), AX
  2366		MOVQ	DI, moduledata_next(AX)
  2367		MOVQ	DI, runtime·lastmoduledatap(SB)
  2368		POPQ	R15
  2369		RET
