
Text file src/runtime/asm_amd64.s

Documentation: runtime

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "go_tls.h"
     7	#include "funcdata.h"
     8	#include "textflag.h"
     9	
    10	// _rt0_amd64 is common startup code for most amd64 systems when using
    11	// internal linking. This is the entry point for the program from the
    12	// kernel for an ordinary -buildmode=exe program. The stack holds the
    13	// number of arguments and the C-style argv.
    14	TEXT _rt0_amd64(SB),NOSPLIT,$-8
    15		MOVQ	0(SP), DI	// argc
    16		LEAQ	8(SP), SI	// argv
    17		JMP	runtime·rt0_go(SB)
    18	
    19	// main is common startup code for most amd64 systems when using
    20	// external linking. The C startup code will call the symbol "main"
    21	// passing argc and argv in the usual C ABI registers DI and SI.
    22	TEXT main(SB),NOSPLIT,$-8
    23		JMP	runtime·rt0_go(SB)
    24	
    25	// _rt0_amd64_lib is common startup code for most amd64 systems when
    26	// using -buildmode=c-archive or -buildmode=c-shared. The linker will
    27	// arrange to invoke this function as a global constructor (for
    28	// c-archive) or when the shared library is loaded (for c-shared).
    29	// We expect argc and argv to be passed in the usual C ABI registers
    30	// DI and SI.
    31	TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
    32		// Align stack per ELF ABI requirements.
    33		MOVQ	SP, AX
    34		ANDQ	$~15, SP
    35		// Save C ABI callee-saved registers, as caller may need them.
    36		MOVQ	BX, 0x10(SP)
    37		MOVQ	BP, 0x18(SP)
    38		MOVQ	R12, 0x20(SP)
    39		MOVQ	R13, 0x28(SP)
    40		MOVQ	R14, 0x30(SP)
    41		MOVQ	R15, 0x38(SP)
    42		MOVQ	AX, 0x40(SP)
    43	
    44		MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
    45		MOVQ	SI, _rt0_amd64_lib_argv<>(SB)
    46	
    47		// Synchronous initialization.
    48		CALL	runtime·libpreinit(SB)
    49	
    50		// Create a new thread to finish Go runtime initialization.
    51		MOVQ	_cgo_sys_thread_create(SB), AX
    52		TESTQ	AX, AX
    53		JZ	nocgo
    54		MOVQ	$_rt0_amd64_lib_go(SB), DI
    55		MOVQ	$0, SI
    56		CALL	AX
    57		JMP	restore
    58	
    59	nocgo:
    60		MOVQ	$0x800000, 0(SP)		// stacksize
    61		MOVQ	$_rt0_amd64_lib_go(SB), AX
    62		MOVQ	AX, 8(SP)			// fn
    63		CALL	runtime·newosproc0(SB)
    64	
    65	restore:
    66		MOVQ	0x10(SP), BX
    67		MOVQ	0x18(SP), BP
    68		MOVQ	0x20(SP), R12
    69		MOVQ	0x28(SP), R13
    70		MOVQ	0x30(SP), R14
    71		MOVQ	0x38(SP), R15
    72		MOVQ	0x40(SP), SP
    73		RET
    74	
    75	// _rt0_amd64_lib_go initializes the Go runtime.
    76	// This is started in a separate thread by _rt0_amd64_lib.
    77	TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
    78		MOVQ	_rt0_amd64_lib_argc<>(SB), DI
    79		MOVQ	_rt0_amd64_lib_argv<>(SB), SI
    80		JMP	runtime·rt0_go(SB)
    81	
    82	DATA _rt0_amd64_lib_argc<>(SB)/8, $0
    83	GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
    84	DATA _rt0_amd64_lib_argv<>(SB)/8, $0
    85	GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8
    86	
    87	TEXT runtime·rt0_go(SB),NOSPLIT,$0
    88		// copy arguments forward on an even stack
    89		MOVQ	DI, AX		// argc
    90		MOVQ	SI, BX		// argv
    91		SUBQ	$(4*8+7), SP		// 2args 2auto
    92		ANDQ	$~15, SP
    93		MOVQ	AX, 16(SP)
    94		MOVQ	BX, 24(SP)
    95		
    96		// create istack out of the given (operating system) stack.
    97		// _cgo_init may update stackguard.
    98		MOVQ	$runtime·g0(SB), DI
    99		LEAQ	(-64*1024+104)(SP), BX
   100		MOVQ	BX, g_stackguard0(DI)
   101		MOVQ	BX, g_stackguard1(DI)
   102		MOVQ	BX, (g_stack+stack_lo)(DI)
   103		MOVQ	SP, (g_stack+stack_hi)(DI)
   104	
   105		// find out information about the processor we're on
   106		MOVL	$0, AX
   107		CPUID
   108		MOVL	AX, SI
   109		CMPL	AX, $0
   110		JE	nocpuinfo
   111	
   112		// Figure out how to serialize RDTSC.
   113		// On Intel processors LFENCE is enough. AMD requires MFENCE.
   114		// Don't know about the rest, so let's do MFENCE.
   115		CMPL	BX, $0x756E6547  // "Genu"
   116		JNE	notintel
   117		CMPL	DX, $0x49656E69  // "ineI"
   118		JNE	notintel
   119		CMPL	CX, $0x6C65746E  // "ntel"
   120		JNE	notintel
   121		MOVB	$1, runtime·isIntel(SB)
   122		MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
   123	notintel:
   124	
   125		// Load EAX=1 cpuid flags
   126		MOVL	$1, AX
   127		CPUID
   128		MOVL	AX, runtime·processorVersionInfo(SB)
   129	
   130		TESTL	$(1<<26), DX // SSE2
   131		SETNE	runtime·support_sse2(SB)
   132	
   133		TESTL	$(1<<9), CX // SSSE3
   134		SETNE	runtime·support_ssse3(SB)
   135	
   136		TESTL	$(1<<19), CX // SSE4.1
   137		SETNE	runtime·support_sse41(SB)
   138	
   139		TESTL	$(1<<20), CX // SSE4.2
   140		SETNE	runtime·support_sse42(SB)
   141	
   142		TESTL	$(1<<23), CX // POPCNT
   143		SETNE	runtime·support_popcnt(SB)
   144	
   145		TESTL	$(1<<25), CX // AES
   146		SETNE	runtime·support_aes(SB)
   147	
   148		TESTL	$(1<<27), CX // OSXSAVE
   149		SETNE	runtime·support_osxsave(SB)
   150	
   151		// If OS support for XMM and YMM is not present
   152		// support_avx will be set back to false later.
   153		TESTL	$(1<<28), CX // AVX
   154		SETNE	runtime·support_avx(SB)
   155	
   156	eax7:
   157		// Load EAX=7/ECX=0 cpuid flags
   158		CMPL	SI, $7
   159		JLT	osavx
   160		MOVL	$7, AX
   161		MOVL	$0, CX
   162		CPUID
   163	
   164		TESTL	$(1<<3), BX // BMI1
   165		SETNE	runtime·support_bmi1(SB)
   166	
   167		// If OS support for XMM and YMM is not present
   168		// support_avx2 will be set back to false later.
   169	TESTL	$(1<<5), BX // AVX2
   170		SETNE	runtime·support_avx2(SB)
   171	
   172		TESTL	$(1<<8), BX // BMI2
   173		SETNE	runtime·support_bmi2(SB)
   174	
   175		TESTL	$(1<<9), BX // ERMS
   176		SETNE	runtime·support_erms(SB)
   177	
   178	osavx:
   179		CMPB	runtime·support_osxsave(SB), $1
   180		JNE	noavx
   181		MOVL	$0, CX
   182		// For XGETBV, OSXSAVE bit is required and sufficient
   183		XGETBV
   184		ANDL	$6, AX
   185		CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   186		JE nocpuinfo
   187	noavx:
   188		MOVB $0, runtime·support_avx(SB)
   189		MOVB $0, runtime·support_avx2(SB)
   190	
   191	nocpuinfo:
   192		// if there is an _cgo_init, call it.
   193		MOVQ	_cgo_init(SB), AX
   194		TESTQ	AX, AX
   195		JZ	needtls
   196		// g0 already in DI
   197		MOVQ	DI, CX	// Win64 uses CX for first parameter
   198		MOVQ	$setg_gcc<>(SB), SI
   199		CALL	AX
   200	
   201		// update stackguard after _cgo_init
   202		MOVQ	$runtime·g0(SB), CX
   203		MOVQ	(g_stack+stack_lo)(CX), AX
   204		ADDQ	$const__StackGuard, AX
   205		MOVQ	AX, g_stackguard0(CX)
   206		MOVQ	AX, g_stackguard1(CX)
   207	
   208	#ifndef GOOS_windows
   209		JMP ok
   210	#endif
   211	needtls:
   212	#ifdef GOOS_plan9
   213		// skip TLS setup on Plan 9
   214		JMP ok
   215	#endif
   216	#ifdef GOOS_solaris
   217		// skip TLS setup on Solaris
   218		JMP ok
   219	#endif
   220	
   221		LEAQ	runtime·m0+m_tls(SB), DI
   222		CALL	runtime·settls(SB)
   223	
   224		// store through it, to make sure it works
   225		get_tls(BX)
   226		MOVQ	$0x123, g(BX)
   227		MOVQ	runtime·m0+m_tls(SB), AX
   228		CMPQ	AX, $0x123
   229		JEQ 2(PC)
   230		MOVL	AX, 0	// abort
   231	ok:
   232		// set the per-goroutine and per-mach "registers"
   233		get_tls(BX)
   234		LEAQ	runtime·g0(SB), CX
   235		MOVQ	CX, g(BX)
   236		LEAQ	runtime·m0(SB), AX
   237	
   238		// save m->g0 = g0
   239		MOVQ	CX, m_g0(AX)
   240		// save m0 to g0->m
   241		MOVQ	AX, g_m(CX)
   242	
   243		CLD				// convention is D is always left cleared
   244		CALL	runtime·check(SB)
   245	
   246		MOVL	16(SP), AX		// copy argc
   247		MOVL	AX, 0(SP)
   248		MOVQ	24(SP), AX		// copy argv
   249		MOVQ	AX, 8(SP)
   250		CALL	runtime·args(SB)
   251		CALL	runtime·osinit(SB)
   252		CALL	runtime·schedinit(SB)
   253	
   254		// create a new goroutine to start program
   255		MOVQ	$runtime·mainPC(SB), AX		// entry
   256		PUSHQ	AX
   257		PUSHQ	$0			// arg size
   258		CALL	runtime·newproc(SB)
   259		POPQ	AX
   260		POPQ	AX
   261	
   262		// start this M
   263		CALL	runtime·mstart(SB)
   264	
   265		MOVL	$0xf1, 0xf1  // crash
   266		RET
   267	
   268	DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   269	GLOBL	runtime·mainPC(SB),RODATA,$8
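
// The boot sequence encoded by rt0_go above, as a rough Go-level sketch.
// This is not runnable code; setupG0Stack and detectCPUFeatures are
// hypothetical names standing for the stack-bound and CPUID blocks above,
// while the remaining calls are the real runtime functions the assembly invokes:
//
//	func rt0_go(argc int32, argv **byte) {
//		setupG0Stack()          // g0 stackguard/stack.lo/stack.hi from the OS stack
//		detectCPUFeatures()     // CPUID probing of SSE/AVX/BMI/etc.
//		settls(); setg(&g0); m0.g0 = &g0; g0.m = &m0
//		check(); args(argc, argv); osinit(); schedinit()
//		newproc(mainPC)         // queue runtime.main as the first goroutine
//		mstart()                // start this M; does not return
//	}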
   270	
   271	TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   272		BYTE	$0xcc
   273		RET
   274	
   275	TEXT runtime·asminit(SB),NOSPLIT,$0-0
   276		// No per-thread init.
   277		RET
   278	
   279	/*
   280	 *  go-routine
   281	 */
   282	
   283	// void gosave(Gobuf*)
   284	// save state in Gobuf; setjmp
   285	TEXT runtime·gosave(SB), NOSPLIT, $0-8
   286		MOVQ	buf+0(FP), AX		// gobuf
   287		LEAQ	buf+0(FP), BX		// caller's SP
   288		MOVQ	BX, gobuf_sp(AX)
   289		MOVQ	0(SP), BX		// caller's PC
   290		MOVQ	BX, gobuf_pc(AX)
   291		MOVQ	$0, gobuf_ret(AX)
   292		MOVQ	BP, gobuf_bp(AX)
   293		// Assert ctxt is zero. See func save.
   294		MOVQ	gobuf_ctxt(AX), BX
   295		TESTQ	BX, BX
   296		JZ	2(PC)
   297		CALL	runtime·badctxt(SB)
   298		get_tls(CX)
   299		MOVQ	g(CX), BX
   300		MOVQ	BX, gobuf_g(AX)
   301		RET
   302	
   303	// void gogo(Gobuf*)
   304	// restore state from Gobuf; longjmp
   305	TEXT runtime·gogo(SB), NOSPLIT, $16-8
   306		MOVQ	buf+0(FP), BX		// gobuf
   307		MOVQ	gobuf_g(BX), DX
   308		MOVQ	0(DX), CX		// make sure g != nil
   309		get_tls(CX)
   310		MOVQ	DX, g(CX)
   311		MOVQ	gobuf_sp(BX), SP	// restore SP
   312		MOVQ	gobuf_ret(BX), AX
   313		MOVQ	gobuf_ctxt(BX), DX
   314		MOVQ	gobuf_bp(BX), BP
   315		MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   316		MOVQ	$0, gobuf_ret(BX)
   317		MOVQ	$0, gobuf_ctxt(BX)
   318		MOVQ	$0, gobuf_bp(BX)
   319		MOVQ	gobuf_pc(BX), BX
   320		JMP	BX
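
// Taken together, gosave and gogo act as a setjmp/longjmp pair over a Gobuf.
// A rough sketch of the state they move (field names follow the runtime's
// gobuf struct; types are simplified and this is not runnable code):
//
//	type gobuf struct{ sp, pc, g, ctxt, ret, bp uintptr }
//
//	// gosave(buf): records the caller's SP and PC, the current g, and BP;
//	//              ctxt must already be zero (asserted via badctxt).
//	// gogo(buf):   installs buf.g as the current g, restores SP, BP, ret and
//	//              ctxt, zeroes the buffer to help the GC, and jumps to buf.pc.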
   321	
   322	// func mcall(fn func(*g))
   323	// Switch to m->g0's stack, call fn(g).
   324	// Fn must never return. It should gogo(&g->sched)
   325	// to keep running g.
   326	TEXT runtime·mcall(SB), NOSPLIT, $0-8
   327		MOVQ	fn+0(FP), DI
   328		
   329		get_tls(CX)
   330		MOVQ	g(CX), AX	// save state in g->sched
   331		MOVQ	0(SP), BX	// caller's PC
   332		MOVQ	BX, (g_sched+gobuf_pc)(AX)
   333		LEAQ	fn+0(FP), BX	// caller's SP
   334		MOVQ	BX, (g_sched+gobuf_sp)(AX)
   335		MOVQ	AX, (g_sched+gobuf_g)(AX)
   336		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   337	
   338		// switch to m->g0 & its stack, call fn
   339		MOVQ	g(CX), BX
   340		MOVQ	g_m(BX), BX
   341		MOVQ	m_g0(BX), SI
   342		CMPQ	SI, AX	// if g == m->g0 call badmcall
   343		JNE	3(PC)
   344		MOVQ	$runtime·badmcall(SB), AX
   345		JMP	AX
   346		MOVQ	SI, g(CX)	// g = m->g0
   347		MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   348		PUSHQ	AX
   349		MOVQ	DI, DX
   350		MOVQ	0(DI), DI
   351		CALL	DI
   352		POPQ	AX
   353		MOVQ	$runtime·badmcall2(SB), AX
   354		JMP	AX
   355		RET
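
// In Go-like pseudocode (g, m and gobuf are runtime internals, so this is a
// sketch rather than runnable code):
//
//	func mcall(fn func(*g)) {
//		gp := getg()
//		gp.sched = {pc: caller's PC, sp: caller's SP, g: gp, bp: BP}
//		if gp == gp.m.g0 {
//			badmcall(fn)
//		}
//		// switch to gp.m.g0 and its stack, then:
//		fn(gp)         // must not return
//		badmcall2(fn)
//	}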
   356	
   357	// systemstack_switch is a dummy routine that systemstack leaves at the bottom
   358	// of the G stack. We need to distinguish the routine that
   359	// lives at the bottom of the G stack from the one that lives
   360	// at the top of the system stack because the one at the top of
   361	// the system stack terminates the stack walk (see topofstack()).
   362	TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   363		RET
   364	
   365	// func systemstack(fn func())
   366	TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   367		MOVQ	fn+0(FP), DI	// DI = fn
   368		get_tls(CX)
   369		MOVQ	g(CX), AX	// AX = g
   370		MOVQ	g_m(AX), BX	// BX = m
   371	
   372		MOVQ	m_gsignal(BX), DX	// DX = gsignal
   373		CMPQ	AX, DX
   374		JEQ	noswitch
   375	
   376		MOVQ	m_g0(BX), DX	// DX = g0
   377		CMPQ	AX, DX
   378		JEQ	noswitch
   379	
   380		MOVQ	m_curg(BX), R8
   381		CMPQ	AX, R8
   382		JEQ	switch
   383		
   384		// Bad: g is not gsignal, not g0, not curg. What is it?
   385		MOVQ	$runtime·badsystemstack(SB), AX
   386		CALL	AX
   387	
   388	switch:
   389		// save our state in g->sched. Pretend to
   390		// be systemstack_switch if the G stack is scanned.
   391		MOVQ	$runtime·systemstack_switch(SB), SI
   392		MOVQ	SI, (g_sched+gobuf_pc)(AX)
   393		MOVQ	SP, (g_sched+gobuf_sp)(AX)
   394		MOVQ	AX, (g_sched+gobuf_g)(AX)
   395		MOVQ	BP, (g_sched+gobuf_bp)(AX)
   396	
   397		// switch to g0
   398		MOVQ	DX, g(CX)
   399		MOVQ	(g_sched+gobuf_sp)(DX), BX
   400		// make it look like mstart called systemstack on g0, to stop traceback
   401		SUBQ	$8, BX
   402		MOVQ	$runtime·mstart(SB), DX
   403		MOVQ	DX, 0(BX)
   404		MOVQ	BX, SP
   405	
   406		// call target function
   407		MOVQ	DI, DX
   408		MOVQ	0(DI), DI
   409		CALL	DI
   410	
   411		// switch back to g
   412		get_tls(CX)
   413		MOVQ	g(CX), AX
   414		MOVQ	g_m(AX), BX
   415		MOVQ	m_curg(BX), AX
   416		MOVQ	AX, g(CX)
   417		MOVQ	(g_sched+gobuf_sp)(AX), SP
   418		MOVQ	$0, (g_sched+gobuf_sp)(AX)
   419		RET
   420	
   421	noswitch:
   422		// already on m stack; tail call the function
   423		// Using a tail call here cleans up tracebacks since we won't stop
   424		// at an intermediate systemstack.
   425		MOVQ	DI, DX
   426		MOVQ	0(DI), DI
   427		JMP	DI
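
// A rough Go-level sketch of the control flow above (runtime internals, not
// runnable code):
//
//	func systemstack(fn func()) {
//		gp := getg()
//		switch gp {
//		case gp.m.gsignal, gp.m.g0:
//			fn() // already on a system stack: tail call
//		case gp.m.curg:
//			// save gp.sched, pretending to be systemstack_switch,
//			// switch to the g0 stack, call fn(), switch back,
//			// and clear the saved stack pointer
//		default:
//			badsystemstack()
//		}
//	}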
   428	
   429	/*
   430	 * support for morestack
   431	 */
   432	
   433	// Called during function prolog when more stack is needed.
   434	//
   435	// The traceback routines see morestack on a g0 as being
   436	// the top of a stack (for example, morestack calling newstack
   437	// calling the scheduler calling newm calling gc), so we must
   438	// record an argument size. For that purpose, it has no arguments.
   439	TEXT runtime·morestack(SB),NOSPLIT,$0-0
   440		// Cannot grow scheduler stack (m->g0).
   441		get_tls(CX)
   442		MOVQ	g(CX), BX
   443		MOVQ	g_m(BX), BX
   444		MOVQ	m_g0(BX), SI
   445		CMPQ	g(CX), SI
   446		JNE	3(PC)
   447		CALL	runtime·badmorestackg0(SB)
   448		INT	$3
   449	
   450		// Cannot grow signal stack (m->gsignal).
   451		MOVQ	m_gsignal(BX), SI
   452		CMPQ	g(CX), SI
   453		JNE	3(PC)
   454		CALL	runtime·badmorestackgsignal(SB)
   455		INT	$3
   456	
   457		// Called from f.
   458		// Set m->morebuf to f's caller.
   459		MOVQ	8(SP), AX	// f's caller's PC
   460		MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   461		LEAQ	16(SP), AX	// f's caller's SP
   462		MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   463		get_tls(CX)
   464		MOVQ	g(CX), SI
   465		MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   466	
   467		// Set g->sched to context in f.
   468		MOVQ	0(SP), AX // f's PC
   469		MOVQ	AX, (g_sched+gobuf_pc)(SI)
   470		MOVQ	SI, (g_sched+gobuf_g)(SI)
   471		LEAQ	8(SP), AX // f's SP
   472		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   473		MOVQ	BP, (g_sched+gobuf_bp)(SI)
   474		MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   475	
   476		// Call newstack on m->g0's stack.
   477		MOVQ	m_g0(BX), BX
   478		MOVQ	BX, g(CX)
   479		MOVQ	(g_sched+gobuf_sp)(BX), SP
   480		CALL	runtime·newstack(SB)
   481		MOVQ	$0, 0x1003	// crash if newstack returns
   482		RET
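
// What morestack arranges, in Go-like pseudocode (not runnable; the real work
// happens in newstack, which grows the stack and then restarts f from g.sched):
//
//	func morestack() {
//		gp := getg()
//		if gp == gp.m.g0 || gp == gp.m.gsignal {
//			// cannot grow these stacks: abort
//		}
//		gp.m.morebuf = {pc: f's caller's PC, sp: f's caller's SP, g: gp}
//		gp.sched = {pc: f's PC, sp: f's SP, bp: BP, ctxt: DX}
//		// switch to the g0 stack and call:
//		newstack()
//	}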
   483	
   484	// morestack but not preserving ctxt.
   485	TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   486		MOVL	$0, DX
   487		JMP	runtime·morestack(SB)
   488	
   489	// reflectcall: call a function with the given argument list
   490	// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   491	// We don't have variable-sized frames, so we use a small number
   492	// of constant-sized-frame functions to encode a few bits of size in the pc.
   493	// Caution: ugly multiline assembly macros in your future!
   494	
   495	#define DISPATCH(NAME,MAXSIZE)		\
   496		CMPQ	CX, $MAXSIZE;		\
   497		JA	3(PC);			\
   498		MOVQ	$NAME(SB), AX;		\
   499		JMP	AX
   500	// Note: can't just "JMP NAME(SB)" - bad inlining results.
   501	
   502	TEXT reflect·call(SB), NOSPLIT, $0-0
   503		JMP	·reflectcall(SB)
   504	
   505	TEXT ·reflectcall(SB), NOSPLIT, $0-32
   506		MOVLQZX argsize+24(FP), CX
   507		DISPATCH(runtime·call32, 32)
   508		DISPATCH(runtime·call64, 64)
   509		DISPATCH(runtime·call128, 128)
   510		DISPATCH(runtime·call256, 256)
   511		DISPATCH(runtime·call512, 512)
   512		DISPATCH(runtime·call1024, 1024)
   513		DISPATCH(runtime·call2048, 2048)
   514		DISPATCH(runtime·call4096, 4096)
   515		DISPATCH(runtime·call8192, 8192)
   516		DISPATCH(runtime·call16384, 16384)
   517		DISPATCH(runtime·call32768, 32768)
   518		DISPATCH(runtime·call65536, 65536)
   519		DISPATCH(runtime·call131072, 131072)
   520		DISPATCH(runtime·call262144, 262144)
   521		DISPATCH(runtime·call524288, 524288)
   522		DISPATCH(runtime·call1048576, 1048576)
   523		DISPATCH(runtime·call2097152, 2097152)
   524		DISPATCH(runtime·call4194304, 4194304)
   525		DISPATCH(runtime·call8388608, 8388608)
   526		DISPATCH(runtime·call16777216, 16777216)
   527		DISPATCH(runtime·call33554432, 33554432)
   528		DISPATCH(runtime·call67108864, 67108864)
   529		DISPATCH(runtime·call134217728, 134217728)
   530		DISPATCH(runtime·call268435456, 268435456)
   531		DISPATCH(runtime·call536870912, 536870912)
   532		DISPATCH(runtime·call1073741824, 1073741824)
   533		MOVQ	$runtime·badreflectcall(SB), AX
   534		JMP	AX
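
// The DISPATCH chain above routes to the smallest fixed-frame call routine
// whose frame can hold the arguments. The same selection, as a self-contained
// Go sketch (dispatchSize is a hypothetical helper; the sizes mirror the list
// above):
//
//	func dispatchSize(argsize uint32) uint32 {
//		for size := uint32(32); size <= 1<<30; size *= 2 {
//			if argsize <= size {
//				return size // i.e. jump to runtime·call<size>
//			}
//		}
//		panic("bad reflect call") // runtime·badreflectcall
//	}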
   535	
   536	#define CALLFN(NAME,MAXSIZE)			\
   537	TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   538		NO_LOCAL_POINTERS;			\
   539		/* copy arguments to stack */		\
   540		MOVQ	argptr+16(FP), SI;		\
   541		MOVLQZX argsize+24(FP), CX;		\
   542		MOVQ	SP, DI;				\
   543		REP;MOVSB;				\
   544		/* call function */			\
   545		MOVQ	f+8(FP), DX;			\
   546		PCDATA  $PCDATA_StackMapIndex, $0;	\
   547		CALL	(DX);				\
   548		/* copy return values back */		\
   549		MOVQ	argtype+0(FP), DX;		\
   550		MOVQ	argptr+16(FP), DI;		\
   551		MOVLQZX	argsize+24(FP), CX;		\
   552		MOVLQZX	retoffset+28(FP), BX;		\
   553		MOVQ	SP, SI;				\
   554		ADDQ	BX, DI;				\
   555		ADDQ	BX, SI;				\
   556		SUBQ	BX, CX;				\
   557		CALL	callRet<>(SB);			\
   558		RET
   559	
   560	// callRet copies return values back at the end of call*. This is a
   561	// separate function so it can allocate stack space for the arguments
   562	// to reflectcallmove. It does not follow the Go ABI; it expects its
   563	// arguments in registers.
   564	TEXT callRet<>(SB), NOSPLIT, $32-0
   565		NO_LOCAL_POINTERS
   566		MOVQ	DX, 0(SP)
   567		MOVQ	DI, 8(SP)
   568		MOVQ	SI, 16(SP)
   569		MOVQ	CX, 24(SP)
   570		CALL	runtime·reflectcallmove(SB)
   571		RET
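
// Each call<N> expanded from CALLFN behaves like this Go-level sketch
// (reflectcallmove is the real runtime helper called above; frame stands for
// the fixed-size stack frame, so this is pseudocode rather than runnable Go):
//
//	func callN(argtype *_type, f *funcval, argptr *byte, argsize, retoffset uint32) {
//		copy(frame[:argsize], argptr[:argsize]) // copy arguments onto our frame
//		f()                                     // call the target with that frame
//		// copy the results back, with write barriers:
//		reflectcallmove(argtype, argptr+retoffset, frame+retoffset, argsize-retoffset)
//	}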
   572	
   573	CALLFN(·call32, 32)
   574	CALLFN(·call64, 64)
   575	CALLFN(·call128, 128)
   576	CALLFN(·call256, 256)
   577	CALLFN(·call512, 512)
   578	CALLFN(·call1024, 1024)
   579	CALLFN(·call2048, 2048)
   580	CALLFN(·call4096, 4096)
   581	CALLFN(·call8192, 8192)
   582	CALLFN(·call16384, 16384)
   583	CALLFN(·call32768, 32768)
   584	CALLFN(·call65536, 65536)
   585	CALLFN(·call131072, 131072)
   586	CALLFN(·call262144, 262144)
   587	CALLFN(·call524288, 524288)
   588	CALLFN(·call1048576, 1048576)
   589	CALLFN(·call2097152, 2097152)
   590	CALLFN(·call4194304, 4194304)
   591	CALLFN(·call8388608, 8388608)
   592	CALLFN(·call16777216, 16777216)
   593	CALLFN(·call33554432, 33554432)
   594	CALLFN(·call67108864, 67108864)
   595	CALLFN(·call134217728, 134217728)
   596	CALLFN(·call268435456, 268435456)
   597	CALLFN(·call536870912, 536870912)
   598	CALLFN(·call1073741824, 1073741824)
   599	
   600	TEXT runtime·procyield(SB),NOSPLIT,$0-0
   601		MOVL	cycles+0(FP), AX
   602	again:
   603		PAUSE
   604		SUBL	$1, AX
   605		JNZ	again
   606		RET
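
// procyield is a calibrated busy-wait: it executes PAUSE the requested number
// of times. A Go-level sketch of the structure (PAUSE itself has no Go
// equivalent, so the loop body is only a comment):
//
//	func procyield(cycles uint32) {
//		for i := uint32(0); i < cycles; i++ {
//			// PAUSE: hint to the CPU that this is a spin-wait loop
//		}
//	}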
   607	
   608	
   609	TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   610		// Stores are already ordered on x86, so this is just a
   611		// compile barrier.
   612		RET
   613	
   614	// void jmpdefer(fn, sp);
   615	// called from deferreturn.
   616	// 1. pop the caller
   617	// 2. sub 5 bytes from the caller's return address
   618	// 3. jmp to the argument
   619	TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   620		MOVQ	fv+0(FP), DX	// fn
   621		MOVQ	argp+8(FP), BX	// caller sp
   622		LEAQ	-8(BX), SP	// caller sp after CALL
   623		MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   624		SUBQ	$5, (SP)	// return to CALL again
   625		MOVQ	0(DX), BX
   626		JMP	BX	// but first run the deferred function
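
// The 5 subtracted above is the length of the CALL instruction that invoked
// deferreturn, so when the deferred function eventually returns, that CALL is
// executed again and deferreturn gets a chance to run the next deferred call.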
   627	
   628	// Save state of caller into g->sched. Smashes R8, R9.
   629	TEXT gosave<>(SB),NOSPLIT,$0
   630		get_tls(R8)
   631		MOVQ	g(R8), R8
   632		MOVQ	0(SP), R9
   633		MOVQ	R9, (g_sched+gobuf_pc)(R8)
   634		LEAQ	8(SP), R9
   635		MOVQ	R9, (g_sched+gobuf_sp)(R8)
   636		MOVQ	$0, (g_sched+gobuf_ret)(R8)
   637		MOVQ	BP, (g_sched+gobuf_bp)(R8)
   638		// Assert ctxt is zero. See func save.
   639		MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   640		TESTQ	R9, R9
   641		JZ	2(PC)
   642		CALL	runtime·badctxt(SB)
   643		RET
   644	
   645	// func asmcgocall(fn, arg unsafe.Pointer) int32
   646	// Call fn(arg) on the scheduler stack,
   647	// aligned appropriately for the gcc ABI.
   648	// See cgocall.go for more details.
   649	TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   650		MOVQ	fn+0(FP), AX
   651		MOVQ	arg+8(FP), BX
   652	
   653		MOVQ	SP, DX
   654	
   655		// Figure out if we need to switch to m->g0 stack.
   656		// We get called to create new OS threads too, and those
   657		// come in on the m->g0 stack already.
   658		get_tls(CX)
   659		MOVQ	g(CX), R8
   660		CMPQ	R8, $0
   661		JEQ	nosave
   662		MOVQ	g_m(R8), R8
   663		MOVQ	m_g0(R8), SI
   664		MOVQ	g(CX), DI
   665		CMPQ	SI, DI
   666		JEQ	nosave
   667		MOVQ	m_gsignal(R8), SI
   668		CMPQ	SI, DI
   669		JEQ	nosave
   670		
   671		// Switch to system stack.
   672		MOVQ	m_g0(R8), SI
   673		CALL	gosave<>(SB)
   674		MOVQ	SI, g(CX)
   675		MOVQ	(g_sched+gobuf_sp)(SI), SP
   676	
   677		// Now on a scheduling stack (a pthread-created stack).
   678		// Make sure we have enough room for 4 stack-backed fast-call
   679		// registers as per windows amd64 calling convention.
   680		SUBQ	$64, SP
   681		ANDQ	$~15, SP	// alignment for gcc ABI
   682		MOVQ	DI, 48(SP)	// save g
   683		MOVQ	(g_stack+stack_hi)(DI), DI
   684		SUBQ	DX, DI
   685		MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   686		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   687		MOVQ	BX, CX		// CX = first argument in Win64
   688		CALL	AX
   689	
   690		// Restore registers, g, stack pointer.
   691		get_tls(CX)
   692		MOVQ	48(SP), DI
   693		MOVQ	(g_stack+stack_hi)(DI), SI
   694		SUBQ	40(SP), SI
   695		MOVQ	DI, g(CX)
   696		MOVQ	SI, SP
   697	
   698		MOVL	AX, ret+16(FP)
   699		RET
   700	
   701	nosave:
   702		// Running on a system stack, perhaps even without a g.
   703		// Having no g can happen during thread creation or thread teardown
   704		// (see needm/dropm on Solaris, for example).
   705		// This code is like the above sequence but without saving/restoring g
   706		// and without worrying about the stack moving out from under us
   707		// (because we're on a system stack, not a goroutine stack).
   708		// The above code could be used directly if already on a system stack,
   709		// but then the only path through this code would be a rare case on Solaris.
   710		// Using this code for all "already on system stack" calls exercises it more,
   711		// which should help keep it correct.
   712		SUBQ	$64, SP
   713		ANDQ	$~15, SP
   714		MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   715		MOVQ	DX, 40(SP)	// save original stack pointer
   716		MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   717		MOVQ	BX, CX		// CX = first argument in Win64
   718		CALL	AX
   719		MOVQ	40(SP), SI	// restore original stack pointer
   720		MOVQ	SI, SP
   721		MOVL	AX, ret+16(FP)
   722		RET
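
// Go-like pseudocode for the two paths above (runtime internals, not runnable):
//
//	func asmcgocall(fn, arg unsafe.Pointer) int32 {
//		if getg() == nil || already on the g0 or gsignal stack {
//			// nosave: align SP, call fn(arg) via the C ABI, restore SP
//		} else {
//			// save g's state (gosave), switch to m.g0's stack,
//			// align SP for the C ABI and record the depth below
//			// stack.hi (not the raw SP, since the goroutine stack
//			// may be copied during a callback),
//			// call fn(arg), then switch back and recompute SP
//		}
//		return result // fn's return value from AX
//	}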
   723	
   724	// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   725	// Turn the fn into a Go func (by taking its address) and call
   726	// cgocallback_gofunc.
   727	TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   728		LEAQ	fn+0(FP), AX
   729		MOVQ	AX, 0(SP)
   730		MOVQ	frame+8(FP), AX
   731		MOVQ	AX, 8(SP)
   732		MOVQ	framesize+16(FP), AX
   733		MOVQ	AX, 16(SP)
   734		MOVQ	ctxt+24(FP), AX
   735		MOVQ	AX, 24(SP)
   736		MOVQ	$runtime·cgocallback_gofunc(SB), AX
   737		CALL	AX
   738		RET
   739	
   740	// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   741	// See cgocall.go for more details.
   742	TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   743		NO_LOCAL_POINTERS
   744	
   745		// If g is nil, Go did not create the current thread.
   746		// Call needm to obtain one m for temporary use.
   747		// In this case, we're running on the thread stack, so there's
   748		// lots of space, but the linker doesn't know. Hide the call from
   749		// the linker analysis by using an indirect call through AX.
   750		get_tls(CX)
   751	#ifdef GOOS_windows
   752		MOVL	$0, BX
   753		CMPQ	CX, $0
   754		JEQ	2(PC)
   755	#endif
   756		MOVQ	g(CX), BX
   757		CMPQ	BX, $0
   758		JEQ	needm
   759		MOVQ	g_m(BX), BX
   760		MOVQ	BX, R8 // holds oldm until end of function
   761		JMP	havem
   762	needm:
   763		MOVQ	$0, 0(SP)
   764		MOVQ	$runtime·needm(SB), AX
   765		CALL	AX
   766		MOVQ	0(SP), R8
   767		get_tls(CX)
   768		MOVQ	g(CX), BX
   769		MOVQ	g_m(BX), BX
   770		
   771		// Set m->sched.sp = SP, so that if a panic happens
   772		// during the function we are about to execute, it will
   773		// have a valid SP to run on the g0 stack.
   774		// The next few lines (after the havem label)
   775		// will save this SP onto the stack and then write
   776		// the same SP back to m->sched.sp. That seems redundant,
   777		// but if an unrecovered panic happens, unwindm will
   778		// restore the g->sched.sp from the stack location
   779		// and then systemstack will try to use it. If we don't set it here,
   780		// that restored SP will be uninitialized (typically 0) and
   781		// will not be usable.
   782		MOVQ	m_g0(BX), SI
   783		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   784	
   785	havem:
   786		// Now there's a valid m, and we're running on its m->g0.
   787		// Save current m->g0->sched.sp on stack and then set it to SP.
   788		// Save current sp in m->g0->sched.sp in preparation for
   789		// switch back to m->curg stack.
   790		// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   791		MOVQ	m_g0(BX), SI
   792		MOVQ	(g_sched+gobuf_sp)(SI), AX
   793		MOVQ	AX, 0(SP)
   794		MOVQ	SP, (g_sched+gobuf_sp)(SI)
   795	
   796		// Switch to m->curg stack and call runtime.cgocallbackg.
   797		// Because we are taking over the execution of m->curg
   798		// but *not* resuming what had been running, we need to
   799		// save that information (m->curg->sched) so we can restore it.
   800		// We can restore m->curg->sched.sp easily, because calling
   801		// runtime.cgocallbackg leaves SP unchanged upon return.
   802		// To save m->curg->sched.pc, we push it onto the stack.
   803		// This has the added benefit that it looks to the traceback
   804		// routine like cgocallbackg is going to return to that
   805		// PC (because the frame we allocate below has the same
   806		// size as cgocallback_gofunc's frame declared above)
   807		// so that the traceback will seamlessly trace back into
   808		// the earlier calls.
   809		//
   810		// In the new goroutine, 8(SP) holds the saved R8.
   811		MOVQ	m_curg(BX), SI
   812		MOVQ	SI, g(CX)
   813		MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   814		MOVQ	(g_sched+gobuf_pc)(SI), BX
   815		MOVQ	BX, -8(DI)
   816		// Compute the size of the frame, including return PC and, if
   817		// GOEXPERIMENT=framepointer, the saved base pointer
   818		MOVQ	ctxt+24(FP), BX
   819		LEAQ	fv+0(FP), AX
   820		SUBQ	SP, AX
   821		SUBQ	AX, DI
   822		MOVQ	DI, SP
   823	
   824		MOVQ	R8, 8(SP)
   825		MOVQ	BX, 0(SP)
   826		CALL	runtime·cgocallbackg(SB)
   827		MOVQ	8(SP), R8
   828	
   829		// Compute the size of the frame again. FP and SP have
   830		// completely different values here than they did above,
   831		// but only their difference matters.
   832		LEAQ	fv+0(FP), AX
   833		SUBQ	SP, AX
   834	
   835		// Restore g->sched (== m->curg->sched) from saved values.
   836		get_tls(CX)
   837		MOVQ	g(CX), SI
   838		MOVQ	SP, DI
   839		ADDQ	AX, DI
   840		MOVQ	-8(DI), BX
   841		MOVQ	BX, (g_sched+gobuf_pc)(SI)
   842		MOVQ	DI, (g_sched+gobuf_sp)(SI)
   843	
   844		// Switch back to m->g0's stack and restore m->g0->sched.sp.
   845		// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   846		// so we do not have to restore it.)
   847		MOVQ	g(CX), BX
   848		MOVQ	g_m(BX), BX
   849		MOVQ	m_g0(BX), SI
   850		MOVQ	SI, g(CX)
   851		MOVQ	(g_sched+gobuf_sp)(SI), SP
   852		MOVQ	0(SP), AX
   853		MOVQ	AX, (g_sched+gobuf_sp)(SI)
   854		
   855		// If the m on entry was nil, we called needm above to borrow an m
   856		// for the duration of the call. Since the call is over, return it with dropm.
   857		CMPQ	R8, $0
   858		JNE 3(PC)
   859		MOVQ	$runtime·dropm(SB), AX
   860		CALL	AX
   861	
   862		// Done!
   863		RET
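
// The overall shape of the callback path, as Go-like pseudocode (not runnable):
//
//	func cgocallback_gofunc(fn, frame, framesize, ctxt) {
//		if getg() == nil {
//			needm()                 // borrow an m for this C-created thread
//		}
//		// save g0.sched.sp, then point it at our SP so unwindm can find us
//		// switch to m.curg and its stack
//		cgocallbackg(ctxt)              // runs the Go callback fn(frame)
//		// switch back to g0 and restore g0.sched.sp
//		if the m was borrowed {
//			dropm()
//		}
//	}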
   864	
   865	// void setg(G*); set g. for use by needm.
   866	TEXT runtime·setg(SB), NOSPLIT, $0-8
   867		MOVQ	gg+0(FP), BX
   868	#ifdef GOOS_windows
   869		CMPQ	BX, $0
   870		JNE	settls
   871		MOVQ	$0, 0x28(GS)
   872		RET
   873	settls:
   874		MOVQ	g_m(BX), AX
   875		LEAQ	m_tls(AX), AX
   876		MOVQ	AX, 0x28(GS)
   877	#endif
   878		get_tls(CX)
   879		MOVQ	BX, g(CX)
   880		RET
   881	
   882	// void setg_gcc(G*); set g called from gcc.
   883	TEXT setg_gcc<>(SB),NOSPLIT,$0
   884		get_tls(AX)
   885		MOVQ	DI, g(AX)
   886		RET
   887	
   888	// check that SP is in range [g->stack.lo, g->stack.hi)
   889	TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   890		get_tls(CX)
   891		MOVQ	g(CX), AX
   892		CMPQ	(g_stack+stack_hi)(AX), SP
   893		JHI	2(PC)
   894		INT	$3
   895		CMPQ	SP, (g_stack+stack_lo)(AX)
   896		JHI	2(PC)
   897		INT	$3
   898		RET
   899	
   900	// func cputicks() int64
   901	TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   902		CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   903		JNE	mfence
   904		LFENCE
   905		JMP	done
   906	mfence:
   907		MFENCE
   908	done:
   909		RDTSC
   910		SHLQ	$32, DX
   911		ADDQ	DX, AX
   912		MOVQ	AX, ret+0(FP)
   913		RET
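
// Equivalent logic in Go-like pseudocode (lfence, mfence and rdtsc stand for
// the machine instructions, not real Go functions):
//
//	func cputicks() int64 {
//		if lfenceBeforeRdtsc {
//			lfence()
//		} else {
//			mfence()
//		}
//		lo, hi := rdtsc()
//		return int64(hi)<<32 + int64(lo)
//	}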
   914	
   915	// hash function using AES hardware instructions
   916	TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   917		MOVQ	p+0(FP), AX	// ptr to data
   918		MOVQ	s+16(FP), CX	// size
   919		LEAQ	ret+24(FP), DX
   920		JMP	runtime·aeshashbody(SB)
   921	
   922	TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   923		MOVQ	p+0(FP), AX	// ptr to string struct
   924		MOVQ	8(AX), CX	// length of string
   925		MOVQ	(AX), AX	// string data
   926		LEAQ	ret+16(FP), DX
   927		JMP	runtime·aeshashbody(SB)
   928	
   929	// AX: data
   930	// CX: length
   931	// DX: address to put return value
   932	TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   933		// Fill an SSE register with our seeds.
   934		MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   935		PINSRW	$4, CX, X0			// 16 bits of length
   936		PSHUFHW $0, X0, X0			// repeat length 4 times total
   937		MOVO	X0, X1				// save unscrambled seed
   938		PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   939		AESENC	X0, X0				// scramble seed
   940	
   941		CMPQ	CX, $16
   942		JB	aes0to15
   943		JE	aes16
   944		CMPQ	CX, $32
   945		JBE	aes17to32
   946		CMPQ	CX, $64
   947		JBE	aes33to64
   948		CMPQ	CX, $128
   949		JBE	aes65to128
   950		JMP	aes129plus
   951	
   952	aes0to15:
   953		TESTQ	CX, CX
   954		JE	aes0
   955	
   956		ADDQ	$16, AX
   957		TESTW	$0xff0, AX
   958		JE	endofpage
   959	
   960		// 16 bytes loaded at this address won't cross
   961		// a page boundary, so we can load it directly.
   962		MOVOU	-16(AX), X1
   963		ADDQ	CX, CX
   964		MOVQ	$masks<>(SB), AX
   965		PAND	(AX)(CX*8), X1
   966	final1:
   967		PXOR	X0, X1	// xor data with seed
   968		AESENC	X1, X1	// scramble combo 3 times
   969		AESENC	X1, X1
   970		AESENC	X1, X1
   971		MOVQ	X1, (DX)
   972		RET
   973	
   974	endofpage:
   975		// address ends in 1111xxxx. Might be up against
   976		// a page boundary, so load ending at last byte.
   977		// Then shift bytes down using pshufb.
   978		MOVOU	-32(AX)(CX*1), X1
   979		ADDQ	CX, CX
   980		MOVQ	$shifts<>(SB), AX
   981		PSHUFB	(AX)(CX*8), X1
   982		JMP	final1
   983	
   984	aes0:
   985		// Return scrambled input seed
   986		AESENC	X0, X0
   987		MOVQ	X0, (DX)
   988		RET
   989	
   990	aes16:
   991		MOVOU	(AX), X1
   992		JMP	final1
   993	
   994	aes17to32:
   995		// make second starting seed
   996		PXOR	runtime·aeskeysched+16(SB), X1
   997		AESENC	X1, X1
   998		
   999		// load data to be hashed
  1000		MOVOU	(AX), X2
  1001		MOVOU	-16(AX)(CX*1), X3
  1002	
  1003		// xor with seed
  1004		PXOR	X0, X2
  1005		PXOR	X1, X3
  1006	
  1007		// scramble 3 times
  1008		AESENC	X2, X2
  1009		AESENC	X3, X3
  1010		AESENC	X2, X2
  1011		AESENC	X3, X3
  1012		AESENC	X2, X2
  1013		AESENC	X3, X3
  1014	
  1015		// combine results
  1016		PXOR	X3, X2
  1017		MOVQ	X2, (DX)
  1018		RET
  1019	
  1020	aes33to64:
  1021		// make 3 more starting seeds
  1022		MOVO	X1, X2
  1023		MOVO	X1, X3
  1024		PXOR	runtime·aeskeysched+16(SB), X1
  1025		PXOR	runtime·aeskeysched+32(SB), X2
  1026		PXOR	runtime·aeskeysched+48(SB), X3
  1027		AESENC	X1, X1
  1028		AESENC	X2, X2
  1029		AESENC	X3, X3
  1030		
  1031		MOVOU	(AX), X4
  1032		MOVOU	16(AX), X5
  1033		MOVOU	-32(AX)(CX*1), X6
  1034		MOVOU	-16(AX)(CX*1), X7
  1035	
  1036		PXOR	X0, X4
  1037		PXOR	X1, X5
  1038		PXOR	X2, X6
  1039		PXOR	X3, X7
  1040		
  1041		AESENC	X4, X4
  1042		AESENC	X5, X5
  1043		AESENC	X6, X6
  1044		AESENC	X7, X7
  1045		
  1046		AESENC	X4, X4
  1047		AESENC	X5, X5
  1048		AESENC	X6, X6
  1049		AESENC	X7, X7
  1050		
  1051		AESENC	X4, X4
  1052		AESENC	X5, X5
  1053		AESENC	X6, X6
  1054		AESENC	X7, X7
  1055	
  1056		PXOR	X6, X4
  1057		PXOR	X7, X5
  1058		PXOR	X5, X4
  1059		MOVQ	X4, (DX)
  1060		RET
  1061	
  1062	aes65to128:
  1063		// make 7 more starting seeds
  1064		MOVO	X1, X2
  1065		MOVO	X1, X3
  1066		MOVO	X1, X4
  1067		MOVO	X1, X5
  1068		MOVO	X1, X6
  1069		MOVO	X1, X7
  1070		PXOR	runtime·aeskeysched+16(SB), X1
  1071		PXOR	runtime·aeskeysched+32(SB), X2
  1072		PXOR	runtime·aeskeysched+48(SB), X3
  1073		PXOR	runtime·aeskeysched+64(SB), X4
  1074		PXOR	runtime·aeskeysched+80(SB), X5
  1075		PXOR	runtime·aeskeysched+96(SB), X6
  1076		PXOR	runtime·aeskeysched+112(SB), X7
  1077		AESENC	X1, X1
  1078		AESENC	X2, X2
  1079		AESENC	X3, X3
  1080		AESENC	X4, X4
  1081		AESENC	X5, X5
  1082		AESENC	X6, X6
  1083		AESENC	X7, X7
  1084	
  1085		// load data
  1086		MOVOU	(AX), X8
  1087		MOVOU	16(AX), X9
  1088		MOVOU	32(AX), X10
  1089		MOVOU	48(AX), X11
  1090		MOVOU	-64(AX)(CX*1), X12
  1091		MOVOU	-48(AX)(CX*1), X13
  1092		MOVOU	-32(AX)(CX*1), X14
  1093		MOVOU	-16(AX)(CX*1), X15
  1094	
  1095		// xor with seed
  1096		PXOR	X0, X8
  1097		PXOR	X1, X9
  1098		PXOR	X2, X10
  1099		PXOR	X3, X11
  1100		PXOR	X4, X12
  1101		PXOR	X5, X13
  1102		PXOR	X6, X14
  1103		PXOR	X7, X15
  1104	
  1105		// scramble 3 times
  1106		AESENC	X8, X8
  1107		AESENC	X9, X9
  1108		AESENC	X10, X10
  1109		AESENC	X11, X11
  1110		AESENC	X12, X12
  1111		AESENC	X13, X13
  1112		AESENC	X14, X14
  1113		AESENC	X15, X15
  1114	
  1115		AESENC	X8, X8
  1116		AESENC	X9, X9
  1117		AESENC	X10, X10
  1118		AESENC	X11, X11
  1119		AESENC	X12, X12
  1120		AESENC	X13, X13
  1121		AESENC	X14, X14
  1122		AESENC	X15, X15
  1123	
  1124		AESENC	X8, X8
  1125		AESENC	X9, X9
  1126		AESENC	X10, X10
  1127		AESENC	X11, X11
  1128		AESENC	X12, X12
  1129		AESENC	X13, X13
  1130		AESENC	X14, X14
  1131		AESENC	X15, X15
  1132	
  1133		// combine results
  1134		PXOR	X12, X8
  1135		PXOR	X13, X9
  1136		PXOR	X14, X10
  1137		PXOR	X15, X11
  1138		PXOR	X10, X8
  1139		PXOR	X11, X9
  1140		PXOR	X9, X8
  1141		MOVQ	X8, (DX)
  1142		RET
  1143	
  1144	aes129plus:
  1145		// make 7 more starting seeds
  1146		MOVO	X1, X2
  1147		MOVO	X1, X3
  1148		MOVO	X1, X4
  1149		MOVO	X1, X5
  1150		MOVO	X1, X6
  1151		MOVO	X1, X7
  1152		PXOR	runtime·aeskeysched+16(SB), X1
  1153		PXOR	runtime·aeskeysched+32(SB), X2
  1154		PXOR	runtime·aeskeysched+48(SB), X3
  1155		PXOR	runtime·aeskeysched+64(SB), X4
  1156		PXOR	runtime·aeskeysched+80(SB), X5
  1157		PXOR	runtime·aeskeysched+96(SB), X6
  1158		PXOR	runtime·aeskeysched+112(SB), X7
  1159		AESENC	X1, X1
  1160		AESENC	X2, X2
  1161		AESENC	X3, X3
  1162		AESENC	X4, X4
  1163		AESENC	X5, X5
  1164		AESENC	X6, X6
  1165		AESENC	X7, X7
  1166		
  1167		// start with last (possibly overlapping) block
  1168		MOVOU	-128(AX)(CX*1), X8
  1169		MOVOU	-112(AX)(CX*1), X9
  1170		MOVOU	-96(AX)(CX*1), X10
  1171		MOVOU	-80(AX)(CX*1), X11
  1172		MOVOU	-64(AX)(CX*1), X12
  1173		MOVOU	-48(AX)(CX*1), X13
  1174		MOVOU	-32(AX)(CX*1), X14
  1175		MOVOU	-16(AX)(CX*1), X15
  1176	
  1177		// xor in seed
  1178		PXOR	X0, X8
  1179		PXOR	X1, X9
  1180		PXOR	X2, X10
  1181		PXOR	X3, X11
  1182		PXOR	X4, X12
  1183		PXOR	X5, X13
  1184		PXOR	X6, X14
  1185		PXOR	X7, X15
  1186		
  1187		// compute number of remaining 128-byte blocks
  1188		DECQ	CX
  1189		SHRQ	$7, CX
  1190		
  1191	aesloop:
  1192		// scramble state
  1193		AESENC	X8, X8
  1194		AESENC	X9, X9
  1195		AESENC	X10, X10
  1196		AESENC	X11, X11
  1197		AESENC	X12, X12
  1198		AESENC	X13, X13
  1199		AESENC	X14, X14
  1200		AESENC	X15, X15
  1201	
  1202		// scramble state, xor in a block
  1203		MOVOU	(AX), X0
  1204		MOVOU	16(AX), X1
  1205		MOVOU	32(AX), X2
  1206		MOVOU	48(AX), X3
  1207		AESENC	X0, X8
  1208		AESENC	X1, X9
  1209		AESENC	X2, X10
  1210		AESENC	X3, X11
  1211		MOVOU	64(AX), X4
  1212		MOVOU	80(AX), X5
  1213		MOVOU	96(AX), X6
  1214		MOVOU	112(AX), X7
  1215		AESENC	X4, X12
  1216		AESENC	X5, X13
  1217		AESENC	X6, X14
  1218		AESENC	X7, X15
  1219	
  1220		ADDQ	$128, AX
  1221		DECQ	CX
  1222		JNE	aesloop
  1223	
  1224		// 3 more scrambles to finish
  1225		AESENC	X8, X8
  1226		AESENC	X9, X9
  1227		AESENC	X10, X10
  1228		AESENC	X11, X11
  1229		AESENC	X12, X12
  1230		AESENC	X13, X13
  1231		AESENC	X14, X14
  1232		AESENC	X15, X15
  1233		AESENC	X8, X8
  1234		AESENC	X9, X9
  1235		AESENC	X10, X10
  1236		AESENC	X11, X11
  1237		AESENC	X12, X12
  1238		AESENC	X13, X13
  1239		AESENC	X14, X14
  1240		AESENC	X15, X15
  1241		AESENC	X8, X8
  1242		AESENC	X9, X9
  1243		AESENC	X10, X10
  1244		AESENC	X11, X11
  1245		AESENC	X12, X12
  1246		AESENC	X13, X13
  1247		AESENC	X14, X14
  1248		AESENC	X15, X15
  1249	
  1250		PXOR	X12, X8
  1251		PXOR	X13, X9
  1252		PXOR	X14, X10
  1253		PXOR	X15, X11
  1254		PXOR	X10, X8
  1255		PXOR	X11, X9
  1256		PXOR	X9, X8
  1257		MOVQ	X8, (DX)
  1258		RET
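
// Summary of the scheme above: the per-table seed and the input length are
// packed into X0 and scrambled once against the per-process key schedule;
// the input is then loaded in 16-byte blocks (masked or overlapping at the
// edges), xored with per-block seeds, put through three AESENC rounds, and
// the block states are folded together with PXOR into one 64-bit result.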
  1259		
  1260	TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1261		MOVQ	p+0(FP), AX	// ptr to data
  1262		MOVQ	h+8(FP), X0	// seed
  1263		PINSRD	$2, (AX), X0	// data
  1264		AESENC	runtime·aeskeysched+0(SB), X0
  1265		AESENC	runtime·aeskeysched+16(SB), X0
  1266		AESENC	runtime·aeskeysched+32(SB), X0
  1267		MOVQ	X0, ret+16(FP)
  1268		RET
  1269	
  1270	TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1271		MOVQ	p+0(FP), AX	// ptr to data
  1272		MOVQ	h+8(FP), X0	// seed
  1273		PINSRQ	$1, (AX), X0	// data
  1274		AESENC	runtime·aeskeysched+0(SB), X0
  1275		AESENC	runtime·aeskeysched+16(SB), X0
  1276		AESENC	runtime·aeskeysched+32(SB), X0
  1277		MOVQ	X0, ret+16(FP)
  1278		RET
  1279	
  1280	// simple mask to get rid of data in the high part of the register.
  1281	DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1282	DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1283	DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1284	DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1285	DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1286	DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1287	DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1288	DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1289	DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1290	DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1291	DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1292	DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1293	DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1294	DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1295	DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1296	DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1297	DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1298	DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1299	DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1300	DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1301	DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1302	DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1303	DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1304	DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1305	DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1306	DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1307	DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1308	DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1309	DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1310	DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1311	DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1312	DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1313	GLOBL masks<>(SB),RODATA,$256
  1314	
  1315	TEXT ·checkASM(SB),NOSPLIT,$0-1
  1316	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1317		MOVQ	$masks<>(SB), AX
  1318		MOVQ	$shifts<>(SB), BX
  1319		ORQ	BX, AX
  1320		TESTQ	$15, AX
  1321		SETEQ	ret+0(FP)
  1322		RET
  1323	
  1324	// these are arguments to pshufb. They move data down from
  1325	// the high bytes of the register to the low bytes of the register.
  1326	// index is how many bytes to move.
  1327	DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1328	DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1329	DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1330	DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1331	DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1332	DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1333	DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1334	DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1335	DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1336	DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1337	DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1338	DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1339	DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1340	DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1341	DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1342	DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1343	DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1344	DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1345	DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1346	DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1347	DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1348	DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1349	DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1350	DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1351	DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1352	DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1353	DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1354	DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1355	DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1356	DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1357	DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1358	DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1359	GLOBL shifts<>(SB),RODATA,$256
  1360	
  1361	// memequal(p, q unsafe.Pointer, size uintptr) bool
  1362	TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1363		MOVQ	a+0(FP), SI
  1364		MOVQ	b+8(FP), DI
  1365		CMPQ	SI, DI
  1366		JEQ	eq
  1367		MOVQ	size+16(FP), BX
  1368		LEAQ	ret+24(FP), AX
  1369		JMP	runtime·memeqbody(SB)
  1370	eq:
  1371		MOVB	$1, ret+24(FP)
  1372		RET
  1373	
  1374	// memequal_varlen(a, b unsafe.Pointer) bool
  1375	TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1376		MOVQ	a+0(FP), SI
  1377		MOVQ	b+8(FP), DI
  1378		CMPQ	SI, DI
  1379		JEQ	eq
  1380		MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1381		LEAQ	ret+16(FP), AX
  1382		JMP	runtime·memeqbody(SB)
  1383	eq:
  1384		MOVB	$1, ret+16(FP)
  1385		RET
  1386	
  1387	// a in SI
  1388	// b in DI
  1389	// count in BX
  1390	// address of result byte in AX
  1391	TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1392		CMPQ	BX, $8
  1393		JB	small
  1394		CMPQ	BX, $64
  1395		JB	bigloop
  1396		CMPB    runtime·support_avx2(SB), $1
  1397		JE	hugeloop_avx2
  1398		
  1399		// 64 bytes at a time using xmm registers
  1400	hugeloop:
  1401		CMPQ	BX, $64
  1402		JB	bigloop
  1403		MOVOU	(SI), X0
  1404		MOVOU	(DI), X1
  1405		MOVOU	16(SI), X2
  1406		MOVOU	16(DI), X3
  1407		MOVOU	32(SI), X4
  1408		MOVOU	32(DI), X5
  1409		MOVOU	48(SI), X6
  1410		MOVOU	48(DI), X7
  1411		PCMPEQB	X1, X0
  1412		PCMPEQB	X3, X2
  1413		PCMPEQB	X5, X4
  1414		PCMPEQB	X7, X6
  1415		PAND	X2, X0
  1416		PAND	X6, X4
  1417		PAND	X4, X0
  1418		PMOVMSKB X0, DX
  1419		ADDQ	$64, SI
  1420		ADDQ	$64, DI
  1421		SUBQ	$64, BX
  1422		CMPL	DX, $0xffff
  1423		JEQ	hugeloop
  1424		MOVB	$0, (AX)
  1425		RET
  1426	
  1427		// 64 bytes at a time using ymm registers
  1428	hugeloop_avx2:
  1429		CMPQ	BX, $64
  1430		JB	bigloop_avx2
  1431		VMOVDQU	(SI), Y0
  1432		VMOVDQU	(DI), Y1
  1433		VMOVDQU	32(SI), Y2
  1434		VMOVDQU	32(DI), Y3
  1435		VPCMPEQB	Y1, Y0, Y4
  1436		VPCMPEQB	Y2, Y3, Y5
  1437		VPAND	Y4, Y5, Y6
  1438		VPMOVMSKB Y6, DX
  1439		ADDQ	$64, SI
  1440		ADDQ	$64, DI
  1441		SUBQ	$64, BX
  1442		CMPL	DX, $0xffffffff
  1443		JEQ	hugeloop_avx2
  1444		VZEROUPPER
  1445		MOVB	$0, (AX)
  1446		RET
  1447	
  1448	bigloop_avx2:
  1449		VZEROUPPER
  1450	
  1451		// 8 bytes at a time using 64-bit register
  1452	bigloop:
  1453		CMPQ	BX, $8
  1454		JBE	leftover
  1455		MOVQ	(SI), CX
  1456		MOVQ	(DI), DX
  1457		ADDQ	$8, SI
  1458		ADDQ	$8, DI
  1459		SUBQ	$8, BX
  1460		CMPQ	CX, DX
  1461		JEQ	bigloop
  1462		MOVB	$0, (AX)
  1463		RET
  1464	
  1465		// remaining 0-8 bytes
  1466	leftover:
  1467		MOVQ	-8(SI)(BX*1), CX
  1468		MOVQ	-8(DI)(BX*1), DX
  1469		CMPQ	CX, DX
  1470		SETEQ	(AX)
  1471		RET
  1472	
  1473	small:
  1474		CMPQ	BX, $0
  1475		JEQ	equal
  1476	
  1477		LEAQ	0(BX*8), CX
  1478		NEGQ	CX
  1479	
  1480		CMPB	SI, $0xf8
  1481		JA	si_high
  1482	
  1483		// load at SI won't cross a page boundary.
  1484		MOVQ	(SI), SI
  1485		JMP	si_finish
  1486	si_high:
  1487	// address ends in 11111xxx. Load the 8 bytes ending at the last byte we want, then shift them down into position.
  1488		MOVQ	-8(SI)(BX*1), SI
  1489		SHRQ	CX, SI
  1490	si_finish:
  1491	
  1492		// same for DI.
  1493		CMPB	DI, $0xf8
  1494		JA	di_high
  1495		MOVQ	(DI), DI
  1496		JMP	di_finish
  1497	di_high:
  1498		MOVQ	-8(DI)(BX*1), DI
  1499		SHRQ	CX, DI
  1500	di_finish:
  1501	
  1502		SUBQ	SI, DI
  1503		SHLQ	CX, DI
  1504	equal:
  1505		SETEQ	(AX)
  1506		RET
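
// The tiering above, as a portable Go sketch (the assembly compares 64 bytes
// per step with SSE/AVX2 and finishes with one overlapping or masked load;
// memEqualSketch is a hypothetical name and needs encoding/binary):
//
//	func memEqualSketch(a, b []byte) bool {
//		if len(a) != len(b) {
//			return false
//		}
//		i := 0
//		for len(a)-i >= 8 { // the assembly also has 64-byte-wide tiers
//			if binary.LittleEndian.Uint64(a[i:]) != binary.LittleEndian.Uint64(b[i:]) {
//				return false
//			}
//			i += 8
//		}
//		for ; i < len(a); i++ { // assembly: one 8-byte overlapping/masked load
//			if a[i] != b[i] {
//				return false
//			}
//		}
//		return true
//	}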
  1507	
  1508	TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1509		MOVQ	s1_base+0(FP), SI
  1510		MOVQ	s1_len+8(FP), BX
  1511		MOVQ	s2_base+16(FP), DI
  1512		MOVQ	s2_len+24(FP), DX
  1513		LEAQ	ret+32(FP), R9
  1514		JMP	runtime·cmpbody(SB)
  1515	
  1516	TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1517		MOVQ	s1+0(FP), SI
  1518		MOVQ	s1+8(FP), BX
  1519		MOVQ	s2+24(FP), DI
  1520		MOVQ	s2+32(FP), DX
  1521		LEAQ	res+48(FP), R9
  1522		JMP	runtime·cmpbody(SB)
  1523	
  1524	// input:
  1525	//   SI = a
  1526	//   DI = b
  1527	//   BX = alen
  1528	//   DX = blen
  1529	//   R9 = address of output word (stores -1/0/1 here)
  1530	TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1531		CMPQ	SI, DI
  1532		JEQ	allsame
  1533		CMPQ	BX, DX
  1534		MOVQ	DX, R8
  1535		CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1536		CMPQ	R8, $8
  1537		JB	small
  1538	
  1539		CMPQ	R8, $63
  1540		JBE	loop
  1541		CMPB    runtime·support_avx2(SB), $1
  1542		JEQ     big_loop_avx2
  1543		JMP	big_loop
  1544	loop:
  1545		CMPQ	R8, $16
  1546		JBE	_0through16
  1547		MOVOU	(SI), X0
  1548		MOVOU	(DI), X1
  1549		PCMPEQB X0, X1
  1550		PMOVMSKB X1, AX
  1551		XORQ	$0xffff, AX	// convert EQ to NE
  1552		JNE	diff16	// branch if at least one byte is not equal
  1553		ADDQ	$16, SI
  1554		ADDQ	$16, DI
  1555		SUBQ	$16, R8
  1556		JMP	loop
  1557		
  1558	diff64:
  1559		ADDQ	$48, SI
  1560		ADDQ	$48, DI
  1561		JMP	diff16
  1562	diff48:
  1563		ADDQ	$32, SI
  1564		ADDQ	$32, DI
  1565		JMP	diff16
  1566	diff32:
  1567		ADDQ	$16, SI
  1568		ADDQ	$16, DI
  1569		// AX = bit mask of differences
  1570	diff16:
  1571		BSFQ	AX, BX	// index of first byte that differs
  1572		XORQ	AX, AX
  1573		MOVB	(SI)(BX*1), CX
  1574		CMPB	CX, (DI)(BX*1)
  1575		SETHI	AX
  1576		LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1577		MOVQ	AX, (R9)
  1578		RET
  1579	
  1580		// 0 through 16 bytes left, alen>=8, blen>=8
  1581	_0through16:
  1582		CMPQ	R8, $8
  1583		JBE	_0through8
  1584		MOVQ	(SI), AX
  1585		MOVQ	(DI), CX
  1586		CMPQ	AX, CX
  1587		JNE	diff8
  1588	_0through8:
  1589		MOVQ	-8(SI)(R8*1), AX
  1590		MOVQ	-8(DI)(R8*1), CX
  1591		CMPQ	AX, CX
  1592		JEQ	allsame
  1593	
  1594		// AX and CX contain parts of a and b that differ.
  1595	diff8:
  1596		BSWAPQ	AX	// reverse order of bytes
  1597		BSWAPQ	CX
  1598		XORQ	AX, CX
  1599		BSRQ	CX, CX	// index of highest bit difference
  1600		SHRQ	CX, AX	// move a's bit to bottom
  1601		ANDQ	$1, AX	// mask bit
  1602		LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1603		MOVQ	AX, (R9)
  1604		RET
  1605	
  1606		// 0-7 bytes in common
  1607	small:
  1608		LEAQ	(R8*8), CX	// bytes left -> bits left
  1609	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
  1610		JEQ	allsame
  1611	
  1612	// load bytes of a into high bytes of SI
  1613		CMPB	SI, $0xf8
  1614		JA	si_high
  1615		MOVQ	(SI), SI
  1616		JMP	si_finish
  1617	si_high:
  1618		MOVQ	-8(SI)(R8*1), SI
  1619		SHRQ	CX, SI
  1620	si_finish:
  1621		SHLQ	CX, SI
  1622	
  1623	// load bytes of b into high bytes of DI
  1624		CMPB	DI, $0xf8
  1625		JA	di_high
  1626		MOVQ	(DI), DI
  1627		JMP	di_finish
  1628	di_high:
  1629		MOVQ	-8(DI)(R8*1), DI
  1630		SHRQ	CX, DI
  1631	di_finish:
  1632		SHLQ	CX, DI
  1633	
  1634		BSWAPQ	SI	// reverse order of bytes
  1635		BSWAPQ	DI
  1636		XORQ	SI, DI	// find bit differences
  1637		JEQ	allsame
  1638		BSRQ	DI, CX	// index of highest bit difference
  1639		SHRQ	CX, SI	// move a's bit to bottom
  1640		ANDQ	$1, SI	// mask bit
  1641		LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1642		MOVQ	AX, (R9)
  1643		RET
  1644	
  1645	allsame:
  1646		XORQ	AX, AX
  1647		XORQ	CX, CX
  1648		CMPQ	BX, DX
  1649		SETGT	AX	// 1 if alen > blen
  1650		SETEQ	CX	// 1 if alen == blen
  1651		LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1652		MOVQ	AX, (R9)
  1653		RET
  1654	
  1655		// this works for >= 64 bytes of data.
  1656	big_loop:
  1657		MOVOU	(SI), X0
  1658		MOVOU	(DI), X1
  1659		PCMPEQB X0, X1
  1660		PMOVMSKB X1, AX
  1661		XORQ	$0xffff, AX
  1662		JNE	diff16
  1663	
  1664		MOVOU	16(SI), X0
  1665		MOVOU	16(DI), X1
  1666		PCMPEQB X0, X1
  1667		PMOVMSKB X1, AX
  1668		XORQ	$0xffff, AX
  1669		JNE	diff32
  1670	
  1671		MOVOU	32(SI), X0
  1672		MOVOU	32(DI), X1
  1673		PCMPEQB X0, X1
  1674		PMOVMSKB X1, AX
  1675		XORQ	$0xffff, AX
  1676		JNE	diff48
  1677	
  1678		MOVOU	48(SI), X0
  1679		MOVOU	48(DI), X1
  1680		PCMPEQB X0, X1
  1681		PMOVMSKB X1, AX
  1682		XORQ	$0xffff, AX
  1683		JNE	diff64
  1684	
  1685		ADDQ	$64, SI
  1686		ADDQ	$64, DI
  1687		SUBQ	$64, R8
  1688		CMPQ	R8, $64
  1689		JBE	loop
  1690		JMP	big_loop
  1691	
  1692		// Compare 64-bytes per loop iteration.
  1693		// Loop is unrolled and uses AVX2.
  1694	big_loop_avx2:
  1695		VMOVDQU	(SI), Y2
  1696		VMOVDQU	(DI), Y3
  1697		VMOVDQU	32(SI), Y4
  1698		VMOVDQU	32(DI), Y5
  1699		VPCMPEQB Y2, Y3, Y0
  1700		VPMOVMSKB Y0, AX
  1701		XORL	$0xffffffff, AX
  1702		JNE	diff32_avx2
  1703		VPCMPEQB Y4, Y5, Y6
  1704		VPMOVMSKB Y6, AX
  1705		XORL	$0xffffffff, AX
  1706		JNE	diff64_avx2
  1707	
  1708		ADDQ	$64, SI
  1709		ADDQ	$64, DI
  1710		SUBQ	$64, R8
  1711		CMPQ	R8, $64
  1712		JB	big_loop_avx2_exit
  1713		JMP	big_loop_avx2
  1714	
  1715		// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
  1716	diff32_avx2:
  1717		VZEROUPPER
  1718		JMP diff16
  1719	
  1720		// Same as diff32_avx2, but for last 32 bytes.
  1721	diff64_avx2:
  1722		VZEROUPPER
  1723		JMP diff48
  1724	
  1725		// For <64 bytes remainder jump to normal loop.
  1726	big_loop_avx2_exit:
  1727		VZEROUPPER
  1728		JMP loop
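
// The semantics of cmpbody, as a self-contained Go sketch (the assembly
// vectorizes the byte loop and uses overlapping loads for short inputs;
// cmpSketch is a hypothetical name):
//
//	func cmpSketch(a, b []byte) int {
//		n := len(a)
//		if len(b) < n {
//			n = len(b)
//		}
//		for i := 0; i < n; i++ {
//			if a[i] != b[i] {
//				if a[i] < b[i] {
//					return -1
//				}
//				return 1
//			}
//		}
//		switch {
//		case len(a) < len(b):
//			return -1
//		case len(a) > len(b):
//			return 1
//		}
//		return 0
//	}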
  1729	
  1730	TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1731		MOVQ s+0(FP), DI
  1732		// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1733		MOVQ s_len+8(FP), DX
  1734		MOVQ c+16(FP), BP
  1735		MOVQ c_len+24(FP), AX
  1736		MOVQ DI, R10
  1737		LEAQ ret+32(FP), R11
  1738		JMP  runtime·indexShortStr(SB)
  1739	
  1740	TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1741		MOVQ s+0(FP), DI
  1742		MOVQ s_len+8(FP), DX
  1743		MOVQ c+24(FP), BP
  1744		MOVQ c_len+32(FP), AX
  1745		MOVQ DI, R10
  1746		LEAQ ret+48(FP), R11
  1747		JMP  runtime·indexShortStr(SB)
  1748	
  1749	// AX: length of the string we are searching for
  1750	// DX: length of the string we are searching in
  1751	// DI: pointer to the string we are searching in
  1752	// BP: pointer to the string we are searching for
  1753	// R11: address where to put the return value
  1754	TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1755		CMPQ AX, DX
  1756		JA fail
  1757		CMPQ DX, $16
  1758		JAE sse42
  1759	no_sse42:
  1760		CMPQ AX, $2
  1761		JA   _3_or_more
  1762		MOVW (BP), BP
  1763		LEAQ -1(DI)(DX*1), DX
  1764	loop2:
  1765		MOVW (DI), SI
  1766		CMPW SI,BP
  1767		JZ success
  1768		ADDQ $1,DI
  1769		CMPQ DI,DX
  1770		JB loop2
  1771		JMP fail
  1772	_3_or_more:
  1773		CMPQ AX, $3
  1774		JA   _4_or_more
  1775		MOVW 1(BP), BX
  1776		MOVW (BP), BP
  1777		LEAQ -2(DI)(DX*1), DX
  1778	loop3:
  1779		MOVW (DI), SI
  1780		CMPW SI,BP
  1781		JZ   partial_success3
  1782		ADDQ $1,DI
  1783		CMPQ DI,DX
  1784		JB loop3
  1785		JMP fail
  1786	partial_success3:
  1787		MOVW 1(DI), SI
  1788		CMPW SI,BX
  1789		JZ success
  1790		ADDQ $1,DI
  1791		CMPQ DI,DX
  1792		JB loop3
  1793		JMP fail
  1794	_4_or_more:
  1795		CMPQ AX, $4
  1796		JA   _5_or_more
  1797		MOVL (BP), BP
  1798		LEAQ -3(DI)(DX*1), DX
  1799	loop4:
  1800		MOVL (DI), SI
  1801		CMPL SI,BP
  1802		JZ   success
  1803		ADDQ $1,DI
  1804		CMPQ DI,DX
  1805		JB loop4
  1806		JMP fail
  1807	_5_or_more:
  1808		CMPQ AX, $7
  1809		JA   _8_or_more
  1810		LEAQ 1(DI)(DX*1), DX
  1811		SUBQ AX, DX
  1812		MOVL -4(BP)(AX*1), BX
  1813		MOVL (BP), BP
  1814	loop5to7:
  1815		MOVL (DI), SI
  1816		CMPL SI,BP
  1817		JZ   partial_success5to7
  1818		ADDQ $1,DI
  1819		CMPQ DI,DX
  1820		JB loop5to7
  1821		JMP fail
  1822	partial_success5to7:
  1823		MOVL -4(AX)(DI*1), SI
  1824		CMPL SI,BX
  1825		JZ success
  1826		ADDQ $1,DI
  1827		CMPQ DI,DX
  1828		JB loop5to7
  1829		JMP fail
  1830	_8_or_more:
  1831		CMPQ AX, $8
  1832		JA   _9_or_more
  1833		MOVQ (BP), BP
  1834		LEAQ -7(DI)(DX*1), DX
  1835	loop8:
  1836		MOVQ (DI), SI
  1837		CMPQ SI,BP
  1838		JZ   success
  1839		ADDQ $1,DI
  1840		CMPQ DI,DX
  1841		JB loop8
  1842		JMP fail
  1843	_9_or_more:
  1844		CMPQ AX, $15
  1845		JA   _16_or_more
  1846		LEAQ 1(DI)(DX*1), DX
  1847		SUBQ AX, DX
  1848		MOVQ -8(BP)(AX*1), BX
  1849		MOVQ (BP), BP
  1850	loop9to15:
  1851		MOVQ (DI), SI
  1852		CMPQ SI,BP
  1853		JZ   partial_success9to15
  1854		ADDQ $1,DI
  1855		CMPQ DI,DX
  1856		JB loop9to15
  1857		JMP fail
  1858	partial_success9to15:
  1859		MOVQ -8(AX)(DI*1), SI
  1860		CMPQ SI,BX
  1861		JZ success
  1862		ADDQ $1,DI
  1863		CMPQ DI,DX
  1864		JB loop9to15
  1865		JMP fail
  1866	_16_or_more:
  1867		CMPQ AX, $16
  1868		JA   _17_or_more
  1869		MOVOU (BP), X1
  1870		LEAQ -15(DI)(DX*1), DX
  1871	loop16:
  1872		MOVOU (DI), X2
  1873		PCMPEQB X1, X2
  1874		PMOVMSKB X2, SI
  1875		CMPQ  SI, $0xffff
  1876		JE   success
  1877		ADDQ $1,DI
  1878		CMPQ DI,DX
  1879		JB loop16
  1880		JMP fail
  1881	_17_or_more:
  1882		CMPQ AX, $31
  1883		JA   _32_or_more
  1884		LEAQ 1(DI)(DX*1), DX
  1885		SUBQ AX, DX
  1886		MOVOU -16(BP)(AX*1), X0
  1887		MOVOU (BP), X1
  1888	loop17to31:
  1889		MOVOU (DI), X2
  1890		PCMPEQB X1,X2
  1891		PMOVMSKB X2, SI
  1892		CMPQ  SI, $0xffff
  1893		JE   partial_success17to31
  1894		ADDQ $1,DI
  1895		CMPQ DI,DX
  1896		JB loop17to31
  1897		JMP fail
  1898	partial_success17to31:
  1899		MOVOU -16(AX)(DI*1), X3
  1900		PCMPEQB X0, X3
  1901		PMOVMSKB X3, SI
  1902		CMPQ  SI, $0xffff
  1903		JE success
  1904		ADDQ $1,DI
  1905		CMPQ DI,DX
  1906		JB loop17to31
  1907		JMP fail
  1908	// We can get here only when AVX2 is enabled and the cutoff for indexShortStr is set to 63,
  1909	// so there is no need to check cpuid.
  1910	_32_or_more:
  1911		CMPQ AX, $32
  1912		JA   _33_to_63
  1913		VMOVDQU (BP), Y1
  1914		LEAQ -31(DI)(DX*1), DX
  1915	loop32:
  1916		VMOVDQU (DI), Y2
  1917		VPCMPEQB Y1, Y2, Y3
  1918		VPMOVMSKB Y3, SI
  1919		CMPL  SI, $0xffffffff
  1920		JE   success_avx2
  1921		ADDQ $1,DI
  1922		CMPQ DI,DX
  1923		JB loop32
  1924		JMP fail_avx2
  1925	_33_to_63:
  1926		LEAQ 1(DI)(DX*1), DX
  1927		SUBQ AX, DX
  1928		VMOVDQU -32(BP)(AX*1), Y0
  1929		VMOVDQU (BP), Y1
  1930	loop33to63:
  1931		VMOVDQU (DI), Y2
  1932		VPCMPEQB Y1, Y2, Y3
  1933		VPMOVMSKB Y3, SI
  1934		CMPL  SI, $0xffffffff
  1935		JE   partial_success33to63
  1936		ADDQ $1,DI
  1937		CMPQ DI,DX
  1938		JB loop33to63
  1939		JMP fail_avx2
  1940	partial_success33to63:
  1941		VMOVDQU -32(AX)(DI*1), Y3
  1942		VPCMPEQB Y0, Y3, Y4
  1943		VPMOVMSKB Y4, SI
  1944		CMPL  SI, $0xffffffff
  1945		JE success_avx2
  1946		ADDQ $1,DI
  1947		CMPQ DI,DX
  1948		JB loop33to63
  1949	fail_avx2:
  1950		VZEROUPPER
  1951	fail:
  1952		MOVQ $-1, (R11)
  1953		RET
  1954	success_avx2:
  1955		VZEROUPPER
  1956		JMP success
  1957	sse42:
  1958		CMPB runtime·support_sse42(SB), $1
  1959		JNE no_sse42
  1960		CMPQ AX, $12
  1961		// PCMPESTRI is slower than a normal compare,
  1962		// so using it makes sense only if we advance 4+ bytes per compare.
  1963		// This cutoff was determined experimentally and is about the same
  1964		// on Nehalem (the first CPU with SSE4.2) and on Haswell.
  1965		JAE _9_or_more
  1966		LEAQ 16(BP), SI
  1967		TESTW $0xff0, SI
  1968		JEQ no_sse42
  1969		MOVOU (BP), X1
  1970		LEAQ -15(DI)(DX*1), SI
  1971		MOVQ $16, R9
  1972		SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1973	loop_sse42:
  1974		// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1975		// for equality (bits 2,3 are 11)
  1976		// result is not masked or inverted (bits 4,5 are 00)
  1977		// and corresponds to first matching byte (bit 6 is 0)
  1978		PCMPESTRI $0x0c, (DI), X1
  1979		// CX == 16 means no match,
  1980		// CX > R9 means a partial match at the end of the 16-byte chunk,
  1981		// otherwise sep begins at offset CX within the 16 bytes at DI.
  1982		CMPQ CX, R9
  1983		JBE sse42_success
  1984		ADDQ R9, DI
  1985		CMPQ DI, SI
  1986		JB loop_sse42
  1987		PCMPESTRI $0x0c, -1(SI), X1
  1988		CMPQ CX, R9
  1989		JA fail
  1990		LEAQ -1(SI), DI
  1991	sse42_success:
  1992		ADDQ CX, DI
  1993	success:
  1994		SUBQ R10, DI
  1995		MOVQ DI, (R11)
  1996		RET
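// A rough Go-level sketch of the strategy used by the length-specialized
// loops above (illustrative only, not the actual implementation): at each
// candidate offset the code compares one fixed-width chunk covering the
// start of sep and, only on a hit, a second chunk covering the end of sep;
// the two chunks may overlap. For example, loop9to15 corresponds roughly
// to the following, with encoding/binary modeling the unaligned 8-byte loads:
//
//	// Sketch for 9 <= len(sep) <= 15.
//	func index9to15Sketch(s, sep []byte) int {
//		n := len(sep)
//		first := binary.LittleEndian.Uint64(sep)      // first 8 bytes of sep
//		last := binary.LittleEndian.Uint64(sep[n-8:]) // last 8 bytes of sep
//		for i := 0; i+n <= len(s); i++ {
//			if binary.LittleEndian.Uint64(s[i:]) == first &&
//				binary.LittleEndian.Uint64(s[i+n-8:]) == last {
//				return i
//			}
//		}
//		return -1
//	}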
  1997	
  1998	
  1999	TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  2000		MOVQ s+0(FP), SI
  2001		MOVQ s_len+8(FP), BX
  2002		MOVB c+24(FP), AL
  2003		LEAQ ret+32(FP), R8
  2004		JMP  runtime·indexbytebody(SB)
  2005	
  2006	TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  2007		MOVQ s+0(FP), SI
  2008		MOVQ s_len+8(FP), BX
  2009		MOVB c+16(FP), AL
  2010		LEAQ ret+24(FP), R8
  2011		JMP  runtime·indexbytebody(SB)
  2012	
  2013	// input:
  2014	//   SI: data
  2015	//   BX: data len
  2016	//   AL: byte sought
  2017	//   R8: address to put result
  2018	TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  2019		// Shuffle X0 around so that each byte contains
  2020		// the character we're looking for.
  2021		MOVD AX, X0
  2022		PUNPCKLBW X0, X0
  2023		PUNPCKLBW X0, X0
  2024		PSHUFL $0, X0, X0
  2025		
  2026		CMPQ BX, $16
  2027		JLT small
  2028	
  2029		MOVQ SI, DI
  2030	
  2031		CMPQ BX, $32
  2032		JA avx2
  2033	sse:
  2034		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2035		JMP	sseloopentry
  2036		
  2037	sseloop:
  2038		// Move the next 16-byte chunk of the data into X1.
  2039		MOVOU	(DI), X1
  2040		// Compare bytes in X0 to X1.
  2041		PCMPEQB	X0, X1
  2042		// Take the top bit of each byte in X1 and put the result in DX.
  2043		PMOVMSKB X1, DX
  2044		// Find first set bit, if any.
  2045		BSFL	DX, DX
  2046		JNZ	ssesuccess
  2047		// Advance to next block.
  2048		ADDQ	$16, DI
  2049	sseloopentry:
  2050		CMPQ	DI, AX
  2051		JB	sseloop
  2052	
  2053		// Search the last 16-byte chunk. This chunk may overlap with the
  2054		// chunks we've already searched, but that's ok.
  2055		MOVQ	AX, DI
  2056		MOVOU	(AX), X1
  2057		PCMPEQB	X0, X1
  2058		PMOVMSKB X1, DX
  2059		BSFL	DX, DX
  2060		JNZ	ssesuccess
  2061	
  2062	failure:
  2063		MOVQ $-1, (R8)
  2064		RET
  2065	
  2066	// We've found a chunk containing the byte.
  2067	// The chunk was loaded from DI.
  2068	// The index of the matching byte in the chunk is DX.
  2069	// The start of the data is SI.
  2070	ssesuccess:
  2071		SUBQ SI, DI	// Compute offset of chunk within data.
  2072		ADDQ DX, DI	// Add offset of byte within chunk.
  2073		MOVQ DI, (R8)
  2074		RET
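// A rough Go-level model of the search above (illustrative only; assumes
// len(s) >= 16, which the length check at the top of this function
// guarantees): scan 16-byte chunks, then rescan the final 16 bytes even
// though they may overlap a chunk that was already examined.
//
//	func indexByteSketch(s []byte, c byte) int {
//		match := func(off int) int { // models PCMPEQB+PMOVMSKB+BSFL
//			for j := 0; j < 16; j++ {
//				if s[off+j] == c {
//					return off + j
//				}
//			}
//			return -1
//		}
//		for i := 0; i < len(s)-16; i += 16 {
//			if r := match(i); r >= 0 {
//				return r
//			}
//		}
//		return match(len(s) - 16) // overlap with earlier chunks is harmless
//	}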
  2075	
  2076	// Handle lengths < 16.
  2077	small:
  2078		TESTQ	BX, BX
  2079		JEQ	failure
  2080	
  2081		// Check if we'll load across a page boundary.
  2082		LEAQ	16(SI), AX
  2083		TESTW	$0xff0, AX
  2084		JEQ	endofpage
  2085	
  2086		MOVOU	(SI), X1 // Load data
  2087		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2088		PMOVMSKB X1, DX	// Move result bits to integer register.
  2089		BSFL	DX, DX	// Find first set bit.
  2090		JZ	failure	// No set bit, failure.
  2091		CMPL	DX, BX
  2092		JAE	failure	// Match is past end of data.
  2093		MOVQ	DX, (R8)
  2094		RET
  2095	
  2096	endofpage:
  2097		MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2098		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2099		PMOVMSKB X1, DX	// Move result bits to integer register.
  2100		MOVL	BX, CX
  2101		SHLL	CX, DX
  2102		SHRL	$16, DX	// Shift desired bits down to bottom of register.
  2103		BSFL	DX, DX	// Find first set bit.
  2104		JZ	failure	// No set bit, failure.
  2105		MOVQ	DX, (R8)
  2106		RET
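// The TESTW $0xff0 check above is a conservative page-crossing test: a
// 16-byte load starting at p stays within one 4KB page unless p+16 lands
// in the first 16 bytes of a page. A rough Go-level model (illustrative only):
//
//	// mayCrossPage reports whether a 16-byte load at p might touch the
//	// next 4KB page; if so, the code instead loads the 16 bytes ending
//	// at the end of the data and shifts the result mask into place.
//	func mayCrossPage(p uintptr) bool {
//		return (p+16)&0xff0 == 0
//	}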
  2107	
  2108	avx2:
  2109		CMPB   runtime·support_avx2(SB), $1
  2110		JNE sse
  2111		MOVD AX, X0
  2112		LEAQ -32(SI)(BX*1), R11
  2113		VPBROADCASTB  X0, Y1
  2114	avx2_loop:
  2115		VMOVDQU (DI), Y2
  2116		VPCMPEQB Y1, Y2, Y3
  2117		VPTEST Y3, Y3
  2118		JNZ avx2success
  2119		ADDQ $32, DI
  2120		CMPQ DI, R11
  2121		JLT avx2_loop
  2122		MOVQ R11, DI
  2123		VMOVDQU (DI), Y2
  2124		VPCMPEQB Y1, Y2, Y3
  2125		VPTEST Y3, Y3
  2126		JNZ avx2success
  2127		VZEROUPPER
  2128		MOVQ $-1, (R8)
  2129		RET
  2130	
  2131	avx2success:
  2132		VPMOVMSKB Y3, DX
  2133		BSFL DX, DX
  2134		SUBQ SI, DI
  2135		ADDQ DI, DX
  2136		MOVQ DX, (R8)
  2137		VZEROUPPER
  2138		RET
  2139	
  2140	TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2141		MOVQ	a_len+8(FP), BX
  2142		MOVQ	b_len+32(FP), CX
  2143		CMPQ	BX, CX
  2144		JNE	eqret
  2145		MOVQ	a+0(FP), SI
  2146		MOVQ	b+24(FP), DI
  2147		LEAQ	ret+48(FP), AX
  2148		JMP	runtime·memeqbody(SB)
  2149	eqret:
  2150		MOVB	$0, ret+48(FP)
  2151		RET
  2152	
  2153	
  2154	TEXT bytes·countByte(SB),NOSPLIT,$0-40
  2155		MOVQ s+0(FP), SI
  2156		MOVQ s_len+8(FP), BX
  2157		MOVB c+24(FP), AL
  2158		LEAQ ret+32(FP), R8
  2159		JMP  runtime·countByte(SB)
  2160	
  2161	TEXT strings·countByte(SB),NOSPLIT,$0-32
  2162		MOVQ s+0(FP), SI
  2163		MOVQ s_len+8(FP), BX
  2164		MOVB c+16(FP), AL
  2165		LEAQ ret+24(FP), R8
  2166		JMP  runtime·countByte(SB)
  2167	
  2168	// input:
  2169	//   SI: data
  2170	//   BX: data len
  2171	//   AL: byte sought
  2172	//   R8: address to put result
  2173	// This requires the POPCNT instruction
  2174	TEXT runtime·countByte(SB),NOSPLIT,$0
  2175		// Shuffle X0 around so that each byte contains
  2176		// the character we're looking for.
  2177		MOVD AX, X0
  2178		PUNPCKLBW X0, X0
  2179		PUNPCKLBW X0, X0
  2180		PSHUFL $0, X0, X0
  2181	
  2182		CMPQ BX, $16
  2183		JLT small
  2184	
  2185		MOVQ $0, R12 // Accumulator
  2186	
  2187		MOVQ SI, DI
  2188	
  2189		CMPQ BX, $32
  2190		JA avx2
  2191	sse:
  2192		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2193		JMP	sseloopentry
  2194	
  2195	sseloop:
  2196		// Move the next 16-byte chunk of the data into X1.
  2197		MOVOU	(DI), X1
  2198		// Compare bytes in X0 to X1.
  2199		PCMPEQB	X0, X1
  2200		// Take the top bit of each byte in X1 and put the result in DX.
  2201		PMOVMSKB X1, DX
  2202		// Count number of matching bytes
  2203		POPCNTL DX, DX
  2204		// Accumulate into R12
  2205		ADDQ DX, R12
  2206		// Advance to next block.
  2207		ADDQ	$16, DI
  2208	sseloopentry:
  2209		CMPQ	DI, AX
  2210		JBE	sseloop
  2211	
  2212		// Get the number of bytes to consider in the last 16 bytes
  2213		ANDQ $15, BX
  2214		JZ end
  2215	
  2216		// Create mask to ignore overlap between previous 16 byte block
  2217		// and the next.
  2218		MOVQ $16,CX
  2219		SUBQ BX, CX
  2220		MOVQ $0xFFFF, R10
  2221		SARQ CL, R10
  2222		SALQ CL, R10
  2223	
  2224		// Process the last 16-byte chunk. This chunk may overlap with the
  2225		// chunks we've already searched so we need to mask part of it.
  2226		MOVOU	(AX), X1
  2227		PCMPEQB	X0, X1
  2228		PMOVMSKB X1, DX
  2229		// Apply mask
  2230		ANDQ R10, DX
  2231		POPCNTL DX, DX
  2232		ADDQ DX, R12
  2233	end:
  2234		MOVQ R12, (R8)
  2235		RET
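// A rough Go-level model of the counting scheme above (illustrative only;
// needs "math/bits" and assumes len(s) >= 16, which the length check at
// the top of this function guarantees): count matches 16 bytes at a time
// with POPCNT on the comparison mask, then recount the final, overlapping
// chunk with the already-counted bits masked off.
//
//	func countByteSketch(s []byte, c byte) int {
//		mask16 := func(off int) uint16 { // models PCMPEQB+PMOVMSKB
//			var m uint16
//			for j := 0; j < 16; j++ {
//				if s[off+j] == c {
//					m |= 1 << uint(j)
//				}
//			}
//			return m
//		}
//		n, i := 0, 0
//		for ; i+16 <= len(s); i += 16 {
//			n += bits.OnesCount16(mask16(i)) // POPCNTL
//		}
//		if rem := len(s) - i; rem > 0 {
//			// Keep only the high rem bits of the last chunk's mask,
//			// mirroring the 0xFFFF>>CL<<CL mask computed above.
//			n += bits.OnesCount16(mask16(len(s)-16) & (uint16(0xFFFF) << uint(16-rem)))
//		}
//		return n
//	}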
  2236	
  2237	// Handle lengths < 16.
  2238	small:
  2239		TESTQ	BX, BX
  2240		JEQ	endzero
  2241	
  2242		// Check if we'll load across a page boundary.
  2243		LEAQ	16(SI), AX
  2244		TESTW	$0xff0, AX
  2245		JEQ	endofpage
  2246	
  2247		// We must ignore high bytes as they aren't part of our slice.
  2248		// Create mask.
  2249		MOVB BX, CX
  2250		MOVQ $1, R10
  2251		SALQ CL, R10
  2252		SUBQ $1, R10
  2253	
  2254		// Load data
  2255		MOVOU	(SI), X1
  2256		// Compare target byte with each byte in data.
  2257		PCMPEQB	X0, X1
  2258		// Move result bits to integer register.
  2259		PMOVMSKB X1, DX
  2260		// Apply mask
  2261		ANDQ R10, DX
  2262		POPCNTL DX, DX
  2263		// Return DX directly; there is no need to accumulate
  2264		// since we have <16 bytes.
  2265		MOVQ	DX, (R8)
  2266		RET
  2267	endzero:
  2268		MOVQ $0, (R8)
  2269		RET
  2270	
  2271	endofpage:
  2272		// We must ignore low bytes as they aren't part of our slice.
  2273		MOVQ $16,CX
  2274		SUBQ BX, CX
  2275		MOVQ $0xFFFF, R10
  2276		SARQ CL, R10
  2277		SALQ CL, R10
  2278	
  2279		// Load data into the high end of X1.
  2280		MOVOU	-16(SI)(BX*1), X1
  2281		// Compare target byte with each byte in data.
  2282		PCMPEQB	X0, X1
  2283		// Move result bits to integer register.
  2284		PMOVMSKB X1, DX
  2285		// Apply mask
  2286		ANDQ R10, DX
  2287		// Return DX directly; there is no need to accumulate
  2288		// since we have <16 bytes.
  2289		POPCNTL DX, DX
  2290		MOVQ	DX, (R8)
  2291		RET
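// The two masks built above can be modeled in Go as follows (illustrative
// only; n = len(s), with 0 < n < 16): the in-page case keeps the low n
// bits of the comparison mask, while the end-of-page case loads the 16
// bytes ending at the end of the data and keeps the high n bits instead.
//
//	func smallMasks(n uint) (low, high uint16) {
//		low = uint16(1)<<n - 1                        // SALQ CL; SUBQ $1
//		high = uint16(0xFFFF) >> (16 - n) << (16 - n) // SARQ CL; SALQ CL
//		return
//	}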
  2292	
  2293	avx2:
  2294		CMPB   runtime·support_avx2(SB), $1
  2295		JNE sse
  2296		MOVD AX, X0
  2297		LEAQ -32(SI)(BX*1), R11
  2298		VPBROADCASTB  X0, Y1
  2299	avx2_loop:
  2300		VMOVDQU (DI), Y2
  2301		VPCMPEQB Y1, Y2, Y3
  2302		VPMOVMSKB Y3, DX
  2303		POPCNTL DX, DX
  2304		ADDQ DX, R12
  2305		ADDQ $32, DI
  2306		CMPQ DI, R11
  2307		JLE avx2_loop
  2308	
  2309		// If last block is already processed,
  2310		// skip to the end.
  2311		CMPQ DI, R11
  2312		JEQ endavx
  2313	
  2314		// Load address of the last 32 bytes.
  2315		// There is an overlap with the previous block.
  2316		MOVQ R11, DI
  2317		VMOVDQU (DI), Y2
  2318		VPCMPEQB Y1, Y2, Y3
  2319		VPMOVMSKB Y3, DX
  2320		// Exit AVX mode.
  2321		VZEROUPPER
  2322	
  2323		// Create mask to ignore overlap between previous 32 byte block
  2324		// and the next.
  2325		ANDQ $31, BX
  2326		MOVQ $32,CX
  2327		SUBQ BX, CX
  2328		MOVQ $0xFFFFFFFF, R10
  2329		SARQ CL, R10
  2330		SALQ CL, R10
  2331		// Apply mask
  2332		ANDQ R10, DX
  2333		POPCNTL DX, DX
  2334		ADDQ DX, R12
  2335		MOVQ R12, (R8)
  2336		RET
  2337	endavx:
  2338		// Exit AVX mode.
  2339		VZEROUPPER
  2340		MOVQ R12, (R8)
  2341		RET
  2342	
  2343	TEXT runtime·return0(SB), NOSPLIT, $0
  2344		MOVL	$0, AX
  2345		RET
  2346	
  2347	
  2348	// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2349	// Must obey the gcc calling convention.
  2350	TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2351		get_tls(CX)
  2352		MOVQ	g(CX), AX
  2353		MOVQ	g_m(AX), AX
  2354		MOVQ	m_curg(AX), AX
  2355		MOVQ	(g_stack+stack_hi)(AX), AX
  2356		RET
  2357	
  2358	// The top-most function running on a goroutine
  2359	// returns to goexit+PCQuantum.
  2360	TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2361		BYTE	$0x90	// NOP
  2362		CALL	runtime·goexit1(SB)	// does not return
  2363		// traceback from goexit1 must hit code range of goexit
  2364		BYTE	$0x90	// NOP
  2365	
  2366	// This is called from .init_array and follows the platform, not Go, ABI.
  2367	TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2368		PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2369		MOVQ	runtime·lastmoduledatap(SB), AX
  2370		MOVQ	DI, moduledata_next(AX)
  2371		MOVQ	DI, runtime·lastmoduledatap(SB)
  2372		POPQ	R15
  2373		RET
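// In Go terms, the two stores above append the new module descriptor
// (passed in DI under the platform ABI) to the runtime's module list,
// roughly as follows (illustrative sketch; moduledata and lastmoduledatap
// are defined in the runtime's Go code):
//
//	func addmoduledataSketch(mod *moduledata) {
//		lastmoduledatap.next = mod
//		lastmoduledatap = mod
//	}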
  2374	
  2375	// gcWriteBarrier performs a heap pointer write and informs the GC.
  2376	//
  2377	// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
  2378	// - DI is the destination of the write
  2379	// - AX is the value being written at DI
  2380	// It clobbers FLAGS. It does not clobber any general-purpose registers,
  2381	// but may clobber others (e.g., SSE registers).
  2382	TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120
  2383		// Save the registers clobbered by the fast path. This is slightly
  2384		// faster than having the caller spill these.
  2385		MOVQ	R14, 104(SP)
  2386		MOVQ	R13, 112(SP)
  2387		// TODO: Consider passing g.m.p in as an argument so they can be shared
  2388		// across a sequence of write barriers.
  2389		get_tls(R13)
  2390		MOVQ	g(R13), R13
  2391		MOVQ	g_m(R13), R13
  2392		MOVQ	m_p(R13), R13
  2393		MOVQ	(p_wbBuf+wbBuf_next)(R13), R14
  2394		// Increment wbBuf.next position.
  2395		LEAQ	16(R14), R14
  2396		MOVQ	R14, (p_wbBuf+wbBuf_next)(R13)
  2397		CMPQ	R14, (p_wbBuf+wbBuf_end)(R13)
  2398		// Record the write.
  2399		MOVQ	AX, -16(R14)	// Record value
  2400		MOVQ	(DI), R13	// TODO: This turns bad writes into bad reads.
  2401		MOVQ	R13, -8(R14)	// Record *slot
  2402		// Is the buffer full? (flags set in CMPQ above)
  2403		JEQ	flush
  2404	ret:
  2405		MOVQ	104(SP), R14
  2406		MOVQ	112(SP), R13
  2407		// Do the write.
  2408		MOVQ	AX, (DI)
  2409		RET
  2410	
  2411	flush:
  2412		// Save all general purpose registers since these could be
  2413		// clobbered by wbBufFlush and were not saved by the caller.
  2414		// It is possible for wbBufFlush to clobber other registers
  2415		// (e.g., SSE registers), but the compiler takes care of saving
  2416		// those in the caller if necessary. This strikes a balance
  2417		// with registers that are likely to be used.
  2418		//
  2419		// We don't have type information for these, but all code under
  2420		// here is NOSPLIT, so nothing will observe these.
  2421		//
  2422		// TODO: We could strike a different balance; e.g., saving X0
  2423		// and not saving GP registers that are less likely to be used.
  2424		MOVQ	DI, 0(SP)	// Also first argument to wbBufFlush
  2425		MOVQ	AX, 8(SP)	// Also second argument to wbBufFlush
  2426		MOVQ	BX, 16(SP)
  2427		MOVQ	CX, 24(SP)
  2428		MOVQ	DX, 32(SP)
  2429		// DI already saved
  2430		MOVQ	SI, 40(SP)
  2431		MOVQ	BP, 48(SP)
  2432		MOVQ	R8, 56(SP)
  2433		MOVQ	R9, 64(SP)
  2434		MOVQ	R10, 72(SP)
  2435		MOVQ	R11, 80(SP)
  2436		MOVQ	R12, 88(SP)
  2437		// R13 already saved
  2438		// R14 already saved
  2439		MOVQ	R15, 96(SP)
  2440	
  2441		// This takes arguments DI and AX
  2442		CALL	runtime·wbBufFlush(SB)
  2443	
  2444		MOVQ	0(SP), DI
  2445		MOVQ	8(SP), AX
  2446		MOVQ	16(SP), BX
  2447		MOVQ	24(SP), CX
  2448		MOVQ	32(SP), DX
  2449		MOVQ	40(SP), SI
  2450		MOVQ	48(SP), BP
  2451		MOVQ	56(SP), R8
  2452		MOVQ	64(SP), R9
  2453		MOVQ	72(SP), R10
  2454		MOVQ	80(SP), R11
  2455		MOVQ	88(SP), R12
  2456		MOVQ	96(SP), R15
  2457		JMP	ret
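// A minimal Go-level sketch of the buffered write barrier above
// (illustrative only; wbBufSketch and the flush callback are stand-ins,
// not the runtime's real wbBuf declarations):
//
//	type wbBufSketch struct {
//		next int // index of the next free slot in buf
//		buf  [512]uintptr
//	}
//
//	func writeBarrierSketch(b *wbBufSketch, flush func(), slot *uintptr, val uintptr) {
//		// Record the value being written and the slot's current contents.
//		b.buf[b.next] = val
//		b.buf[b.next+1] = *slot
//		b.next += 2
//		if b.next == len(b.buf) {
//			// Buffer full: hand the records to the GC, mirroring the
//			// register save/restore around the wbBufFlush call above.
//			flush()
//			b.next = 0
//		}
//		// Finally perform the actual write.
//		*slot = val
//	}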
