Text file src/runtime/asm_amd64.s — Go runtime assembly for amd64 (Go assembler syntax)

Documentation: package runtime

     1// Copyright 2009 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "go_tls.h"
     7#include "funcdata.h"
     8#include "textflag.h"
     9
    10// _rt0_amd64 is common startup code for most amd64 systems when using
    11// internal linking. This is the entry point for the program from the
    12// kernel for an ordinary -buildmode=exe program. The stack holds the
    13// number of arguments and the C-style argv.
    14TEXT _rt0_amd64(SB),NOSPLIT,$-8
	// Load argc/argv from the kernel-provided stack into DI/SI,
	// the registers rt0_go expects its arguments in.
    15	MOVQ	0(SP), DI	// argc
    16	LEAQ	8(SP), SI	// argv
    17	JMP	runtime·rt0_go(SB)
    18
    19// main is common startup code for most amd64 systems when using
    20// external linking. The C startup code will call the symbol "main"
    21// passing argc and argv in the usual C ABI registers DI and SI.
    22TEXT main(SB),NOSPLIT,$-8
	// DI = argc and SI = argv are already set by the C startup code,
	// which is exactly what rt0_go consumes.
    23	JMP	runtime·rt0_go(SB)
    24
    25// _rt0_amd64_lib is common startup code for most amd64 systems when
    26// using -buildmode=c-archive or -buildmode=c-shared. The linker will
    27// arrange to invoke this function as a global constructor (for
    28// c-archive) or when the shared library is loaded (for c-shared).
    29// We expect argc and argv to be passed in the usual C ABI registers
    30// DI and SI.
    31TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
    32	// Align stack per ELF ABI requirements.
    33	MOVQ	SP, AX
    34	ANDQ	$~15, SP
    35	// Save C ABI callee-saved registers, as caller may need them.
    36	MOVQ	BX, 0x10(SP)
    37	MOVQ	BP, 0x18(SP)
    38	MOVQ	R12, 0x20(SP)
    39	MOVQ	R13, 0x28(SP)
    40	MOVQ	R14, 0x30(SP)
    41	MOVQ	R15, 0x38(SP)
	// Stash the original (pre-alignment) SP so it can be restored on exit.
    42	MOVQ	AX, 0x40(SP)
    43
	// Save argc/argv in static storage for _rt0_amd64_lib_go to pick up later.
    44	MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
    45	MOVQ	SI, _rt0_amd64_lib_argv<>(SB)
    46
    47	// Synchronous initialization.
    48	CALL	runtime·libpreinit(SB)
    49
    50	// Create a new thread to finish Go runtime initialization.
	// If cgo is available, use _cgo_sys_thread_create; otherwise fall
	// back to newosproc0 below.
    51	MOVQ	_cgo_sys_thread_create(SB), AX
    52	TESTQ	AX, AX
    53	JZ	nocgo
    54	MOVQ	$_rt0_amd64_lib_go(SB), DI
    55	MOVQ	$0, SI
    56	CALL	AX
    57	JMP	restore
    58
    59nocgo:
    60	MOVQ	$0x800000, 0(SP)		// stacksize
    61	MOVQ	$_rt0_amd64_lib_go(SB), AX
    62	MOVQ	AX, 8(SP)			// fn
    63	CALL	runtime·newosproc0(SB)
    64
    65restore:
	// Restore the callee-saved registers and the original SP before
	// returning to the C caller.
    66	MOVQ	0x10(SP), BX
    67	MOVQ	0x18(SP), BP
    68	MOVQ	0x20(SP), R12
    69	MOVQ	0x28(SP), R13
    70	MOVQ	0x30(SP), R14
    71	MOVQ	0x38(SP), R15
    72	MOVQ	0x40(SP), SP
    73	RET
    74
    75// _rt0_amd64_lib_go initializes the Go runtime.
    76// This is started in a separate thread by _rt0_amd64_lib.
    77TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
	// Reload the argc/argv that _rt0_amd64_lib saved, then enter the
	// normal startup path.
    78	MOVQ	_rt0_amd64_lib_argc<>(SB), DI
    79	MOVQ	_rt0_amd64_lib_argv<>(SB), SI
    80	JMP	runtime·rt0_go(SB)
    81
// Static storage for the argc/argv passed to _rt0_amd64_lib,
// consumed later by _rt0_amd64_lib_go on its own thread.
    82DATA _rt0_amd64_lib_argc<>(SB)/8, $0
    83GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
    84DATA _rt0_amd64_lib_argv<>(SB)/8, $0
    85GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8
    86
// rt0_go is the common runtime entry point: it sets up g0's stack bounds,
// probes the CPU, performs cgo/TLS initialization, wires up g0<->m0,
// initializes the scheduler, queues the main goroutine, and starts this
// thread's M. On entry: DI = argc, SI = argv.
    87TEXT runtime·rt0_go(SB),NOSPLIT,$0
    88	// copy arguments forward on an even stack
    89	MOVQ	DI, AX		// argc
    90	MOVQ	SI, BX		// argv
    91	SUBQ	$(4*8+7), SP		// 2args 2auto
    92	ANDQ	$~15, SP
    93	MOVQ	AX, 16(SP)
    94	MOVQ	BX, 24(SP)
    95
    96	// create istack out of the given (operating system) stack.
    97	// _cgo_init may update stackguard.
    98	MOVQ	$runtime·g0(SB), DI
	// Assume 64 KB of OS stack; the guard sits 104 bytes above the low bound.
    99	LEAQ	(-64*1024+104)(SP), BX
   100	MOVQ	BX, g_stackguard0(DI)
   101	MOVQ	BX, g_stackguard1(DI)
   102	MOVQ	BX, (g_stack+stack_lo)(DI)
   103	MOVQ	SP, (g_stack+stack_hi)(DI)
   104
   105	// find out information about the processor we're on
   106	MOVL	$0, AX
   107	CPUID
	// SI = highest basic CPUID leaf (returned in AX by leaf 0).
   108	MOVL	AX, SI
   109	CMPL	AX, $0
   110	JE	nocpuinfo
   111
   112	// Figure out how to serialize RDTSC.
   113	// On Intel processors LFENCE is enough. AMD requires MFENCE.
   114	// Don't know about the rest, so let's do MFENCE.
   115	CMPL	BX, $0x756E6547  // "Genu"
   116	JNE	notintel
   117	CMPL	DX, $0x49656E69  // "ineI"
   118	JNE	notintel
   119	CMPL	CX, $0x6C65746E  // "ntel"
   120	JNE	notintel
   121	MOVB	$1, runtime·isIntel(SB)
   122	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
   123notintel:
   124
   125	// Load EAX=1 cpuid flags
   126	MOVL	$1, AX
   127	CPUID
   128	MOVL	AX, runtime·processorVersionInfo(SB)
   129
   130nocpuinfo:
   131	// if there is an _cgo_init, call it.
   132	MOVQ	_cgo_init(SB), AX
   133	TESTQ	AX, AX
   134	JZ	needtls
   135	// g0 already in DI
   136	MOVQ	DI, CX	// Win64 uses CX for first parameter
   137	MOVQ	$setg_gcc<>(SB), SI
   138	CALL	AX
   139
   140	// update stackguard after _cgo_init
   141	MOVQ	$runtime·g0(SB), CX
   142	MOVQ	(g_stack+stack_lo)(CX), AX
   143	ADDQ	$const__StackGuard, AX
   144	MOVQ	AX, g_stackguard0(CX)
   145	MOVQ	AX, g_stackguard1(CX)
   146
   147#ifndef GOOS_windows
   148	JMP ok
   149#endif
   150needtls:
   151#ifdef GOOS_plan9
   152	// skip TLS setup on Plan 9
   153	JMP ok
   154#endif
   155#ifdef GOOS_solaris
   156	// skip TLS setup on Solaris
   157	JMP ok
   158#endif
   159#ifdef GOOS_darwin
   160	// skip TLS setup on Darwin
   161	JMP ok
   162#endif
   163
   164	LEAQ	runtime·m0+m_tls(SB), DI
   165	CALL	runtime·settls(SB)
   166
   167	// store through it, to make sure it works
   168	get_tls(BX)
   169	MOVQ	$0x123, g(BX)
   170	MOVQ	runtime·m0+m_tls(SB), AX
   171	CMPQ	AX, $0x123
   172	JEQ 2(PC)
   173	CALL	runtime·abort(SB)
   174ok:
   175	// set the per-goroutine and per-mach "registers"
   176	get_tls(BX)
   177	LEAQ	runtime·g0(SB), CX
   178	MOVQ	CX, g(BX)
   179	LEAQ	runtime·m0(SB), AX
   180
   181	// save m->g0 = g0
   182	MOVQ	CX, m_g0(AX)
   183	// save m0 to g0->m
   184	MOVQ	AX, g_m(CX)
   185
   186	CLD				// convention is D is always left cleared
   187	CALL	runtime·check(SB)
   188
	// Pass the saved argc/argv to runtime.args.
   189	MOVL	16(SP), AX		// copy argc
   190	MOVL	AX, 0(SP)
   191	MOVQ	24(SP), AX		// copy argv
   192	MOVQ	AX, 8(SP)
   193	CALL	runtime·args(SB)
   194	CALL	runtime·osinit(SB)
   195	CALL	runtime·schedinit(SB)
   196
   197	// create a new goroutine to start program
   198	MOVQ	$runtime·mainPC(SB), AX		// entry
   199	PUSHQ	AX
   200	PUSHQ	$0			// arg size
   201	CALL	runtime·newproc(SB)
   202	POPQ	AX
   203	POPQ	AX
   204
   205	// start this M
   206	CALL	runtime·mstart(SB)
   207
   208	CALL	runtime·abort(SB)	// mstart should never return
   209	RET
   210
   211	// Prevent dead-code elimination of debugCallV1, which is
   212	// intended to be called by debuggers.
	// (Unreachable: the reference alone keeps the symbol alive for the linker.)
   213	MOVQ	$runtime·debugCallV1(SB), AX
   214	RET
   215
// mainPC holds the address of runtime.main; rt0_go passes it to newproc
// as the entry point of the main goroutine.
   216DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   217GLOBL	runtime·mainPC(SB),RODATA,$8
   218
// func breakpoint()
// Trigger a debugger breakpoint trap.
   219TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   220	BYTE	$0xcc	// INT3, the x86 breakpoint instruction
   221	RET
   222
// func asminit()
// Per-thread assembly-level initialization hook; nothing needed on amd64.
   223TEXT runtime·asminit(SB),NOSPLIT,$0-0
   224	// No per-thread init.
   225	RET
   226
   227/*
   228 *  go-routine
   229 */
   230
   231// func gosave(buf *gobuf)
   232// save state in Gobuf; setjmp
   233TEXT runtime·gosave(SB), NOSPLIT, $0-8
   234	MOVQ	buf+0(FP), AX		// gobuf
   235	LEAQ	buf+0(FP), BX		// caller's SP
   236	MOVQ	BX, gobuf_sp(AX)
   237	MOVQ	0(SP), BX		// caller's PC
   238	MOVQ	BX, gobuf_pc(AX)
   239	MOVQ	$0, gobuf_ret(AX)
   240	MOVQ	BP, gobuf_bp(AX)
   241	// Assert ctxt is zero. See func save.
   242	MOVQ	gobuf_ctxt(AX), BX
   243	TESTQ	BX, BX
   244	JZ	2(PC)
   245	CALL	runtime·badctxt(SB)
	// Record the current g in the buffer.
   246	get_tls(CX)
   247	MOVQ	g(CX), BX
   248	MOVQ	BX, gobuf_g(AX)
   249	RET
   250
   251// func gogo(buf *gobuf)
   252// restore state from Gobuf; longjmp
   253TEXT runtime·gogo(SB), NOSPLIT, $16-8
   254	MOVQ	buf+0(FP), BX		// gobuf
   255	MOVQ	gobuf_g(BX), DX
	// Dereferencing the g faults here if gobuf_g is nil (cheap nil check).
   256	MOVQ	0(DX), CX		// make sure g != nil
   257	get_tls(CX)
   258	MOVQ	DX, g(CX)
   259	MOVQ	gobuf_sp(BX), SP	// restore SP
   260	MOVQ	gobuf_ret(BX), AX
   261	MOVQ	gobuf_ctxt(BX), DX
   262	MOVQ	gobuf_bp(BX), BP
   263	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   264	MOVQ	$0, gobuf_ret(BX)
   265	MOVQ	$0, gobuf_ctxt(BX)
   266	MOVQ	$0, gobuf_bp(BX)
	// Resume at the saved PC; this function never returns.
   267	MOVQ	gobuf_pc(BX), BX
   268	JMP	BX
   269
   270// func mcall(fn func(*g))
   271// Switch to m->g0's stack, call fn(g).
   272// Fn must never return. It should gogo(&g->sched)
   273// to keep running g.
   274TEXT runtime·mcall(SB), NOSPLIT, $0-8
   275	MOVQ	fn+0(FP), DI
   276
   277	get_tls(CX)
   278	MOVQ	g(CX), AX	// save state in g->sched
   279	MOVQ	0(SP), BX	// caller's PC
   280	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   281	LEAQ	fn+0(FP), BX	// caller's SP
   282	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   283	MOVQ	AX, (g_sched+gobuf_g)(AX)
   284	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   285
   286	// switch to m->g0 & its stack, call fn
   287	MOVQ	g(CX), BX
   288	MOVQ	g_m(BX), BX
   289	MOVQ	m_g0(BX), SI
   290	CMPQ	SI, AX	// if g == m->g0 call badmcall
   291	JNE	3(PC)
   292	MOVQ	$runtime·badmcall(SB), AX
   293	JMP	AX
   294	MOVQ	SI, g(CX)	// g = m->g0
   295	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	// Push the old g as fn's argument; DX = funcval for the closure call.
   296	PUSHQ	AX
   297	MOVQ	DI, DX
   298	MOVQ	0(DI), DI
   299	CALL	DI
	// fn must not return; if it does, report badmcall2.
   300	POPQ	AX
   301	MOVQ	$runtime·badmcall2(SB), AX
   302	JMP	AX
   303	RET
   304
   305// systemstack_switch is a dummy routine that systemstack leaves at the bottom
   306// of the G stack. We need to distinguish the routine that
   307// lives at the bottom of the G stack from the one that lives
   308// at the top of the system stack because the one at the top of
   309// the system stack terminates the stack walk (see topofstack()).
   310TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	// Nothing to do: systemstack only records this function's PC in
	// g->sched; the body is never meaningfully executed.
   311	RET
   312
   313// func systemstack(fn func())
   314TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   315	MOVQ	fn+0(FP), DI	// DI = fn
   316	get_tls(CX)
   317	MOVQ	g(CX), AX	// AX = g
   318	MOVQ	g_m(AX), BX	// BX = m
   319
	// Already on the signal stack? No switch needed.
   320	CMPQ	AX, m_gsignal(BX)
   321	JEQ	noswitch
   322
   323	MOVQ	m_g0(BX), DX	// DX = g0
   324	CMPQ	AX, DX
   325	JEQ	noswitch
   326
   327	CMPQ	AX, m_curg(BX)
   328	JNE	bad
   329
   330	// switch stacks
   331	// save our state in g->sched. Pretend to
   332	// be systemstack_switch if the G stack is scanned.
   333	MOVQ	$runtime·systemstack_switch(SB), SI
   334	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   335	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   336	MOVQ	AX, (g_sched+gobuf_g)(AX)
   337	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   338
   339	// switch to g0
   340	MOVQ	DX, g(CX)
   341	MOVQ	(g_sched+gobuf_sp)(DX), BX
   342	// make it look like mstart called systemstack on g0, to stop traceback
   343	SUBQ	$8, BX
   344	MOVQ	$runtime·mstart(SB), DX
   345	MOVQ	DX, 0(BX)
   346	MOVQ	BX, SP
   347
   348	// call target function
	// DX = funcval (closure context); DI = code pointer loaded from it.
   349	MOVQ	DI, DX
   350	MOVQ	0(DI), DI
   351	CALL	DI
   352
   353	// switch back to g
   354	get_tls(CX)
   355	MOVQ	g(CX), AX
   356	MOVQ	g_m(AX), BX
   357	MOVQ	m_curg(BX), AX
   358	MOVQ	AX, g(CX)
   359	MOVQ	(g_sched+gobuf_sp)(AX), SP
   360	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   361	RET
   362
   363noswitch:
   364	// already on m stack; tail call the function
   365	// Using a tail call here cleans up tracebacks since we won't stop
   366	// at an intermediate systemstack.
   367	MOVQ	DI, DX
   368	MOVQ	0(DI), DI
   369	JMP	DI
   370
   371bad:
   372	// Bad: g is not gsignal, not g0, not curg. What is it?
   373	MOVQ	$runtime·badsystemstack(SB), AX
   374	CALL	AX
   375	INT	$3
   376
   377
   378/*
   379 * support for morestack
   380 */
   381
   382// Called during function prolog when more stack is needed.
   383//
   384// The traceback routines see morestack on a g0 as being
   385// the top of a stack (for example, morestack calling newstack
   386// calling the scheduler calling newm calling gc), so we must
   387// record an argument size. For that purpose, it has no arguments.
   388TEXT runtime·morestack(SB),NOSPLIT,$0-0
   389	// Cannot grow scheduler stack (m->g0).
   390	get_tls(CX)
   391	MOVQ	g(CX), BX
   392	MOVQ	g_m(BX), BX
   393	MOVQ	m_g0(BX), SI
   394	CMPQ	g(CX), SI
   395	JNE	3(PC)
   396	CALL	runtime·badmorestackg0(SB)
   397	CALL	runtime·abort(SB)
   398
   399	// Cannot grow signal stack (m->gsignal).
   400	MOVQ	m_gsignal(BX), SI
   401	CMPQ	g(CX), SI
   402	JNE	3(PC)
   403	CALL	runtime·badmorestackgsignal(SB)
   404	CALL	runtime·abort(SB)
   405
   406	// Called from f.
   407	// Set m->morebuf to f's caller.
   408	MOVQ	8(SP), AX	// f's caller's PC
   409	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   410	LEAQ	16(SP), AX	// f's caller's SP
   411	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   412	get_tls(CX)
   413	MOVQ	g(CX), SI
   414	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   415
   416	// Set g->sched to context in f.
   417	MOVQ	0(SP), AX // f's PC
   418	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   419	MOVQ	SI, (g_sched+gobuf_g)(SI)
   420	LEAQ	8(SP), AX // f's SP
   421	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   422	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   423	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   424
   425	// Call newstack on m->g0's stack.
   426	MOVQ	m_g0(BX), BX
   427	MOVQ	BX, g(CX)
   428	MOVQ	(g_sched+gobuf_sp)(BX), SP
   429	CALL	runtime·newstack(SB)
   430	CALL	runtime·abort(SB)	// crash if newstack returns
   431	RET
   432
   433// morestack but not preserving ctxt.
// Zeroes DX (the ctxt register) before tail-calling morestack, so a
// zero ctxt is recorded in g->sched.
   434TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   435	MOVL	$0, DX
   436	JMP	runtime·morestack(SB)
   437
   438// reflectcall: call a function with the given argument list
   439// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   440// we don't have variable-sized frames, so we use a small number
   441// of constant-sized-frame functions to encode a few bits of size in the pc.
   442// Caution: ugly multiline assembly macros in your future!

// DISPATCH(NAME, MAXSIZE): if the argument size in CX is <= MAXSIZE,
// jump to NAME; otherwise fall through to the next DISPATCH.
   444#define DISPATCH(NAME,MAXSIZE)		\
   445	CMPQ	CX, $MAXSIZE;		\
   446	JA	3(PC);			\
   447	MOVQ	$NAME(SB), AX;		\
   448	JMP	AX
   449// Note: can't just "JMP NAME(SB)" - bad inlining results.
   450
// ·reflectcall selects the smallest fixed-frame ·callN that can hold the
// argument block (size in CX) and tail-jumps to it; sizes above 1 GB are
// rejected via badreflectcall.
   451TEXT ·reflectcall(SB), NOSPLIT, $0-32
   452	MOVLQZX argsize+24(FP), CX
   453	DISPATCH(runtime·call32, 32)
   454	DISPATCH(runtime·call64, 64)
   455	DISPATCH(runtime·call128, 128)
   456	DISPATCH(runtime·call256, 256)
   457	DISPATCH(runtime·call512, 512)
   458	DISPATCH(runtime·call1024, 1024)
   459	DISPATCH(runtime·call2048, 2048)
   460	DISPATCH(runtime·call4096, 4096)
   461	DISPATCH(runtime·call8192, 8192)
   462	DISPATCH(runtime·call16384, 16384)
   463	DISPATCH(runtime·call32768, 32768)
   464	DISPATCH(runtime·call65536, 65536)
   465	DISPATCH(runtime·call131072, 131072)
   466	DISPATCH(runtime·call262144, 262144)
   467	DISPATCH(runtime·call524288, 524288)
   468	DISPATCH(runtime·call1048576, 1048576)
   469	DISPATCH(runtime·call2097152, 2097152)
   470	DISPATCH(runtime·call4194304, 4194304)
   471	DISPATCH(runtime·call8388608, 8388608)
   472	DISPATCH(runtime·call16777216, 16777216)
   473	DISPATCH(runtime·call33554432, 33554432)
   474	DISPATCH(runtime·call67108864, 67108864)
   475	DISPATCH(runtime·call134217728, 134217728)
   476	DISPATCH(runtime·call268435456, 268435456)
   477	DISPATCH(runtime·call536870912, 536870912)
   478	DISPATCH(runtime·call1073741824, 1073741824)
   479	MOVQ	$runtime·badreflectcall(SB), AX
   480	JMP	AX
   481
// CALLFN(NAME, MAXSIZE) defines one fixed-frame call function: it copies
// the argument block onto its own frame, calls f (funcval in DX), then
// hands off to callRet<> to copy results (from retoffset on) back through
// the write barrier.
   482#define CALLFN(NAME,MAXSIZE)			\
   483TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   484	NO_LOCAL_POINTERS;			\
   485	/* copy arguments to stack */		\
   486	MOVQ	argptr+16(FP), SI;		\
   487	MOVLQZX argsize+24(FP), CX;		\
   488	MOVQ	SP, DI;				\
   489	REP;MOVSB;				\
   490	/* call function */			\
   491	MOVQ	f+8(FP), DX;			\
   492	PCDATA  $PCDATA_StackMapIndex, $0;	\
   493	CALL	(DX);				\
   494	/* copy return values back */		\
   495	MOVQ	argtype+0(FP), DX;		\
   496	MOVQ	argptr+16(FP), DI;		\
   497	MOVLQZX	argsize+24(FP), CX;		\
   498	MOVLQZX	retoffset+28(FP), BX;		\
   499	MOVQ	SP, SI;				\
   500	ADDQ	BX, DI;				\
   501	ADDQ	BX, SI;				\
   502	SUBQ	BX, CX;				\
   503	CALL	callRet<>(SB);			\
   504	RET
   505
   506// callRet copies return values back at the end of call*. This is a
   507// separate function so it can allocate stack space for the arguments
   508// to reflectcallmove. It does not follow the Go ABI; it expects its
   509// arguments in registers.
// In: DX = argtype, DI = destination, SI = source, CX = byte count.
   510TEXT callRet<>(SB), NOSPLIT, $32-0
   511	NO_LOCAL_POINTERS
   512	MOVQ	DX, 0(SP)
   513	MOVQ	DI, 8(SP)
   514	MOVQ	SI, 16(SP)
   515	MOVQ	CX, 24(SP)
   516	CALL	runtime·reflectcallmove(SB)
   517	RET
   518
// Instantiate the fixed-frame call functions used by reflectcall's size
// dispatch, one per power-of-two argument-block size from 32 B to 1 GB.
   519CALLFN(·call32, 32)
   520CALLFN(·call64, 64)
   521CALLFN(·call128, 128)
   522CALLFN(·call256, 256)
   523CALLFN(·call512, 512)
   524CALLFN(·call1024, 1024)
   525CALLFN(·call2048, 2048)
   526CALLFN(·call4096, 4096)
   527CALLFN(·call8192, 8192)
   528CALLFN(·call16384, 16384)
   529CALLFN(·call32768, 32768)
   530CALLFN(·call65536, 65536)
   531CALLFN(·call131072, 131072)
   532CALLFN(·call262144, 262144)
   533CALLFN(·call524288, 524288)
   534CALLFN(·call1048576, 1048576)
   535CALLFN(·call2097152, 2097152)
   536CALLFN(·call4194304, 4194304)
   537CALLFN(·call8388608, 8388608)
   538CALLFN(·call16777216, 16777216)
   539CALLFN(·call33554432, 33554432)
   540CALLFN(·call67108864, 67108864)
   541CALLFN(·call134217728, 134217728)
   542CALLFN(·call268435456, 268435456)
   543CALLFN(·call536870912, 536870912)
   544CALLFN(·call1073741824, 1073741824)
   545
// func procyield(cycles uint32)
// Busy-spin for the given iteration count, issuing PAUSE each time to
// hint the CPU that this is a spin-wait loop.
   546TEXT runtime·procyield(SB),NOSPLIT,$0-0
   547	MOVL	cycles+0(FP), AX
   548again:
   549	PAUSE
   550	SUBL	$1, AX
   551	JNZ	again
   552	RET
   553
   554
// func publicationBarrier()
   555TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   556	// Stores are already ordered on x86, so this is just a
   557	// compile barrier.
   558	RET
   559
   560// func jmpdefer(fv *funcval, argp uintptr)
   561// argp is a caller SP.
   562// called from deferreturn.
   563// 1. pop the caller
   564// 2. sub 5 bytes from the callers return
   565// 3. jmp to the argument
   566TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   567	MOVQ	fv+0(FP), DX	// fn
   568	MOVQ	argp+8(FP), BX	// caller sp
   569	LEAQ	-8(BX), SP	// caller sp after CALL
   570	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	// 5 = size of the CALL instruction, so the caller re-executes its
	// CALL to deferreturn after the deferred function runs.
   571	SUBQ	$5, (SP)	// return to CALL again
   572	MOVQ	0(DX), BX
   573	JMP	BX	// but first run the deferred function
   574
   575// Save state of caller into g->sched. Smashes R8, R9.
// Used by asmcgocall before switching to the system stack.
   576TEXT gosave<>(SB),NOSPLIT,$0
   577	get_tls(R8)
   578	MOVQ	g(R8), R8
   579	MOVQ	0(SP), R9
   580	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   581	LEAQ	8(SP), R9
   582	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   583	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   584	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   585	// Assert ctxt is zero. See func save.
   586	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   587	TESTQ	R9, R9
   588	JZ	2(PC)
   589	CALL	runtime·badctxt(SB)
   590	RET
   591
   592// func asmcgocall(fn, arg unsafe.Pointer) int32
   593// Call fn(arg) on the scheduler stack,
   594// aligned appropriately for the gcc ABI.
   595// See cgocall.go for more details.
   596TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   597	MOVQ	fn+0(FP), AX
   598	MOVQ	arg+8(FP), BX
   599
	// DX = incoming SP, used below to record stack depth.
   600	MOVQ	SP, DX
   601
   602	// Figure out if we need to switch to m->g0 stack.
   603	// We get called to create new OS threads too, and those
   604	// come in on the m->g0 stack already.
   605	get_tls(CX)
   606	MOVQ	g(CX), R8
   607	CMPQ	R8, $0
   608	JEQ	nosave
   609	MOVQ	g_m(R8), R8
   610	MOVQ	m_g0(R8), SI
   611	MOVQ	g(CX), DI
   612	CMPQ	SI, DI
   613	JEQ	nosave
   614	MOVQ	m_gsignal(R8), SI
   615	CMPQ	SI, DI
   616	JEQ	nosave
   617
   618	// Switch to system stack.
   619	MOVQ	m_g0(R8), SI
   620	CALL	gosave<>(SB)
   621	MOVQ	SI, g(CX)
   622	MOVQ	(g_sched+gobuf_sp)(SI), SP
   623
   624	// Now on a scheduling stack (a pthread-created stack).
   625	// Make sure we have enough room for 4 stack-backed fast-call
   626	// registers as per windows amd64 calling convention.
   627	SUBQ	$64, SP
   628	ANDQ	$~15, SP	// alignment for gcc ABI
   629	MOVQ	DI, 48(SP)	// save g
   630	MOVQ	(g_stack+stack_hi)(DI), DI
   631	SUBQ	DX, DI
   632	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   633	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   634	MOVQ	BX, CX		// CX = first argument in Win64
   635	CALL	AX
   636
   637	// Restore registers, g, stack pointer.
	// Recompute the goroutine SP from stack_hi minus the saved depth,
	// in case the stack was moved during a callback.
   638	get_tls(CX)
   639	MOVQ	48(SP), DI
   640	MOVQ	(g_stack+stack_hi)(DI), SI
   641	SUBQ	40(SP), SI
   642	MOVQ	DI, g(CX)
   643	MOVQ	SI, SP
   644
   645	MOVL	AX, ret+16(FP)
   646	RET
   647
   648nosave:
   649	// Running on a system stack, perhaps even without a g.
   650	// Having no g can happen during thread creation or thread teardown
   651	// (see needm/dropm on Solaris, for example).
   652	// This code is like the above sequence but without saving/restoring g
   653	// and without worrying about the stack moving out from under us
   654	// (because we're on a system stack, not a goroutine stack).
   655	// The above code could be used directly if already on a system stack,
   656	// but then the only path through this code would be a rare case on Solaris.
   657	// Using this code for all "already on system stack" calls exercises it more,
   658	// which should help keep it correct.
   659	SUBQ	$64, SP
   660	ANDQ	$~15, SP
   661	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   662	MOVQ	DX, 40(SP)	// save original stack pointer
   663	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   664	MOVQ	BX, CX		// CX = first argument in Win64
   665	CALL	AX
   666	MOVQ	40(SP), SI	// restore original stack pointer
   667	MOVQ	SI, SP
   668	MOVL	AX, ret+16(FP)
   669	RET
   670
   671// func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr)
   672// Turn the fn into a Go func (by taking its address) and call
   673// cgocallback_gofunc.
   674TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	// &fn acts as a *funcval: marshal all four arguments onto the frame
	// and forward to cgocallback_gofunc.
   675	LEAQ	fn+0(FP), AX
   676	MOVQ	AX, 0(SP)
   677	MOVQ	frame+8(FP), AX
   678	MOVQ	AX, 8(SP)
   679	MOVQ	framesize+16(FP), AX
   680	MOVQ	AX, 16(SP)
   681	MOVQ	ctxt+24(FP), AX
   682	MOVQ	AX, 24(SP)
   683	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   684	CALL	AX
   685	RET
   686
   687// func cgocallback_gofunc(fn, frame, framesize, ctxt uintptr)
   688// See cgocall.go for more details.
   689TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   690	NO_LOCAL_POINTERS
   691
   692	// If g is nil, Go did not create the current thread.
   693	// Call needm to obtain one m for temporary use.
   694	// In this case, we're running on the thread stack, so there's
   695	// lots of space, but the linker doesn't know. Hide the call from
   696	// the linker analysis by using an indirect call through AX.
   697	get_tls(CX)
   698#ifdef GOOS_windows
   699	MOVL	$0, BX
   700	CMPQ	CX, $0
   701	JEQ	2(PC)
   702#endif
   703	MOVQ	g(CX), BX
   704	CMPQ	BX, $0
   705	JEQ	needm
   706	MOVQ	g_m(BX), BX
   707	MOVQ	BX, R8 // holds oldm until end of function
   708	JMP	havem
   709needm:
	// R8 = 0 marks "m was borrowed"; checked before dropm below.
   710	MOVQ	$0, 0(SP)
   711	MOVQ	$runtime·needm(SB), AX
   712	CALL	AX
   713	MOVQ	0(SP), R8
   714	get_tls(CX)
   715	MOVQ	g(CX), BX
   716	MOVQ	g_m(BX), BX
   717
   718	// Set m->sched.sp = SP, so that if a panic happens
   719	// during the function we are about to execute, it will
   720	// have a valid SP to run on the g0 stack.
   721	// The next few lines (after the havem label)
   722	// will save this SP onto the stack and then write
   723	// the same SP back to m->sched.sp. That seems redundant,
   724	// but if an unrecovered panic happens, unwindm will
   725	// restore the g->sched.sp from the stack location
   726	// and then systemstack will try to use it. If we don't set it here,
   727	// that restored SP will be uninitialized (typically 0) and
   728	// will not be usable.
   729	MOVQ	m_g0(BX), SI
   730	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   731
   732havem:
   733	// Now there's a valid m, and we're running on its m->g0.
   734	// Save current m->g0->sched.sp on stack and then set it to SP.
   735	// Save current sp in m->g0->sched.sp in preparation for
   736	// switch back to m->curg stack.
   737	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   738	MOVQ	m_g0(BX), SI
   739	MOVQ	(g_sched+gobuf_sp)(SI), AX
   740	MOVQ	AX, 0(SP)
   741	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   742
   743	// Switch to m->curg stack and call runtime.cgocallbackg.
   744	// Because we are taking over the execution of m->curg
   745	// but *not* resuming what had been running, we need to
   746	// save that information (m->curg->sched) so we can restore it.
   747	// We can restore m->curg->sched.sp easily, because calling
   748	// runtime.cgocallbackg leaves SP unchanged upon return.
   749	// To save m->curg->sched.pc, we push it onto the stack.
   750	// This has the added benefit that it looks to the traceback
   751	// routine like cgocallbackg is going to return to that
   752	// PC (because the frame we allocate below has the same
   753	// size as cgocallback_gofunc's frame declared above)
   754	// so that the traceback will seamlessly trace back into
   755	// the earlier calls.
   756	//
   757	// In the new goroutine, 8(SP) holds the saved R8.
   758	MOVQ	m_curg(BX), SI
   759	MOVQ	SI, g(CX)
   760	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   761	MOVQ	(g_sched+gobuf_pc)(SI), BX
   762	MOVQ	BX, -8(DI)
   763	// Compute the size of the frame, including return PC and, if
   764	// GOEXPERIMENT=framepointer, the saved base pointer
   765	MOVQ	ctxt+24(FP), BX
   766	LEAQ	fv+0(FP), AX
   767	SUBQ	SP, AX
   768	SUBQ	AX, DI
   769	MOVQ	DI, SP
   770
   771	MOVQ	R8, 8(SP)
   772	MOVQ	BX, 0(SP)
   773	CALL	runtime·cgocallbackg(SB)
   774	MOVQ	8(SP), R8
   775
   776	// Compute the size of the frame again. FP and SP have
   777	// completely different values here than they did above,
   778	// but only their difference matters.
   779	LEAQ	fv+0(FP), AX
   780	SUBQ	SP, AX
   781
   782	// Restore g->sched (== m->curg->sched) from saved values.
   783	get_tls(CX)
   784	MOVQ	g(CX), SI
   785	MOVQ	SP, DI
   786	ADDQ	AX, DI
   787	MOVQ	-8(DI), BX
   788	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   789	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   790
   791	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   792	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   793	// so we do not have to restore it.)
   794	MOVQ	g(CX), BX
   795	MOVQ	g_m(BX), BX
   796	MOVQ	m_g0(BX), SI
   797	MOVQ	SI, g(CX)
   798	MOVQ	(g_sched+gobuf_sp)(SI), SP
   799	MOVQ	0(SP), AX
   800	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   801
   802	// If the m on entry was nil, we called needm above to borrow an m
   803	// for the duration of the call. Since the call is over, return it with dropm.
   804	CMPQ	R8, $0
   805	JNE 3(PC)
   806	MOVQ	$runtime·dropm(SB), AX
   807	CALL	AX
   808
   809	// Done!
   810	RET
   811
   812// func setg(gg *g)
   813// set g. for use by needm.
   814TEXT runtime·setg(SB), NOSPLIT, $0-8
   815	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	// On Windows the TLS slot lives at 0x28(GS); keep it in sync
	// (clear it when gg is nil).
   817	CMPQ	BX, $0
   818	JNE	settls
   819	MOVQ	$0, 0x28(GS)
   820	RET
   821settls:
   822	MOVQ	g_m(BX), AX
   823	LEAQ	m_tls(AX), AX
   824	MOVQ	AX, 0x28(GS)
#endif
   826	get_tls(CX)
   827	MOVQ	BX, g(CX)
   828	RET
   829
   830// void setg_gcc(G*); set g called from gcc.
// Receives the g in DI per the C calling convention; this is the
// callback registered with _cgo_init in rt0_go.
   831TEXT setg_gcc<>(SB),NOSPLIT,$0
   832	get_tls(AX)
   833	MOVQ	DI, g(AX)
   834	RET
   835
// func abort()
// Crash hard with a breakpoint trap; spin forever if the trap returns.
   836TEXT runtime·abort(SB),NOSPLIT,$0-0
   837	INT	$3
loop:
   839	JMP	loop
   840
   841// check that SP is in range [g->stack.lo, g->stack.hi)
// Aborts on violation; debugging aid only.
   842TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   843	get_tls(CX)
   844	MOVQ	g(CX), AX
   845	CMPQ	(g_stack+stack_hi)(AX), SP
   846	JHI	2(PC)
   847	CALL	runtime·abort(SB)
   848	CMPQ	SP, (g_stack+stack_lo)(AX)
   849	JHI	2(PC)
   850	CALL	runtime·abort(SB)
   851	RET
   852
   853// func cputicks() int64
// Read the time-stamp counter, serialized with LFENCE on Intel
// (see rt0_go's vendor check) or MFENCE otherwise.
   854TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   855	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   856	JNE	mfence
   857	LFENCE
   858	JMP	done
   859mfence:
   860	MFENCE
   861done:
	// RDTSC yields the counter split across DX:AX; merge into 64 bits.
   862	RDTSC
   863	SHLQ	$32, DX
   864	ADDQ	DX, AX
   865	MOVQ	AX, ret+0(FP)
   866	RET
   867
   868// func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
   869// hash function using AES hardware instructions
// Sets up AX=data, CX=length, DX=&ret per aeshashbody's register contract.
   870TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   871	MOVQ	p+0(FP), AX	// ptr to data
   872	MOVQ	s+16(FP), CX	// size
   873	LEAQ	ret+24(FP), DX
   874	JMP	runtime·aeshashbody(SB)
   875
   876// func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
// Unpacks the string header (data pointer, length) and tail-jumps to
// aeshashbody with AX=data, CX=length, DX=&ret.
   877TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   878	MOVQ	p+0(FP), AX	// ptr to string struct
   879	MOVQ	8(AX), CX	// length of string
   880	MOVQ	(AX), AX	// string data
   881	LEAQ	ret+16(FP), DX
   882	JMP	runtime·aeshashbody(SB)
   883
   884// AX: data
   885// CX: length
   886// DX: address to put return value
   887TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   888	// Fill an SSE register with our seeds.
   889	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   890	PINSRW	$4, CX, X0			// 16 bits of length
   891	PSHUFHW $0, X0, X0			// repeat length 4 times total
   892	MOVO	X0, X1				// save unscrambled seed
   893	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   894	AESENC	X0, X0				// scramble seed
   895
   896	CMPQ	CX, $16
   897	JB	aes0to15
   898	JE	aes16
   899	CMPQ	CX, $32
   900	JBE	aes17to32
   901	CMPQ	CX, $64
   902	JBE	aes33to64
   903	CMPQ	CX, $128
   904	JBE	aes65to128
   905	JMP	aes129plus
   906
   907aes0to15:
   908	TESTQ	CX, CX
   909	JE	aes0
   910
   911	ADDQ	$16, AX
   912	TESTW	$0xff0, AX
   913	JE	endofpage
   914
   915	// 16 bytes loaded at this address won't cross
   916	// a page boundary, so we can load it directly.
   917	MOVOU	-16(AX), X1
   918	ADDQ	CX, CX
   919	MOVQ	$masks<>(SB), AX
   920	PAND	(AX)(CX*8), X1
   921final1:
   922	PXOR	X0, X1	// xor data with seed
   923	AESENC	X1, X1	// scramble combo 3 times
   924	AESENC	X1, X1
   925	AESENC	X1, X1
   926	MOVQ	X1, (DX)
   927	RET
   928
   929endofpage:
   930	// address ends in 1111xxxx. Might be up against
   931	// a page boundary, so load ending at last byte.
   932	// Then shift bytes down using pshufb.
   933	MOVOU	-32(AX)(CX*1), X1
   934	ADDQ	CX, CX
   935	MOVQ	$shifts<>(SB), AX
   936	PSHUFB	(AX)(CX*8), X1
   937	JMP	final1
   938
   939aes0:
   940	// Return scrambled input seed
   941	AESENC	X0, X0
   942	MOVQ	X0, (DX)
   943	RET
   944
   945aes16:
   946	MOVOU	(AX), X1
   947	JMP	final1
   948
   949aes17to32:
   950	// make second starting seed
   951	PXOR	runtime·aeskeysched+16(SB), X1
   952	AESENC	X1, X1
   953
   954	// load data to be hashed
   955	MOVOU	(AX), X2
   956	MOVOU	-16(AX)(CX*1), X3
   957
   958	// xor with seed
   959	PXOR	X0, X2
   960	PXOR	X1, X3
   961
   962	// scramble 3 times
   963	AESENC	X2, X2
   964	AESENC	X3, X3
   965	AESENC	X2, X2
   966	AESENC	X3, X3
   967	AESENC	X2, X2
   968	AESENC	X3, X3
   969
   970	// combine results
   971	PXOR	X3, X2
   972	MOVQ	X2, (DX)
   973	RET
   974
   975aes33to64:
   976	// make 3 more starting seeds
   977	MOVO	X1, X2
   978	MOVO	X1, X3
   979	PXOR	runtime·aeskeysched+16(SB), X1
   980	PXOR	runtime·aeskeysched+32(SB), X2
   981	PXOR	runtime·aeskeysched+48(SB), X3
   982	AESENC	X1, X1
   983	AESENC	X2, X2
   984	AESENC	X3, X3
   985
   986	MOVOU	(AX), X4
   987	MOVOU	16(AX), X5
   988	MOVOU	-32(AX)(CX*1), X6
   989	MOVOU	-16(AX)(CX*1), X7
   990
   991	PXOR	X0, X4
   992	PXOR	X1, X5
   993	PXOR	X2, X6
   994	PXOR	X3, X7
   995
   996	AESENC	X4, X4
   997	AESENC	X5, X5
   998	AESENC	X6, X6
   999	AESENC	X7, X7
  1000
  1001	AESENC	X4, X4
  1002	AESENC	X5, X5
  1003	AESENC	X6, X6
  1004	AESENC	X7, X7
  1005
  1006	AESENC	X4, X4
  1007	AESENC	X5, X5
  1008	AESENC	X6, X6
  1009	AESENC	X7, X7
  1010
  1011	PXOR	X6, X4
  1012	PXOR	X7, X5
  1013	PXOR	X5, X4
  1014	MOVQ	X4, (DX)
  1015	RET
  1016
  1017aes65to128:
  1018	// make 7 more starting seeds
  1019	MOVO	X1, X2
  1020	MOVO	X1, X3
  1021	MOVO	X1, X4
  1022	MOVO	X1, X5
  1023	MOVO	X1, X6
  1024	MOVO	X1, X7
  1025	PXOR	runtime·aeskeysched+16(SB), X1
  1026	PXOR	runtime·aeskeysched+32(SB), X2
  1027	PXOR	runtime·aeskeysched+48(SB), X3
  1028	PXOR	runtime·aeskeysched+64(SB), X4
  1029	PXOR	runtime·aeskeysched+80(SB), X5
  1030	PXOR	runtime·aeskeysched+96(SB), X6
  1031	PXOR	runtime·aeskeysched+112(SB), X7
  1032	AESENC	X1, X1
  1033	AESENC	X2, X2
  1034	AESENC	X3, X3
  1035	AESENC	X4, X4
  1036	AESENC	X5, X5
  1037	AESENC	X6, X6
  1038	AESENC	X7, X7
  1039
  1040	// load data
  1041	MOVOU	(AX), X8
  1042	MOVOU	16(AX), X9
  1043	MOVOU	32(AX), X10
  1044	MOVOU	48(AX), X11
  1045	MOVOU	-64(AX)(CX*1), X12
  1046	MOVOU	-48(AX)(CX*1), X13
  1047	MOVOU	-32(AX)(CX*1), X14
  1048	MOVOU	-16(AX)(CX*1), X15
  1049
  1050	// xor with seed
  1051	PXOR	X0, X8
  1052	PXOR	X1, X9
  1053	PXOR	X2, X10
  1054	PXOR	X3, X11
  1055	PXOR	X4, X12
  1056	PXOR	X5, X13
  1057	PXOR	X6, X14
  1058	PXOR	X7, X15
  1059
  1060	// scramble 3 times
  1061	AESENC	X8, X8
  1062	AESENC	X9, X9
  1063	AESENC	X10, X10
  1064	AESENC	X11, X11
  1065	AESENC	X12, X12
  1066	AESENC	X13, X13
  1067	AESENC	X14, X14
  1068	AESENC	X15, X15
  1069
  1070	AESENC	X8, X8
  1071	AESENC	X9, X9
  1072	AESENC	X10, X10
  1073	AESENC	X11, X11
  1074	AESENC	X12, X12
  1075	AESENC	X13, X13
  1076	AESENC	X14, X14
  1077	AESENC	X15, X15
  1078
  1079	AESENC	X8, X8
  1080	AESENC	X9, X9
  1081	AESENC	X10, X10
  1082	AESENC	X11, X11
  1083	AESENC	X12, X12
  1084	AESENC	X13, X13
  1085	AESENC	X14, X14
  1086	AESENC	X15, X15
  1087
  1088	// combine results
  1089	PXOR	X12, X8
  1090	PXOR	X13, X9
  1091	PXOR	X14, X10
  1092	PXOR	X15, X11
  1093	PXOR	X10, X8
  1094	PXOR	X11, X9
  1095	PXOR	X9, X8
  1096	MOVQ	X8, (DX)
  1097	RET
  1098
  1099aes129plus:
  1100	// make 7 more starting seeds
  1101	MOVO	X1, X2
  1102	MOVO	X1, X3
  1103	MOVO	X1, X4
  1104	MOVO	X1, X5
  1105	MOVO	X1, X6
  1106	MOVO	X1, X7
  1107	PXOR	runtime·aeskeysched+16(SB), X1
  1108	PXOR	runtime·aeskeysched+32(SB), X2
  1109	PXOR	runtime·aeskeysched+48(SB), X3
  1110	PXOR	runtime·aeskeysched+64(SB), X4
  1111	PXOR	runtime·aeskeysched+80(SB), X5
  1112	PXOR	runtime·aeskeysched+96(SB), X6
  1113	PXOR	runtime·aeskeysched+112(SB), X7
  1114	AESENC	X1, X1
  1115	AESENC	X2, X2
  1116	AESENC	X3, X3
  1117	AESENC	X4, X4
  1118	AESENC	X5, X5
  1119	AESENC	X6, X6
  1120	AESENC	X7, X7
  1121
  1122	// start with last (possibly overlapping) block
  1123	MOVOU	-128(AX)(CX*1), X8
  1124	MOVOU	-112(AX)(CX*1), X9
  1125	MOVOU	-96(AX)(CX*1), X10
  1126	MOVOU	-80(AX)(CX*1), X11
  1127	MOVOU	-64(AX)(CX*1), X12
  1128	MOVOU	-48(AX)(CX*1), X13
  1129	MOVOU	-32(AX)(CX*1), X14
  1130	MOVOU	-16(AX)(CX*1), X15
  1131
  1132	// xor in seed
  1133	PXOR	X0, X8
  1134	PXOR	X1, X9
  1135	PXOR	X2, X10
  1136	PXOR	X3, X11
  1137	PXOR	X4, X12
  1138	PXOR	X5, X13
  1139	PXOR	X6, X14
  1140	PXOR	X7, X15
  1141
  1142	// compute number of remaining 128-byte blocks
  1143	DECQ	CX
  1144	SHRQ	$7, CX
  1145
  1146aesloop:
  1147	// scramble state
  1148	AESENC	X8, X8
  1149	AESENC	X9, X9
  1150	AESENC	X10, X10
  1151	AESENC	X11, X11
  1152	AESENC	X12, X12
  1153	AESENC	X13, X13
  1154	AESENC	X14, X14
  1155	AESENC	X15, X15
  1156
  1157	// scramble state, xor in a block
  1158	MOVOU	(AX), X0
  1159	MOVOU	16(AX), X1
  1160	MOVOU	32(AX), X2
  1161	MOVOU	48(AX), X3
  1162	AESENC	X0, X8
  1163	AESENC	X1, X9
  1164	AESENC	X2, X10
  1165	AESENC	X3, X11
  1166	MOVOU	64(AX), X4
  1167	MOVOU	80(AX), X5
  1168	MOVOU	96(AX), X6
  1169	MOVOU	112(AX), X7
  1170	AESENC	X4, X12
  1171	AESENC	X5, X13
  1172	AESENC	X6, X14
  1173	AESENC	X7, X15
  1174
  1175	ADDQ	$128, AX
  1176	DECQ	CX
  1177	JNE	aesloop
  1178
  1179	// 3 more scrambles to finish
  1180	AESENC	X8, X8
  1181	AESENC	X9, X9
  1182	AESENC	X10, X10
  1183	AESENC	X11, X11
  1184	AESENC	X12, X12
  1185	AESENC	X13, X13
  1186	AESENC	X14, X14
  1187	AESENC	X15, X15
  1188	AESENC	X8, X8
  1189	AESENC	X9, X9
  1190	AESENC	X10, X10
  1191	AESENC	X11, X11
  1192	AESENC	X12, X12
  1193	AESENC	X13, X13
  1194	AESENC	X14, X14
  1195	AESENC	X15, X15
  1196	AESENC	X8, X8
  1197	AESENC	X9, X9
  1198	AESENC	X10, X10
  1199	AESENC	X11, X11
  1200	AESENC	X12, X12
  1201	AESENC	X13, X13
  1202	AESENC	X14, X14
  1203	AESENC	X15, X15
  1204
  1205	PXOR	X12, X8
  1206	PXOR	X13, X9
  1207	PXOR	X14, X10
  1208	PXOR	X15, X11
  1209	PXOR	X10, X8
  1210	PXOR	X11, X9
  1211	PXOR	X9, X8
  1212	MOVQ	X8, (DX)
  1213	RET
  1214
// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
// Hash a 4-byte value with AES rounds keyed by runtime·aeskeysched.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data; placed in lane 2 so it does not overlap the seed in lanes 0-1
	AESENC	runtime·aeskeysched+0(SB), X0	// 3 scramble rounds mix seed and data
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)	// low 64 bits of the state are the hash
	RET
  1225
// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
// Hash an 8-byte value with AES rounds keyed by runtime·aeskeysched.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data; placed in the high qword, seed stays in the low qword
	AESENC	runtime·aeskeysched+0(SB), X0	// 3 scramble rounds mix seed and data
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)	// low 64 bits of the state are the hash
	RET
  1236
// simple mask to get rid of data in the high part of the register.
// masks<> holds 16 entries of 16 bytes each; entry k (offset k*16) has
// its low k bytes set to 0xff and the remaining bytes zero, so ANDing
// with entry k keeps only the low k bytes of an XMM register.
// Must be 16-byte aligned (verified by checkASM below).
DATA masks<>+0x00(SB)/8, $0x0000000000000000	// k=0: keep nothing
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff	// k=1
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff	// k=2
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff	// k=3
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff	// k=4
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff	// k=5
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff	// k=6
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff	// k=7
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff	// k=8: full low qword
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff	// k=9..15: low qword plus k-8 bytes of the high qword
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
  1271
// func checkASM() bool
// Reports whether assembly-level invariants hold; here, that the
// masks<> and shifts<> tables are 16-byte aligned as the XMM code requires.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX		// any misaligned low bit of either address survives the OR
	TESTQ	$15, AX		// low 4 bits must all be zero for 16-byte alignment
	SETEQ	ret+0(FP)	// true iff both tables are aligned
	RET
  1281
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// Entry k (offset k*16) selects source bytes 16-k..15 into destination
// bytes 0..k-1; lanes holding 0xff make PSHUFB write zero. Indexed at
// (AX)(CX*8) with CX doubled, i.e. entry stride 16 (see use above).
// Must be 16-byte aligned (verified by checkASM).
DATA shifts<>+0x00(SB)/8, $0x0000000000000000	// k=0: no bytes moved
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f	// k=1: byte 15 -> byte 0
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e	// k=2: bytes 14..15 -> 0..1
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d	// k=3
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c	// k=4
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b	// k=5
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a	// k=6
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09	// k=7
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908	// k=8: high qword -> low qword
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807	// k=9..15: progressively smaller shifts
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
  1318
// return0 sets the 32-bit integer return register (AX) to zero.
// NOTE(review): presumably used where the runtime needs a canned
// "return 0" function value — callers are not visible in this chunk.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET
  1322
  1323
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)			// CX = TLS base
	MOVQ	g(CX), AX		// AX = current g
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVQ	m_curg(AX), AX		// AX = m.curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi (C return value)
	RET
  1333
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP: makes goexit+PCQuantum land inside this function
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP: keeps the CALL's return address within goexit's code range
  1341
// This is called from .init_array and follows the platform, not Go, ABI.
// Appends the moduledata passed in DI (first C argument) to the
// runtime's linked list of modules.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current list tail
	MOVQ	DI, moduledata_next(AX)		// tail.next = new moduledata
	MOVQ	DI, runtime·lastmoduledatap(SB)	// tail pointer = new moduledata
	POPQ	R15
	RET
  1350
// gcWriteBarrier performs a heap pointer write and informs the GC.
//
// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
// - DI is the destination of the write
// - AX is the value being written at DI
// It clobbers FLAGS. It does not clobber any general-purpose registers,
// but may clobber others (e.g., SSE registers).
//
// Fast path: append the (value, old value) pair to the per-P write
// barrier buffer; slow path (buffer full): spill registers and call
// runtime·wbBufFlush. In both cases the actual store to (DI) happens
// at "ret" below, after the barrier has recorded the write.
TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120
	// Save the registers clobbered by the fast path. This is slightly
	// faster than having the caller spill these.
	MOVQ	R14, 104(SP)
	MOVQ	R13, 112(SP)
	// TODO: Consider passing g.m.p in as an argument so they can be shared
	// across a sequence of write barriers.
	get_tls(R13)
	MOVQ	g(R13), R13
	MOVQ	g_m(R13), R13
	MOVQ	m_p(R13), R13			// R13 = g.m.p
	MOVQ	(p_wbBuf+wbBuf_next)(R13), R14	// R14 = p.wbBuf.next
	// Increment wbBuf.next position.
	LEAQ	16(R14), R14			// reserve two 8-byte record slots
	MOVQ	R14, (p_wbBuf+wbBuf_next)(R13)
	CMPQ	R14, (p_wbBuf+wbBuf_end)(R13)	// sets flags for the JEQ below; stores in between must not touch flags
	// Record the write.
	MOVQ	AX, -16(R14)	// Record value
	// Note: This turns bad pointer writes into bad
	// pointer reads, which could be confusing. We could avoid
	// reading from obviously bad pointers, which would
	// take care of the vast majority of these. We could
	// patch this up in the signal handler, or use XCHG to
	// combine the read and the write.
	MOVQ	(DI), R13
	MOVQ	R13, -8(R14)	// Record *slot
	// Is the buffer full? (flags set in CMPQ above)
	JEQ	flush
ret:
	MOVQ	104(SP), R14	// restore the two registers the fast path used
	MOVQ	112(SP), R13
	// Do the write.
	MOVQ	AX, (DI)
	RET

flush:
	// Save all general purpose registers since these could be
	// clobbered by wbBufFlush and were not saved by the caller.
	// It is possible for wbBufFlush to clobber other registers
	// (e.g., SSE registers), but the compiler takes care of saving
	// those in the caller if necessary. This strikes a balance
	// with registers that are likely to be used.
	//
	// We don't have type information for these, but all code under
	// here is NOSPLIT, so nothing will observe these.
	//
	// TODO: We could strike a different balance; e.g., saving X0
	// and not saving GP registers that are less likely to be used.
	MOVQ	DI, 0(SP)	// Also first argument to wbBufFlush
	MOVQ	AX, 8(SP)	// Also second argument to wbBufFlush
	MOVQ	BX, 16(SP)
	MOVQ	CX, 24(SP)
	MOVQ	DX, 32(SP)
	// DI already saved
	MOVQ	SI, 40(SP)
	MOVQ	BP, 48(SP)
	MOVQ	R8, 56(SP)
	MOVQ	R9, 64(SP)
	MOVQ	R10, 72(SP)
	MOVQ	R11, 80(SP)
	MOVQ	R12, 88(SP)
	// R13 already saved
	// R14 already saved
	MOVQ	R15, 96(SP)

	// This takes arguments DI and AX
	CALL	runtime·wbBufFlush(SB)

	// Restore everything and resume on the fast-path exit.
	MOVQ	0(SP), DI
	MOVQ	8(SP), AX
	MOVQ	16(SP), BX
	MOVQ	24(SP), CX
	MOVQ	32(SP), DX
	MOVQ	40(SP), SI
	MOVQ	48(SP), BP
	MOVQ	56(SP), R8
	MOVQ	64(SP), R9
	MOVQ	72(SP), R10
	MOVQ	80(SP), R11
	MOVQ	88(SP), R12
	MOVQ	96(SP), R15
	JMP	ret
  1440
// Error string ("call frame too large", 0x14 bytes) reported to the
// debugger by debugCallV1 when the requested frame size exceeds the
// largest debugCall stub (65536 bytes).
DATA	debugCallFrameTooLarge<>+0x00(SB)/8, $"call fra"
DATA	debugCallFrameTooLarge<>+0x08(SB)/8, $"me too l"
DATA	debugCallFrameTooLarge<>+0x10(SB)/4, $"arge"
GLOBL	debugCallFrameTooLarge<>(SB), RODATA, $0x14	// Size duplicated below
  1445
// debugCallV1 is the entry point for debugger-injected function
// calls on running goroutines. It informs the runtime that a
// debug call has been injected and creates a call frame for the
// debugger to fill in.
//
// To inject a function call, a debugger should:
// 1. Check that the goroutine is in state _Grunning and that
//    there are at least 256 bytes free on the stack.
// 2. Push the current PC on the stack (updating SP).
// 3. Write the desired argument frame size at SP-16 (using the SP
//    after step 2).
// 4. Save all machine registers (including flags and XMM registers)
//    so they can be restored later by the debugger.
// 5. Set the PC to debugCallV1 and resume execution.
//
// If the goroutine is in state _Grunnable, then it's not generally
// safe to inject a call because it may return out via other runtime
// operations. Instead, the debugger should unwind the stack to find
// the return to non-runtime code, add a temporary breakpoint there,
// and inject the call once that breakpoint is hit.
//
// If the goroutine is in any other state, it's not safe to inject a call.
//
// This function communicates back to the debugger by setting RAX and
// invoking INT3 to raise a breakpoint signal. See the comments in the
// implementation for the protocol the debugger is expected to
// follow. InjectDebugCall in the runtime tests demonstrates this protocol.
//
// The debugger must ensure that any pointers passed to the function
// obey escape analysis requirements. Specifically, it must not pass
// a stack pointer to an escaping argument. debugCallV1 cannot check
// this invariant.
TEXT runtime·debugCallV1(SB),NOSPLIT,$152-0
	// Save all registers that may contain pointers in GC register
	// map order (see ssa.registersAMD64). This makes it possible
	// to copy the stack while updating pointers currently held in
	// registers, and for the GC to find roots in registers.
	//
	// We can't do anything that might clobber any of these
	// registers before this.
	MOVQ	R15, r15-(14*8+8)(SP)
	MOVQ	R14, r14-(13*8+8)(SP)
	MOVQ	R13, r13-(12*8+8)(SP)
	MOVQ	R12, r12-(11*8+8)(SP)
	MOVQ	R11, r11-(10*8+8)(SP)
	MOVQ	R10, r10-(9*8+8)(SP)
	MOVQ	R9, r9-(8*8+8)(SP)
	MOVQ	R8, r8-(7*8+8)(SP)
	MOVQ	DI, di-(6*8+8)(SP)
	MOVQ	SI, si-(5*8+8)(SP)
	MOVQ	BP, bp-(4*8+8)(SP)
	MOVQ	BX, bx-(3*8+8)(SP)
	MOVQ	DX, dx-(2*8+8)(SP)
	// Save the frame size before we clobber it. Either of the last
	// saves could clobber this depending on whether there's a saved BP.
	MOVQ	frameSize-24(FP), DX	// aka -16(RSP) before prologue
	MOVQ	CX, cx-(1*8+8)(SP)
	MOVQ	AX, ax-(0*8+8)(SP)

	// Save the argument frame size.
	MOVQ	DX, frameSize-128(SP)

	// Perform a safe-point check.
	MOVQ	retpc-8(FP), AX	// Caller's PC
	MOVQ	AX, 0(SP)
	CALL	runtime·debugCallCheck(SB)
	MOVQ	8(SP), AX	// first result: non-zero means unsafe
	TESTQ	AX, AX
	JZ	good
	// The safety check failed. Put the reason string at the top
	// of the stack.
	MOVQ	AX, 0(SP)	// reason string pointer
	MOVQ	16(SP), AX	// reason string length
	MOVQ	AX, 8(SP)
	// Set AX to 8 and invoke INT3. The debugger should get the
	// reason a call can't be injected from the top of the stack
	// and resume execution.
	MOVQ	$8, AX
	BYTE	$0xcc	// INT3
	JMP	restore

good:
	// Registers are saved and it's safe to make a call.
	// Open up a call frame, moving the stack if necessary.
	//
	// Once the frame is allocated, this will set AX to 0 and
	// invoke INT3. The debugger should write the argument
	// frame for the call at SP, push the trapping PC on the
	// stack, set the PC to the function to call, set RCX to point
	// to the closure (if a closure call), and resume execution.
	//
	// If the function returns, this will set AX to 1 and invoke
	// INT3. The debugger can then inspect any return value saved
	// on the stack at SP and resume execution again.
	//
	// If the function panics, this will set AX to 2 and invoke INT3.
	// The interface{} value of the panic will be at SP. The debugger
	// can inspect the panic value and resume execution again.
#define DEBUG_CALL_DISPATCH(NAME,MAXSIZE)	\
	CMPQ	AX, $MAXSIZE;			\
	JA	5(PC);				\
	MOVQ	$NAME(SB), AX;			\
	MOVQ	AX, 0(SP);			\
	CALL	runtime·debugCallWrap(SB);	\
	JMP	restore

	// Dispatch to the smallest stub whose frame fits the request.
	MOVQ	frameSize-128(SP), AX
	DEBUG_CALL_DISPATCH(debugCall32<>, 32)
	DEBUG_CALL_DISPATCH(debugCall64<>, 64)
	DEBUG_CALL_DISPATCH(debugCall128<>, 128)
	DEBUG_CALL_DISPATCH(debugCall256<>, 256)
	DEBUG_CALL_DISPATCH(debugCall512<>, 512)
	DEBUG_CALL_DISPATCH(debugCall1024<>, 1024)
	DEBUG_CALL_DISPATCH(debugCall2048<>, 2048)
	DEBUG_CALL_DISPATCH(debugCall4096<>, 4096)
	DEBUG_CALL_DISPATCH(debugCall8192<>, 8192)
	DEBUG_CALL_DISPATCH(debugCall16384<>, 16384)
	DEBUG_CALL_DISPATCH(debugCall32768<>, 32768)
	DEBUG_CALL_DISPATCH(debugCall65536<>, 65536)
	// The frame size is too large. Report the error.
	MOVQ	$debugCallFrameTooLarge<>(SB), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0x14, 8(SP)	// length of the error string above
	MOVQ	$8, AX		// protocol: 8 = error string at top of stack
	BYTE	$0xcc		// INT3
	JMP	restore

restore:
	// Calls and failures resume here.
	//
	// Set AX to 16 and invoke INT3. The debugger should restore
	// all registers except RIP and RSP and resume execution.
	MOVQ	$16, AX
	BYTE	$0xcc	// INT3
	// We must not modify flags after this point.

	// Restore pointer-containing registers, which may have been
	// modified from the debugger's copy by stack copying.
	MOVQ	ax-(0*8+8)(SP), AX
	MOVQ	cx-(1*8+8)(SP), CX
	MOVQ	dx-(2*8+8)(SP), DX
	MOVQ	bx-(3*8+8)(SP), BX
	MOVQ	bp-(4*8+8)(SP), BP
	MOVQ	si-(5*8+8)(SP), SI
	MOVQ	di-(6*8+8)(SP), DI
	MOVQ	r8-(7*8+8)(SP), R8
	MOVQ	r9-(8*8+8)(SP), R9
	MOVQ	r10-(9*8+8)(SP), R10
	MOVQ	r11-(10*8+8)(SP), R11
	MOVQ	r12-(11*8+8)(SP), R12
	MOVQ	r13-(12*8+8)(SP), R13
	MOVQ	r14-(13*8+8)(SP), R14
	MOVQ	r15-(14*8+8)(SP), R15

	RET
  1601
// DEBUG_CALL_FN defines a debug-call stub with a MAXSIZE-byte frame.
// Protocol (see debugCallV1 above): AX=0 + INT3 tells the debugger to
// write the argument frame at SP and redirect the PC to the target
// function; when the call returns, AX=1 + INT3 lets the debugger read
// the results from SP before the stub returns.
#define DEBUG_CALL_FN(NAME,MAXSIZE)		\
TEXT NAME(SB),WRAPPER,$MAXSIZE-0;		\
	NO_LOCAL_POINTERS;			\
	MOVQ	$0, AX;				\
	BYTE	$0xcc;				\
	MOVQ	$1, AX;				\
	BYTE	$0xcc;				\
	RET
DEBUG_CALL_FN(debugCall32<>, 32)
DEBUG_CALL_FN(debugCall64<>, 64)
DEBUG_CALL_FN(debugCall128<>, 128)
DEBUG_CALL_FN(debugCall256<>, 256)
DEBUG_CALL_FN(debugCall512<>, 512)
DEBUG_CALL_FN(debugCall1024<>, 1024)
DEBUG_CALL_FN(debugCall2048<>, 2048)
DEBUG_CALL_FN(debugCall4096<>, 4096)
DEBUG_CALL_FN(debugCall8192<>, 8192)
DEBUG_CALL_FN(debugCall16384<>, 16384)
DEBUG_CALL_FN(debugCall32768<>, 32768)
DEBUG_CALL_FN(debugCall65536<>, 65536)
  1622
// func debugCallPanicked(val interface{})
// Reports a panic from a debugger-injected call back to the debugger.
TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16
	// Copy the panic value to the top of stack.
	MOVQ	val_type+0(FP), AX	// interface type word
	MOVQ	AX, 0(SP)
	MOVQ	val_data+8(FP), AX	// interface data word
	MOVQ	AX, 8(SP)
	MOVQ	$2, AX	// protocol: 2 = "call panicked"; panic value is at SP
	BYTE	$0xcc	// INT3: let the debugger inspect the panic value
	RET

View as plain text