Source file src/cmd/compile/internal/ppc64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ppc64
     6  
     7  import (
     8  	"cmd/compile/internal/gc"
     9  	"cmd/compile/internal/logopt"
    10  	"cmd/compile/internal/ssa"
    11  	"cmd/compile/internal/types"
    12  	"cmd/internal/obj"
    13  	"cmd/internal/obj/ppc64"
    14  	"cmd/internal/objabi"
    15  	"math"
    16  	"strings"
    17  )
    18  
    19  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
    20  func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
    21  	//	flive := b.FlagsLiveAtEnd
    22  	//	if b.Control != nil && b.Control.Type.IsFlags() {
    23  	//		flive = true
    24  	//	}
    25  	//	for i := len(b.Values) - 1; i >= 0; i-- {
    26  	//		v := b.Values[i]
    27  	//		if flive && (v.Op == ssa.OpPPC64MOVDconst) {
    28  	//			// The "mark" is any non-nil Aux value.
    29  	//			v.Aux = v
    30  	//		}
    31  	//		if v.Type.IsFlags() {
    32  	//			flive = false
    33  	//		}
    34  	//		for _, a := range v.Args {
    35  	//			if a.Type.IsFlags() {
    36  	//				flive = true
    37  	//			}
    38  	//		}
    39  	//	}
    40  }
    41  
    42  // loadByType returns the load instruction of the given type.
    43  func loadByType(t *types.Type) obj.As {
    44  	if t.IsFloat() {
    45  		switch t.Size() {
    46  		case 4:
    47  			return ppc64.AFMOVS
    48  		case 8:
    49  			return ppc64.AFMOVD
    50  		}
    51  	} else {
    52  		switch t.Size() {
    53  		case 1:
    54  			if t.IsSigned() {
    55  				return ppc64.AMOVB
    56  			} else {
    57  				return ppc64.AMOVBZ
    58  			}
    59  		case 2:
    60  			if t.IsSigned() {
    61  				return ppc64.AMOVH
    62  			} else {
    63  				return ppc64.AMOVHZ
    64  			}
    65  		case 4:
    66  			if t.IsSigned() {
    67  				return ppc64.AMOVW
    68  			} else {
    69  				return ppc64.AMOVWZ
    70  			}
    71  		case 8:
    72  			return ppc64.AMOVD
    73  		}
    74  	}
    75  	panic("bad load type")
    76  }
    77  
    78  // storeByType returns the store instruction of the given type.
    79  func storeByType(t *types.Type) obj.As {
    80  	if t.IsFloat() {
    81  		switch t.Size() {
    82  		case 4:
    83  			return ppc64.AFMOVS
    84  		case 8:
    85  			return ppc64.AFMOVD
    86  		}
    87  	} else {
    88  		switch t.Size() {
    89  		case 1:
    90  			return ppc64.AMOVB
    91  		case 2:
    92  			return ppc64.AMOVH
    93  		case 4:
    94  			return ppc64.AMOVW
    95  		case 8:
    96  			return ppc64.AMOVD
    97  		}
    98  	}
    99  	panic("bad store type")
   100  }
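        // For example, a load of an int16 selects AMOVH (sign extending)
        // while a uint16 selects AMOVHZ (zero extending); stores need no
        // signed/unsigned distinction because a truncating store behaves
        // the same either way.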
   101  
   102  func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
   103  	switch v.Op {
   104  	case ssa.OpCopy:
   105  		t := v.Type
   106  		if t.IsMemory() {
   107  			return
   108  		}
   109  		x := v.Args[0].Reg()
   110  		y := v.Reg()
   111  		if x != y {
   112  			rt := obj.TYPE_REG
   113  			op := ppc64.AMOVD
   114  
   115  			if t.IsFloat() {
   116  				op = ppc64.AFMOVD
   117  			}
   118  			p := s.Prog(op)
   119  			p.From.Type = rt
   120  			p.From.Reg = x
   121  			p.To.Type = rt
   122  			p.To.Reg = y
   123  		}
   124  
   125  	case ssa.OpPPC64LoweredMuluhilo:
   126  		// MULHDU	Rarg1, Rarg0, Reg0
   127  		// MULLD	Rarg1, Rarg0, Reg1
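        		// Together these give the full 128-bit product arg0*arg1:
        		// Reg0 gets the high 64 bits (MULHDU) and Reg1 the low 64
        		// bits (MULLD); this is how bits.Mul64 is lowered, for example.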
   128  		r0 := v.Args[0].Reg()
   129  		r1 := v.Args[1].Reg()
   130  		p := s.Prog(ppc64.AMULHDU)
   131  		p.From.Type = obj.TYPE_REG
   132  		p.From.Reg = r1
   133  		p.Reg = r0
   134  		p.To.Type = obj.TYPE_REG
   135  		p.To.Reg = v.Reg0()
   136  		p1 := s.Prog(ppc64.AMULLD)
   137  		p1.From.Type = obj.TYPE_REG
   138  		p1.From.Reg = r1
   139  		p1.Reg = r0
   140  		p1.To.Type = obj.TYPE_REG
   141  		p1.To.Reg = v.Reg1()
   142  
   143  	case ssa.OpPPC64LoweredAdd64Carry:
   144  		// ADDC		Rarg2, -1, Rtmp
   145  		// ADDE		Rarg1, Rarg0, Reg0
   146  		// ADDZE	Rzero, Reg1
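        		// ADDC $-1, Rarg2 adds all ones to the incoming carry value,
        		// which sets the CA bit iff Rarg2 is nonzero; ADDE then
        		// computes arg0+arg1+CA and ADDZE captures the carry-out
        		// in Reg1.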
   147  		r0 := v.Args[0].Reg()
   148  		r1 := v.Args[1].Reg()
   149  		r2 := v.Args[2].Reg()
   150  		p := s.Prog(ppc64.AADDC)
   151  		p.From.Type = obj.TYPE_CONST
   152  		p.From.Offset = -1
   153  		p.Reg = r2
   154  		p.To.Type = obj.TYPE_REG
   155  		p.To.Reg = ppc64.REGTMP
   156  		p1 := s.Prog(ppc64.AADDE)
   157  		p1.From.Type = obj.TYPE_REG
   158  		p1.From.Reg = r1
   159  		p1.Reg = r0
   160  		p1.To.Type = obj.TYPE_REG
   161  		p1.To.Reg = v.Reg0()
   162  		p2 := s.Prog(ppc64.AADDZE)
   163  		p2.From.Type = obj.TYPE_REG
   164  		p2.From.Reg = ppc64.REGZERO
   165  		p2.To.Type = obj.TYPE_REG
   166  		p2.To.Reg = v.Reg1()
   167  
   168  	case ssa.OpPPC64LoweredAtomicAnd8,
   169  		ssa.OpPPC64LoweredAtomicAnd32,
   170  		ssa.OpPPC64LoweredAtomicOr8,
   171  		ssa.OpPPC64LoweredAtomicOr32:
   172  		// LWSYNC
   173  		// LBAR/LWAR	(Rarg0), Rtmp
   174  		// AND/OR	Rarg1, Rtmp
   175  		// STBCCC/STWCCC Rtmp, (Rarg0)
   176  		// BNE		-3(PC)
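        		// This is a load-reserve/store-conditional loop: the STBCCC
        		// or STWCCC fails, and the BNE retries, if the reservation
        		// taken by LBAR/LWAR was lost, so the read-modify-write is
        		// atomic.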
   177  		ld := ppc64.ALBAR
   178  		st := ppc64.ASTBCCC
   179  		if v.Op == ssa.OpPPC64LoweredAtomicAnd32 || v.Op == ssa.OpPPC64LoweredAtomicOr32 {
   180  			ld = ppc64.ALWAR
   181  			st = ppc64.ASTWCCC
   182  		}
   183  		r0 := v.Args[0].Reg()
   184  		r1 := v.Args[1].Reg()
   185  		// LWSYNC - Assuming shared data not write-through-required nor
   186  		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
   187  		plwsync := s.Prog(ppc64.ALWSYNC)
   188  		plwsync.To.Type = obj.TYPE_NONE
   189  		// LBAR or LWAR
   190  		p := s.Prog(ld)
   191  		p.From.Type = obj.TYPE_MEM
   192  		p.From.Reg = r0
   193  		p.To.Type = obj.TYPE_REG
   194  		p.To.Reg = ppc64.REGTMP
   195  		// AND/OR reg1,out
   196  		p1 := s.Prog(v.Op.Asm())
   197  		p1.From.Type = obj.TYPE_REG
   198  		p1.From.Reg = r1
   199  		p1.To.Type = obj.TYPE_REG
   200  		p1.To.Reg = ppc64.REGTMP
   201  		// STBCCC or STWCCC
   202  		p2 := s.Prog(st)
   203  		p2.From.Type = obj.TYPE_REG
   204  		p2.From.Reg = ppc64.REGTMP
   205  		p2.To.Type = obj.TYPE_MEM
   206  		p2.To.Reg = r0
   207  		p2.RegTo2 = ppc64.REGTMP
   208  		// BNE retry
   209  		p3 := s.Prog(ppc64.ABNE)
   210  		p3.To.Type = obj.TYPE_BRANCH
   211  		gc.Patch(p3, p)
   212  
   213  	case ssa.OpPPC64LoweredAtomicAdd32,
   214  		ssa.OpPPC64LoweredAtomicAdd64:
   215  		// LWSYNC
   216  		// LDAR/LWAR    (Rarg0), Rout
   217  		// ADD		Rarg1, Rout
   218  		// STDCCC/STWCCC Rout, (Rarg0)
   219  		// BNE         -3(PC)
   220  		// MOVWZ	Rout,Rout (if Add32)
   221  		ld := ppc64.ALDAR
   222  		st := ppc64.ASTDCCC
   223  		if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
   224  			ld = ppc64.ALWAR
   225  			st = ppc64.ASTWCCC
   226  		}
   227  		r0 := v.Args[0].Reg()
   228  		r1 := v.Args[1].Reg()
   229  		out := v.Reg0()
   230  		// LWSYNC - Assuming shared data not write-through-required nor
   231  		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
   232  		plwsync := s.Prog(ppc64.ALWSYNC)
   233  		plwsync.To.Type = obj.TYPE_NONE
   234  		// LDAR or LWAR
   235  		p := s.Prog(ld)
   236  		p.From.Type = obj.TYPE_MEM
   237  		p.From.Reg = r0
   238  		p.To.Type = obj.TYPE_REG
   239  		p.To.Reg = out
   240  		// ADD reg1,out
   241  		p1 := s.Prog(ppc64.AADD)
   242  		p1.From.Type = obj.TYPE_REG
   243  		p1.From.Reg = r1
   244  		p1.To.Reg = out
   245  		p1.To.Type = obj.TYPE_REG
   246  		// STDCCC or STWCCC
   247  		p3 := s.Prog(st)
   248  		p3.From.Type = obj.TYPE_REG
   249  		p3.From.Reg = out
   250  		p3.To.Type = obj.TYPE_MEM
   251  		p3.To.Reg = r0
   252  		// BNE retry
   253  		p4 := s.Prog(ppc64.ABNE)
   254  		p4.To.Type = obj.TYPE_BRANCH
   255  		gc.Patch(p4, p)
   256  
   257  		// Ensure a 32 bit result
   258  		if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
   259  			p5 := s.Prog(ppc64.AMOVWZ)
   260  			p5.To.Type = obj.TYPE_REG
   261  			p5.To.Reg = out
   262  			p5.From.Type = obj.TYPE_REG
   263  			p5.From.Reg = out
   264  		}
   265  
   266  	case ssa.OpPPC64LoweredAtomicExchange32,
   267  		ssa.OpPPC64LoweredAtomicExchange64:
   268  		// LWSYNC
   269  		// LDAR/LWAR    (Rarg0), Rout
   270  		// STDCCC/STWCCC Rout, (Rarg0)
   271  		// BNE         -2(PC)
   272  		// ISYNC
   273  		ld := ppc64.ALDAR
   274  		st := ppc64.ASTDCCC
   275  		if v.Op == ssa.OpPPC64LoweredAtomicExchange32 {
   276  			ld = ppc64.ALWAR
   277  			st = ppc64.ASTWCCC
   278  		}
   279  		r0 := v.Args[0].Reg()
   280  		r1 := v.Args[1].Reg()
   281  		out := v.Reg0()
   282  		// LWSYNC - Assuming shared data not write-through-required nor
   283  		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
   284  		plwsync := s.Prog(ppc64.ALWSYNC)
   285  		plwsync.To.Type = obj.TYPE_NONE
   286  		// LDAR or LWAR
   287  		p := s.Prog(ld)
   288  		p.From.Type = obj.TYPE_MEM
   289  		p.From.Reg = r0
   290  		p.To.Type = obj.TYPE_REG
   291  		p.To.Reg = out
   292  		// STDCCC or STWCCC
   293  		p1 := s.Prog(st)
   294  		p1.From.Type = obj.TYPE_REG
   295  		p1.From.Reg = r1
   296  		p1.To.Type = obj.TYPE_MEM
   297  		p1.To.Reg = r0
   298  		// BNE retry
   299  		p2 := s.Prog(ppc64.ABNE)
   300  		p2.To.Type = obj.TYPE_BRANCH
   301  		gc.Patch(p2, p)
   302  		// ISYNC
   303  		pisync := s.Prog(ppc64.AISYNC)
   304  		pisync.To.Type = obj.TYPE_NONE
   305  
   306  	case ssa.OpPPC64LoweredAtomicLoad8,
   307  		ssa.OpPPC64LoweredAtomicLoad32,
   308  		ssa.OpPPC64LoweredAtomicLoad64,
   309  		ssa.OpPPC64LoweredAtomicLoadPtr:
   310  		// SYNC
   311  		// MOVB/MOVD/MOVW (Rarg0), Rout
   312  		// CMP Rout,Rout
   313  		// BNE 1(PC)
   314  		// ISYNC
   315  		ld := ppc64.AMOVD
   316  		cmp := ppc64.ACMP
   317  		switch v.Op {
   318  		case ssa.OpPPC64LoweredAtomicLoad8:
   319  			ld = ppc64.AMOVBZ
   320  		case ssa.OpPPC64LoweredAtomicLoad32:
   321  			ld = ppc64.AMOVWZ
   322  			cmp = ppc64.ACMPW
   323  		}
   324  		arg0 := v.Args[0].Reg()
   325  		out := v.Reg0()
   326  		// SYNC when AuxInt == 1; otherwise, load-acquire
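        		// (Sequentially consistent loads are lowered with AuxInt == 1
        		// and need the full SYNC; acquire-only loads such as LoadAcq
        		// use AuxInt == 0 and rely on the CMP/BNE/ISYNC tail alone.)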
   327  		if v.AuxInt == 1 {
   328  			psync := s.Prog(ppc64.ASYNC)
   329  			psync.To.Type = obj.TYPE_NONE
   330  		}
   331  		// Load
   332  		p := s.Prog(ld)
   333  		p.From.Type = obj.TYPE_MEM
   334  		p.From.Reg = arg0
   335  		p.To.Type = obj.TYPE_REG
   336  		p.To.Reg = out
   337  		// CMP
   338  		p1 := s.Prog(cmp)
   339  		p1.From.Type = obj.TYPE_REG
   340  		p1.From.Reg = out
   341  		p1.To.Type = obj.TYPE_REG
   342  		p1.To.Reg = out
   343  		// BNE
   344  		p2 := s.Prog(ppc64.ABNE)
   345  		p2.To.Type = obj.TYPE_BRANCH
   346  		// ISYNC
   347  		pisync := s.Prog(ppc64.AISYNC)
   348  		pisync.To.Type = obj.TYPE_NONE
   349  		gc.Patch(p2, pisync)
   350  
   351  	case ssa.OpPPC64LoweredAtomicStore8,
   352  		ssa.OpPPC64LoweredAtomicStore32,
   353  		ssa.OpPPC64LoweredAtomicStore64:
   354  		// SYNC or LWSYNC
   355  		// MOVB/MOVW/MOVD arg1,(arg0)
   356  		st := ppc64.AMOVD
   357  		switch v.Op {
   358  		case ssa.OpPPC64LoweredAtomicStore8:
   359  			st = ppc64.AMOVB
   360  		case ssa.OpPPC64LoweredAtomicStore32:
   361  			st = ppc64.AMOVW
   362  		}
   363  		arg0 := v.Args[0].Reg()
   364  		arg1 := v.Args[1].Reg()
   365  		// If AuxInt == 0, LWSYNC (Store-Release), else SYNC
   366  		// (a full SYNC gives sequential consistency)
   367  		syncOp := ppc64.ASYNC
   368  		if v.AuxInt == 0 {
   369  			syncOp = ppc64.ALWSYNC
   370  		}
   371  		psync := s.Prog(syncOp)
   372  		psync.To.Type = obj.TYPE_NONE
   373  		// Store
   374  		p := s.Prog(st)
   375  		p.To.Type = obj.TYPE_MEM
   376  		p.To.Reg = arg0
   377  		p.From.Type = obj.TYPE_REG
   378  		p.From.Reg = arg1
   379  
   380  	case ssa.OpPPC64LoweredAtomicCas64,
   381  		ssa.OpPPC64LoweredAtomicCas32:
   382  		// LWSYNC
   383  		// loop:
   384  		// LDAR        (Rarg0), MutexHint, Rtmp
   385  		// CMP         Rarg1, Rtmp
   386  		// BNE         fail
   387  		// STDCCC      Rarg2, (Rarg0)
   388  		// BNE         loop
   389  		// LWSYNC      // Only for sequential consistency; not required in CasRel.
   390  		// MOVD        $1, Rout
   391  		// BR          end
   392  		// fail:
   393  		// MOVD        $0, Rout
   394  		// end:
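        		// (AuxInt == 1 requests a sequentially consistent CAS; a
        		// release CAS such as runtime/internal/atomic.CasRel uses
        		// AuxInt == 0, which sets the EH mutex hint on the load and
        		// drops the second LWSYNC below.)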
   395  		ld := ppc64.ALDAR
   396  		st := ppc64.ASTDCCC
   397  		cmp := ppc64.ACMP
   398  		if v.Op == ssa.OpPPC64LoweredAtomicCas32 {
   399  			ld = ppc64.ALWAR
   400  			st = ppc64.ASTWCCC
   401  			cmp = ppc64.ACMPW
   402  		}
   403  		r0 := v.Args[0].Reg()
   404  		r1 := v.Args[1].Reg()
   405  		r2 := v.Args[2].Reg()
   406  		out := v.Reg0()
   407  		// LWSYNC - Assuming shared data not write-through-required nor
   408  		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
   409  		plwsync1 := s.Prog(ppc64.ALWSYNC)
   410  		plwsync1.To.Type = obj.TYPE_NONE
   411  		// LDAR or LWAR
   412  		p := s.Prog(ld)
   413  		p.From.Type = obj.TYPE_MEM
   414  		p.From.Reg = r0
   415  		p.To.Type = obj.TYPE_REG
   416  		p.To.Reg = ppc64.REGTMP
   417  		// If it is a Compare-and-Swap-Release operation, set the EH field with
   418  		// the release hint.
   419  		if v.AuxInt == 0 {
   420  			p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: 0})
   421  		}
   422  		// CMP reg1,reg2
   423  		p1 := s.Prog(cmp)
   424  		p1.From.Type = obj.TYPE_REG
   425  		p1.From.Reg = r1
   426  		p1.To.Reg = ppc64.REGTMP
   427  		p1.To.Type = obj.TYPE_REG
   428  		// BNE cas_fail
   429  		p2 := s.Prog(ppc64.ABNE)
   430  		p2.To.Type = obj.TYPE_BRANCH
   431  		// STDCCC or STWCCC
   432  		p3 := s.Prog(st)
   433  		p3.From.Type = obj.TYPE_REG
   434  		p3.From.Reg = r2
   435  		p3.To.Type = obj.TYPE_MEM
   436  		p3.To.Reg = r0
   437  		// BNE retry
   438  		p4 := s.Prog(ppc64.ABNE)
   439  		p4.To.Type = obj.TYPE_BRANCH
   440  		gc.Patch(p4, p)
   441  		// LWSYNC - Assuming shared data not write-through-required nor
   442  		// caching-inhibited. See Appendix B.2.1.1 in the ISA 2.07b.
   443  		// If the operation is a CAS-Release, then synchronization is not necessary.
   444  		if v.AuxInt != 0 {
   445  			plwsync2 := s.Prog(ppc64.ALWSYNC)
   446  			plwsync2.To.Type = obj.TYPE_NONE
   447  		}
   448  		// return true
   449  		p5 := s.Prog(ppc64.AMOVD)
   450  		p5.From.Type = obj.TYPE_CONST
   451  		p5.From.Offset = 1
   452  		p5.To.Type = obj.TYPE_REG
   453  		p5.To.Reg = out
   454  		// BR done
   455  		p6 := s.Prog(obj.AJMP)
   456  		p6.To.Type = obj.TYPE_BRANCH
   457  		// return false
   458  		p7 := s.Prog(ppc64.AMOVD)
   459  		p7.From.Type = obj.TYPE_CONST
   460  		p7.From.Offset = 0
   461  		p7.To.Type = obj.TYPE_REG
   462  		p7.To.Reg = out
   463  		gc.Patch(p2, p7)
   464  		// done (label)
   465  		p8 := s.Prog(obj.ANOP)
   466  		gc.Patch(p6, p8)
   467  
   468  	case ssa.OpPPC64LoweredGetClosurePtr:
   469  		// Closure pointer is R11 (already)
   470  		gc.CheckLoweredGetClosurePtr(v)
   471  
   472  	case ssa.OpPPC64LoweredGetCallerSP:
   473  		// caller's SP is FixedFrameSize below the address of the first arg
   474  		p := s.Prog(ppc64.AMOVD)
   475  		p.From.Type = obj.TYPE_ADDR
   476  		p.From.Offset = -gc.Ctxt.FixedFrameSize()
   477  		p.From.Name = obj.NAME_PARAM
   478  		p.To.Type = obj.TYPE_REG
   479  		p.To.Reg = v.Reg()
   480  
   481  	case ssa.OpPPC64LoweredGetCallerPC:
   482  		p := s.Prog(obj.AGETCALLERPC)
   483  		p.To.Type = obj.TYPE_REG
   484  		p.To.Reg = v.Reg()
   485  
   486  	case ssa.OpPPC64LoweredRound32F, ssa.OpPPC64LoweredRound64F:
   487  		// input is already rounded
   488  
   489  	case ssa.OpLoadReg:
   490  		loadOp := loadByType(v.Type)
   491  		p := s.Prog(loadOp)
   492  		gc.AddrAuto(&p.From, v.Args[0])
   493  		p.To.Type = obj.TYPE_REG
   494  		p.To.Reg = v.Reg()
   495  
   496  	case ssa.OpStoreReg:
   497  		storeOp := storeByType(v.Type)
   498  		p := s.Prog(storeOp)
   499  		p.From.Type = obj.TYPE_REG
   500  		p.From.Reg = v.Args[0].Reg()
   501  		gc.AddrAuto(&p.To, v)
   502  
   503  	case ssa.OpPPC64DIVD:
   504  		// For now,
   505  		//
   506  		// cmp arg1, -1
   507  		// beq ahead
   508  		// v = arg0 / arg1
   509  		// b over
   510  		// ahead: v = - arg0
   511  		// over: nop
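        		// Dividing by -1 is special-cased so the result is always
        		// -arg0; in particular MinInt64 / -1 wraps to MinInt64 via
        		// NEG, as the Go spec requires, rather than depending on the
        		// undefined DIVD overflow result.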
   512  		r := v.Reg()
   513  		r0 := v.Args[0].Reg()
   514  		r1 := v.Args[1].Reg()
   515  
   516  		p := s.Prog(ppc64.ACMP)
   517  		p.From.Type = obj.TYPE_REG
   518  		p.From.Reg = r1
   519  		p.To.Type = obj.TYPE_CONST
   520  		p.To.Offset = -1
   521  
   522  		pbahead := s.Prog(ppc64.ABEQ)
   523  		pbahead.To.Type = obj.TYPE_BRANCH
   524  
   525  		p = s.Prog(v.Op.Asm())
   526  		p.From.Type = obj.TYPE_REG
   527  		p.From.Reg = r1
   528  		p.Reg = r0
   529  		p.To.Type = obj.TYPE_REG
   530  		p.To.Reg = r
   531  
   532  		pbover := s.Prog(obj.AJMP)
   533  		pbover.To.Type = obj.TYPE_BRANCH
   534  
   535  		p = s.Prog(ppc64.ANEG)
   536  		p.To.Type = obj.TYPE_REG
   537  		p.To.Reg = r
   538  		p.From.Type = obj.TYPE_REG
   539  		p.From.Reg = r0
   540  		gc.Patch(pbahead, p)
   541  
   542  		p = s.Prog(obj.ANOP)
   543  		gc.Patch(pbover, p)
   544  
   545  	case ssa.OpPPC64DIVW:
   546  		// word-width version of above
   547  		r := v.Reg()
   548  		r0 := v.Args[0].Reg()
   549  		r1 := v.Args[1].Reg()
   550  
   551  		p := s.Prog(ppc64.ACMPW)
   552  		p.From.Type = obj.TYPE_REG
   553  		p.From.Reg = r1
   554  		p.To.Type = obj.TYPE_CONST
   555  		p.To.Offset = -1
   556  
   557  		pbahead := s.Prog(ppc64.ABEQ)
   558  		pbahead.To.Type = obj.TYPE_BRANCH
   559  
   560  		p = s.Prog(v.Op.Asm())
   561  		p.From.Type = obj.TYPE_REG
   562  		p.From.Reg = r1
   563  		p.Reg = r0
   564  		p.To.Type = obj.TYPE_REG
   565  		p.To.Reg = r
   566  
   567  		pbover := s.Prog(obj.AJMP)
   568  		pbover.To.Type = obj.TYPE_BRANCH
   569  
   570  		p = s.Prog(ppc64.ANEG)
   571  		p.To.Type = obj.TYPE_REG
   572  		p.To.Reg = r
   573  		p.From.Type = obj.TYPE_REG
   574  		p.From.Reg = r0
   575  		gc.Patch(pbahead, p)
   576  
   577  		p = s.Prog(obj.ANOP)
   578  		gc.Patch(pbover, p)
   579  
   580  	case ssa.OpPPC64CLRLSLWI:
   581  		r := v.Reg()
   582  		r1 := v.Args[0].Reg()
   583  		shifts := v.AuxInt
   584  		p := s.Prog(v.Op.Asm())
   585  		// clrlslwi ra,rs,mb,sh will become rlwinm ra,rs,sh,mb-sh,31-sh as described in ISA
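        		// e.g. with mb=16 and sh=3 this assembles as
        		// rlwinm ra,rs,3,13,28: clear the upper 16 bits, then
        		// shift left by 3.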
   586  		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}
   587  		p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)})
   588  		p.Reg = r1
   589  		p.To.Type = obj.TYPE_REG
   590  		p.To.Reg = r
   591  
   592  	case ssa.OpPPC64CLRLSLDI:
   593  		r := v.Reg()
   594  		r1 := v.Args[0].Reg()
   595  		shifts := v.AuxInt
   596  		p := s.Prog(v.Op.Asm())
   597  		// clrlsldi ra,rs,mb,sh will become rldic ra,rs,sh,mb-sh
   598  		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}
   599  		p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)})
   600  		p.Reg = r1
   601  		p.To.Type = obj.TYPE_REG
   602  		p.To.Reg = r
   603  
   604  		// Mask has been set as sh
   605  	case ssa.OpPPC64RLDICL:
   606  		r := v.Reg()
   607  		r1 := v.Args[0].Reg()
   608  		shifts := v.AuxInt
   609  		p := s.Prog(v.Op.Asm())
   610  		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)}
   611  		p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)})
   612  		p.Reg = r1
   613  		p.To.Type = obj.TYPE_REG
   614  		p.To.Reg = r
   615  
   616  	case ssa.OpPPC64ADD, ssa.OpPPC64FADD, ssa.OpPPC64FADDS, ssa.OpPPC64SUB, ssa.OpPPC64FSUB, ssa.OpPPC64FSUBS,
   617  		ssa.OpPPC64MULLD, ssa.OpPPC64MULLW, ssa.OpPPC64DIVDU, ssa.OpPPC64DIVWU,
   618  		ssa.OpPPC64SRAD, ssa.OpPPC64SRAW, ssa.OpPPC64SRD, ssa.OpPPC64SRW, ssa.OpPPC64SLD, ssa.OpPPC64SLW,
   619  		ssa.OpPPC64ROTL, ssa.OpPPC64ROTLW,
   620  		ssa.OpPPC64MULHD, ssa.OpPPC64MULHW, ssa.OpPPC64MULHDU, ssa.OpPPC64MULHWU,
   621  		ssa.OpPPC64FMUL, ssa.OpPPC64FMULS, ssa.OpPPC64FDIV, ssa.OpPPC64FDIVS, ssa.OpPPC64FCPSGN,
   622  		ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64ANDN, ssa.OpPPC64ORN, ssa.OpPPC64NOR, ssa.OpPPC64XOR, ssa.OpPPC64EQV,
   623  		ssa.OpPPC64MODUD, ssa.OpPPC64MODSD, ssa.OpPPC64MODUW, ssa.OpPPC64MODSW:
   624  		r := v.Reg()
   625  		r1 := v.Args[0].Reg()
   626  		r2 := v.Args[1].Reg()
   627  		p := s.Prog(v.Op.Asm())
   628  		p.From.Type = obj.TYPE_REG
   629  		p.From.Reg = r2
   630  		p.Reg = r1
   631  		p.To.Type = obj.TYPE_REG
   632  		p.To.Reg = r
   633  
   634  	case ssa.OpPPC64ANDCC, ssa.OpPPC64ORCC, ssa.OpPPC64XORCC:
   635  		r1 := v.Args[0].Reg()
   636  		r2 := v.Args[1].Reg()
   637  		p := s.Prog(v.Op.Asm())
   638  		p.From.Type = obj.TYPE_REG
   639  		p.From.Reg = r2
   640  		p.Reg = r1
   641  		p.To.Type = obj.TYPE_REG
   642  		p.To.Reg = ppc64.REGTMP // result is not needed
   643  
   644  	case ssa.OpPPC64ROTLconst, ssa.OpPPC64ROTLWconst:
   645  		p := s.Prog(v.Op.Asm())
   646  		p.From.Type = obj.TYPE_CONST
   647  		p.From.Offset = v.AuxInt
   648  		p.Reg = v.Args[0].Reg()
   649  		p.To.Type = obj.TYPE_REG
   650  		p.To.Reg = v.Reg()
   651  
   652  		// Auxint holds encoded rotate + mask
   653  	case ssa.OpPPC64RLWINM, ssa.OpPPC64RLWMI:
   654  		rot, _, _, mask := ssa.DecodePPC64RotateMask(v.AuxInt)
   655  		p := s.Prog(v.Op.Asm())
   656  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   657  		p.Reg = v.Args[0].Reg()
   658  		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(rot)}
   659  		p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: int64(mask)})
   660  
   661  		// Auxint holds mask
   662  	case ssa.OpPPC64RLWNM:
   663  		_, _, _, mask := ssa.DecodePPC64RotateMask(v.AuxInt)
   664  		p := s.Prog(v.Op.Asm())
   665  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   666  		p.Reg = v.Args[0].Reg()
   667  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()}
   668  		p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: int64(mask)})
   669  
   670  	case ssa.OpPPC64MADDLD:
   671  		r := v.Reg()
   672  		r1 := v.Args[0].Reg()
   673  		r2 := v.Args[1].Reg()
   674  		r3 := v.Args[2].Reg()
   675  		// r = r1*r2 + r3
   676  		p := s.Prog(v.Op.Asm())
   677  		p.From.Type = obj.TYPE_REG
   678  		p.From.Reg = r1
   679  		p.Reg = r2
   680  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: r3})
   681  		p.To.Type = obj.TYPE_REG
   682  		p.To.Reg = r
   683  
   684  	case ssa.OpPPC64FMADD, ssa.OpPPC64FMADDS, ssa.OpPPC64FMSUB, ssa.OpPPC64FMSUBS:
   685  		r := v.Reg()
   686  		r1 := v.Args[0].Reg()
   687  		r2 := v.Args[1].Reg()
   688  		r3 := v.Args[2].Reg()
   689  		// r = r1*r2 ± r3
   690  		p := s.Prog(v.Op.Asm())
   691  		p.From.Type = obj.TYPE_REG
   692  		p.From.Reg = r1
   693  		p.Reg = r3
   694  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: r2})
   695  		p.To.Type = obj.TYPE_REG
   696  		p.To.Reg = r
   697  
   698  	case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FFLOOR, ssa.OpPPC64FTRUNC, ssa.OpPPC64FCEIL,
   699  		ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FCFIDS, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW,
   700  		ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB, ssa.OpPPC64MFVSRD, ssa.OpPPC64MTVSRD, ssa.OpPPC64FABS, ssa.OpPPC64FNABS,
   701  		ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD:
   702  		r := v.Reg()
   703  		p := s.Prog(v.Op.Asm())
   704  		p.To.Type = obj.TYPE_REG
   705  		p.To.Reg = r
   706  		p.From.Type = obj.TYPE_REG
   707  		p.From.Reg = v.Args[0].Reg()
   708  
   709  	case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst,
   710  		ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst,
   711  		ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst, ssa.OpPPC64MULLWconst, ssa.OpPPC64MULLDconst:
   712  		p := s.Prog(v.Op.Asm())
   713  		p.Reg = v.Args[0].Reg()
   714  		p.From.Type = obj.TYPE_CONST
   715  		p.From.Offset = v.AuxInt
   716  		p.To.Type = obj.TYPE_REG
   717  		p.To.Reg = v.Reg()
   718  
   719  	case ssa.OpPPC64SUBFCconst:
   720  		p := s.Prog(v.Op.Asm())
   721  		p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: v.AuxInt})
   722  		p.From.Type = obj.TYPE_REG
   723  		p.From.Reg = v.Args[0].Reg()
   724  		p.To.Type = obj.TYPE_REG
   725  		p.To.Reg = v.Reg()
   726  
   727  	case ssa.OpPPC64ANDCCconst:
   728  		p := s.Prog(v.Op.Asm())
   729  		p.Reg = v.Args[0].Reg()
   730  		p.From.Type = obj.TYPE_CONST
   731  		p.From.Offset = v.AuxInt
   732  		p.To.Type = obj.TYPE_REG
   733  		p.To.Reg = ppc64.REGTMP // discard result
   734  
   735  	case ssa.OpPPC64MOVDaddr:
   736  		switch v.Aux.(type) {
   737  		default:
   738  			v.Fatalf("aux in MOVDaddr is of unknown type %T", v.Aux)
   739  		case nil:
   740  			// If the offset (AuxInt) is 0, and the same
   741  			// input and output regs are used, no instruction
   742  			// needs to be generated, since it would just be
   743  			// addi rx, rx, 0.
   744  			if v.AuxInt != 0 || v.Args[0].Reg() != v.Reg() {
   745  				p := s.Prog(ppc64.AMOVD)
   746  				p.From.Type = obj.TYPE_ADDR
   747  				p.From.Reg = v.Args[0].Reg()
   748  				p.From.Offset = v.AuxInt
   749  				p.To.Type = obj.TYPE_REG
   750  				p.To.Reg = v.Reg()
   751  			}
   752  
   753  		case *obj.LSym, *gc.Node:
   754  			p := s.Prog(ppc64.AMOVD)
   755  			p.From.Type = obj.TYPE_ADDR
   756  			p.From.Reg = v.Args[0].Reg()
   757  			p.To.Type = obj.TYPE_REG
   758  			p.To.Reg = v.Reg()
   759  			gc.AddAux(&p.From, v)
   760  
   761  		}
   762  
   763  	case ssa.OpPPC64MOVDconst:
   764  		p := s.Prog(v.Op.Asm())
   765  		p.From.Type = obj.TYPE_CONST
   766  		p.From.Offset = v.AuxInt
   767  		p.To.Type = obj.TYPE_REG
   768  		p.To.Reg = v.Reg()
   769  
   770  	case ssa.OpPPC64FMOVDconst, ssa.OpPPC64FMOVSconst:
   771  		p := s.Prog(v.Op.Asm())
   772  		p.From.Type = obj.TYPE_FCONST
   773  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   774  		p.To.Type = obj.TYPE_REG
   775  		p.To.Reg = v.Reg()
   776  
   777  	case ssa.OpPPC64FCMPU, ssa.OpPPC64CMP, ssa.OpPPC64CMPW, ssa.OpPPC64CMPU, ssa.OpPPC64CMPWU:
   778  		p := s.Prog(v.Op.Asm())
   779  		p.From.Type = obj.TYPE_REG
   780  		p.From.Reg = v.Args[0].Reg()
   781  		p.To.Type = obj.TYPE_REG
   782  		p.To.Reg = v.Args[1].Reg()
   783  
   784  	case ssa.OpPPC64CMPconst, ssa.OpPPC64CMPUconst, ssa.OpPPC64CMPWconst, ssa.OpPPC64CMPWUconst:
   785  		p := s.Prog(v.Op.Asm())
   786  		// Sign or zero extend in the register to the required size
   787  		p.From.Reg = v.Args[0].Reg()
   788  		p.To.Type = obj.TYPE_CONST
   789  		p.To.Offset = v.AuxInt
   790  
   791  	case ssa.OpPPC64MOVBreg, ssa.OpPPC64MOVBZreg, ssa.OpPPC64MOVHreg, ssa.OpPPC64MOVHZreg, ssa.OpPPC64MOVWreg, ssa.OpPPC64MOVWZreg:
   792  		// Shift in register to required size
   793  		p := s.Prog(v.Op.Asm())
   794  		p.From.Type = obj.TYPE_REG
   795  		p.From.Reg = v.Args[0].Reg()
   796  		p.To.Reg = v.Reg()
   797  		p.To.Type = obj.TYPE_REG
   798  
   799  	case ssa.OpPPC64MOVDload:
   800  
   801  		// MOVDload uses a DS instruction which requires the offset value of the data to be a multiple of 4.
   802  		// For offsets known at compile time, a MOVDload won't be selected, but in the case of a go.string,
   803  		// the offset is not known until link time. If the load of a go.string uses relocation for the
   804  		// offset field of the instruction, and if the offset is not aligned to 4, then a link error will occur.
   805  		// To avoid this problem, the full address of the go.string is computed and loaded into the base register,
   806  		// and that base register is used for the MOVDload using a 0 offset. This problem can only occur with
   807  		// go.string types because other types will have proper alignment.
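        		// Sketch of the expansion, for a hypothetical symbol:
        		//	MOVD $go.string."..."+off(SB), Rx
        		//	MOVD 0(Rx), Rx
        		// rather than MOVD go.string."..."+off(SB), Rx, whose DS-form
        		// offset relocation would fail to link for an off that is not
        		// a multiple of 4.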
   808  
   809  		gostring := false
   810  		switch n := v.Aux.(type) {
   811  		case *obj.LSym:
   812  			gostring = strings.HasPrefix(n.Name, "go.string.")
   813  		}
   814  		if gostring {
   815  			// Generate full addr of the go.string const
   816  			// including AuxInt
   817  			p := s.Prog(ppc64.AMOVD)
   818  			p.From.Type = obj.TYPE_ADDR
   819  			p.From.Reg = v.Args[0].Reg()
   820  			gc.AddAux(&p.From, v)
   821  			p.To.Type = obj.TYPE_REG
   822  			p.To.Reg = v.Reg()
   823  			// Load go.string using 0 offset
   824  			p = s.Prog(v.Op.Asm())
   825  			p.From.Type = obj.TYPE_MEM
   826  			p.From.Reg = v.Reg()
   827  			p.To.Type = obj.TYPE_REG
   828  			p.To.Reg = v.Reg()
   829  			break
   830  		}
   831  		// Not a go.string, generate a normal load
   832  		fallthrough
   833  
   834  	case ssa.OpPPC64MOVWload, ssa.OpPPC64MOVHload, ssa.OpPPC64MOVWZload, ssa.OpPPC64MOVBZload, ssa.OpPPC64MOVHZload, ssa.OpPPC64FMOVDload, ssa.OpPPC64FMOVSload:
   835  		p := s.Prog(v.Op.Asm())
   836  		p.From.Type = obj.TYPE_MEM
   837  		p.From.Reg = v.Args[0].Reg()
   838  		gc.AddAux(&p.From, v)
   839  		p.To.Type = obj.TYPE_REG
   840  		p.To.Reg = v.Reg()
   841  
   842  	case ssa.OpPPC64MOVDBRload, ssa.OpPPC64MOVWBRload, ssa.OpPPC64MOVHBRload:
   843  		p := s.Prog(v.Op.Asm())
   844  		p.From.Type = obj.TYPE_MEM
   845  		p.From.Reg = v.Args[0].Reg()
   846  		p.To.Type = obj.TYPE_REG
   847  		p.To.Reg = v.Reg()
   848  
   849  	case ssa.OpPPC64MOVDBRstore, ssa.OpPPC64MOVWBRstore, ssa.OpPPC64MOVHBRstore:
   850  		p := s.Prog(v.Op.Asm())
   851  		p.To.Type = obj.TYPE_MEM
   852  		p.To.Reg = v.Args[0].Reg()
   853  		p.From.Type = obj.TYPE_REG
   854  		p.From.Reg = v.Args[1].Reg()
   855  
   856  	case ssa.OpPPC64MOVDloadidx, ssa.OpPPC64MOVWloadidx, ssa.OpPPC64MOVHloadidx, ssa.OpPPC64MOVWZloadidx,
   857  		ssa.OpPPC64MOVBZloadidx, ssa.OpPPC64MOVHZloadidx, ssa.OpPPC64FMOVDloadidx, ssa.OpPPC64FMOVSloadidx,
   858  		ssa.OpPPC64MOVDBRloadidx, ssa.OpPPC64MOVWBRloadidx, ssa.OpPPC64MOVHBRloadidx:
   859  		p := s.Prog(v.Op.Asm())
   860  		p.From.Type = obj.TYPE_MEM
   861  		p.From.Reg = v.Args[0].Reg()
   862  		p.From.Index = v.Args[1].Reg()
   863  		p.To.Type = obj.TYPE_REG
   864  		p.To.Reg = v.Reg()
   865  
   866  	case ssa.OpPPC64MOVDstorezero, ssa.OpPPC64MOVWstorezero, ssa.OpPPC64MOVHstorezero, ssa.OpPPC64MOVBstorezero:
   867  		p := s.Prog(v.Op.Asm())
   868  		p.From.Type = obj.TYPE_REG
   869  		p.From.Reg = ppc64.REGZERO
   870  		p.To.Type = obj.TYPE_MEM
   871  		p.To.Reg = v.Args[0].Reg()
   872  		gc.AddAux(&p.To, v)
   873  
   874  	case ssa.OpPPC64MOVDstore, ssa.OpPPC64MOVWstore, ssa.OpPPC64MOVHstore, ssa.OpPPC64MOVBstore, ssa.OpPPC64FMOVDstore, ssa.OpPPC64FMOVSstore:
   875  		p := s.Prog(v.Op.Asm())
   876  		p.From.Type = obj.TYPE_REG
   877  		p.From.Reg = v.Args[1].Reg()
   878  		p.To.Type = obj.TYPE_MEM
   879  		p.To.Reg = v.Args[0].Reg()
   880  		gc.AddAux(&p.To, v)
   881  
   882  	case ssa.OpPPC64MOVDstoreidx, ssa.OpPPC64MOVWstoreidx, ssa.OpPPC64MOVHstoreidx, ssa.OpPPC64MOVBstoreidx,
   883  		ssa.OpPPC64FMOVDstoreidx, ssa.OpPPC64FMOVSstoreidx, ssa.OpPPC64MOVDBRstoreidx, ssa.OpPPC64MOVWBRstoreidx,
   884  		ssa.OpPPC64MOVHBRstoreidx:
   885  		p := s.Prog(v.Op.Asm())
   886  		p.From.Type = obj.TYPE_REG
   887  		p.From.Reg = v.Args[2].Reg()
   888  		p.To.Index = v.Args[1].Reg()
   889  		p.To.Type = obj.TYPE_MEM
   890  		p.To.Reg = v.Args[0].Reg()
   891  
   892  	case ssa.OpPPC64ISEL, ssa.OpPPC64ISELB:
   893  		// ISEL, ISELB
   894  		// AuxInt value indicates condition: 0=LT 1=GT 2=EQ 4=GE 5=LE 6=NE
   895  		// ISEL only accepts 0, 1, 2 condition values but the others can be
   896  		// achieved by swapping operand order.
   897  		// arg0 ? arg1 : arg2 with conditions LT, GT, EQ
   898  		// arg0 ? arg2 : arg1 for conditions GE, LE, NE
   899  		// ISELB is used when a boolean result is needed, returning 0 or 1
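        		// e.g. GE (AuxInt == 4) is emitted as ISEL with condition LT
        		// (4 & 3 == 0) and the selectable operands swapped, since
        		// "GE ? a : b" is the same as "LT ? b : a".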
   900  		p := s.Prog(ppc64.AISEL)
   901  		p.To.Type = obj.TYPE_REG
   902  		p.To.Reg = v.Reg()
   903  		// For ISELB, boolean result 0 or 1. Use R0 for 0 operand to avoid load.
   904  		r := obj.Addr{Type: obj.TYPE_REG, Reg: ppc64.REG_R0}
   905  		if v.Op == ssa.OpPPC64ISEL {
   906  			r.Reg = v.Args[1].Reg()
   907  		}
   908  		// AuxInt values 4,5,6 implemented with reverse operand order from 0,1,2
   909  		if v.AuxInt > 3 {
   910  			p.Reg = r.Reg
   911  			p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
   912  		} else {
   913  			p.Reg = v.Args[0].Reg()
   914  			p.SetFrom3(r)
   915  		}
   916  		p.From.Type = obj.TYPE_CONST
   917  		p.From.Offset = v.AuxInt & 3
   918  
   919  	case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
   920  		// The LoweredQuad code generation
   921  		// generates STXV instructions on
   922  		// power9. The Short variation is used
   923  		// if no loop is generated.
   924  
   925  		// sizes >= 128 generate a loop as follows:
   926  
   927  		// Set up loop counter in CTR, used by BC
   928  		// XXLXOR clears VS32
   929  		//       XXLXOR VS32,VS32,VS32
   930  		//       MOVD len/64,REG_TMP
   931  		//       MOVD REG_TMP,CTR
   932  		//       loop:
   933  		//       STXV VS32,0(R20)
   934  		//       STXV VS32,16(R20)
   935  		//       STXV VS32,32(R20)
   936  		//       STXV VS32,48(R20)
   937  		//       ADD  $64,R20
   938  		//       BC   16, 0, loop
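        		// Worked example: zeroing 190 bytes gives ctr == 2, so the
        		// loop above clears 128 bytes; the remaining 62 are cleared
        		// below with STXV (32+16) followed by MOVD+MOVW+MOVH (8+4+2).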
   939  
   940  		// Number of 64 byte loop iterations
   941  		ctr := v.AuxInt / 64
   942  
   943  		// Remainder bytes
   944  		rem := v.AuxInt % 64
   945  
   946  		// Only generate a loop if there is more
   947  		// than 1 iteration.
   948  		if ctr > 1 {
   949  			// Set up VS32 (V0) to hold 0s
   950  			p := s.Prog(ppc64.AXXLXOR)
   951  			p.From.Type = obj.TYPE_REG
   952  			p.From.Reg = ppc64.REG_VS32
   953  			p.To.Type = obj.TYPE_REG
   954  			p.To.Reg = ppc64.REG_VS32
   955  			p.Reg = ppc64.REG_VS32
   956  
   957  			// Set up CTR loop counter
   958  			p = s.Prog(ppc64.AMOVD)
   959  			p.From.Type = obj.TYPE_CONST
   960  			p.From.Offset = ctr
   961  			p.To.Type = obj.TYPE_REG
   962  			p.To.Reg = ppc64.REGTMP
   963  
   964  			p = s.Prog(ppc64.AMOVD)
   965  			p.From.Type = obj.TYPE_REG
   966  			p.From.Reg = ppc64.REGTMP
   967  			p.To.Type = obj.TYPE_REG
   968  			p.To.Reg = ppc64.REG_CTR
   969  
   970  			// Don't generate padding for
   971  			// loops with few iterations.
   972  			if ctr > 3 {
   973  				p = s.Prog(obj.APCALIGN)
   974  				p.From.Type = obj.TYPE_CONST
   975  				p.From.Offset = 16
   976  			}
   977  
   978  			// generate 4 STXVs to zero 64 bytes
   979  			var top *obj.Prog
   980  
   981  			p = s.Prog(ppc64.ASTXV)
   982  			p.From.Type = obj.TYPE_REG
   983  			p.From.Reg = ppc64.REG_VS32
   984  			p.To.Type = obj.TYPE_MEM
   985  			p.To.Reg = v.Args[0].Reg()
   986  
   987  			//  Save the top of loop
   988  			if top == nil {
   989  				top = p
   990  			}
   991  			p = s.Prog(ppc64.ASTXV)
   992  			p.From.Type = obj.TYPE_REG
   993  			p.From.Reg = ppc64.REG_VS32
   994  			p.To.Type = obj.TYPE_MEM
   995  			p.To.Reg = v.Args[0].Reg()
   996  			p.To.Offset = 16
   997  
   998  			p = s.Prog(ppc64.ASTXV)
   999  			p.From.Type = obj.TYPE_REG
  1000  			p.From.Reg = ppc64.REG_VS32
  1001  			p.To.Type = obj.TYPE_MEM
  1002  			p.To.Reg = v.Args[0].Reg()
  1003  			p.To.Offset = 32
  1004  
  1005  			p = s.Prog(ppc64.ASTXV)
  1006  			p.From.Type = obj.TYPE_REG
  1007  			p.From.Reg = ppc64.REG_VS32
  1008  			p.To.Type = obj.TYPE_MEM
  1009  			p.To.Reg = v.Args[0].Reg()
  1010  			p.To.Offset = 48
  1011  
  1012  			// Increment address for the
  1013  			// 64 bytes just zeroed.
  1014  			p = s.Prog(ppc64.AADD)
  1015  			p.Reg = v.Args[0].Reg()
  1016  			p.From.Type = obj.TYPE_CONST
  1017  			p.From.Offset = 64
  1018  			p.To.Type = obj.TYPE_REG
  1019  			p.To.Reg = v.Args[0].Reg()
  1020  
  1021  			// Branch back to top of loop
  1022  			// based on CTR
  1023  			// BC with BO_BCTR generates bdnz
  1024  			p = s.Prog(ppc64.ABC)
  1025  			p.From.Type = obj.TYPE_CONST
  1026  			p.From.Offset = ppc64.BO_BCTR
  1027  			p.Reg = ppc64.REG_R0
  1028  			p.To.Type = obj.TYPE_BRANCH
  1029  			gc.Patch(p, top)
  1030  		}
  1031  		// When ctr == 1 the loop was not generated but
  1032  		// there are at least 64 bytes to clear, so add
  1033  		// that to the remainder to generate the code
  1034  		// to clear those doublewords
  1035  		if ctr == 1 {
  1036  			rem += 64
  1037  		}
  1038  
  1039  		// Clear the remainder starting at offset zero
  1040  		offset := int64(0)
  1041  
  1042  		if rem >= 16 && ctr <= 1 {
  1043  			// If the XXLXOR hasn't already been
  1044  			// generated, do it here to initialize
  1045  			// VS32 (V0) to 0.
  1046  			p := s.Prog(ppc64.AXXLXOR)
  1047  			p.From.Type = obj.TYPE_REG
  1048  			p.From.Reg = ppc64.REG_VS32
  1049  			p.To.Type = obj.TYPE_REG
  1050  			p.To.Reg = ppc64.REG_VS32
  1051  			p.Reg = ppc64.REG_VS32
  1052  		}
  1053  		// Generate STXV for 32 or 64
  1054  		// bytes.
  1055  		for rem >= 32 {
  1056  			p := s.Prog(ppc64.ASTXV)
  1057  			p.From.Type = obj.TYPE_REG
  1058  			p.From.Reg = ppc64.REG_VS32
  1059  			p.To.Type = obj.TYPE_MEM
  1060  			p.To.Reg = v.Args[0].Reg()
  1061  			p.To.Offset = offset
  1062  
  1063  			p = s.Prog(ppc64.ASTXV)
  1064  			p.From.Type = obj.TYPE_REG
  1065  			p.From.Reg = ppc64.REG_VS32
  1066  			p.To.Type = obj.TYPE_MEM
  1067  			p.To.Reg = v.Args[0].Reg()
  1068  			p.To.Offset = offset + 16
  1069  			offset += 32
  1070  			rem -= 32
  1071  		}
  1072  		// Generate 16 bytes
  1073  		if rem >= 16 {
  1074  			p := s.Prog(ppc64.ASTXV)
  1075  			p.From.Type = obj.TYPE_REG
  1076  			p.From.Reg = ppc64.REG_VS32
  1077  			p.To.Type = obj.TYPE_MEM
  1078  			p.To.Reg = v.Args[0].Reg()
  1079  			p.To.Offset = offset
  1080  			offset += 16
  1081  			rem -= 16
  1082  		}
  1083  
  1084  		// first clear as many doublewords as possible
  1085  		// then clear remaining sizes as available
  1086  		for rem > 0 {
  1087  			op, size := ppc64.AMOVB, int64(1)
  1088  			switch {
  1089  			case rem >= 8:
  1090  				op, size = ppc64.AMOVD, 8
  1091  			case rem >= 4:
  1092  				op, size = ppc64.AMOVW, 4
  1093  			case rem >= 2:
  1094  				op, size = ppc64.AMOVH, 2
  1095  			}
  1096  			p := s.Prog(op)
  1097  			p.From.Type = obj.TYPE_REG
  1098  			p.From.Reg = ppc64.REG_R0
  1099  			p.To.Type = obj.TYPE_MEM
  1100  			p.To.Reg = v.Args[0].Reg()
  1101  			p.To.Offset = offset
  1102  			rem -= size
  1103  			offset += size
  1104  		}
  1105  
  1106  	case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:
  1107  
  1108  		// Unaligned data doesn't hurt performance
  1109  		// for these instructions on power8.
  1110  
  1111  		// For sizes >= 64 generate a loop as follows:
  1112  
  1113  		// Set up loop counter in CTR, used by BC
  1114  		//       XXLXOR VS32,VS32,VS32
  1115  		//	 MOVD len/32,REG_TMP
  1116  		//	 MOVD REG_TMP,CTR
  1117  		//       MOVD $16,REG_TMP
  1118  		//	 loop:
  1119  		//	 STXVD2X VS32,(R0)(R20)
  1120  		//	 STXVD2X VS32,(R31)(R20)
  1121  		//	 ADD  $32,R20
  1122  		//	 BC   16, 0, loop
  1123  		//
  1124  		// any remainder is done as described below
  1125  
  1126  		// for sizes < 64 bytes, first clear as many doublewords as possible,
  1127  		// then handle the remainder
  1128  		//	MOVD R0,(R20)
  1129  		//	MOVD R0,8(R20)
  1130  		// .... etc.
  1131  		//
  1132  		// the remainder bytes are cleared using one or more
  1133  		// of the following instructions with the appropriate
  1134  		// offsets depending which instructions are needed
  1135  		//
  1136  		//	MOVW R0,n1(R20)	4 bytes
  1137  		//	MOVH R0,n2(R20)	2 bytes
  1138  		//	MOVB R0,n3(R20)	1 byte
  1139  		//
  1140  		// 7 bytes: MOVW, MOVH, MOVB
  1141  		// 6 bytes: MOVW, MOVH
  1142  		// 5 bytes: MOVW, MOVB
  1143  		// 3 bytes: MOVH, MOVB
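        		// e.g. zeroing 79 bytes: ctr == 2, so the loop clears 64
        		// bytes and the 15 byte remainder is cleared with MOVD,
        		// MOVW, MOVH, MOVB.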
  1144  
  1145  		// each loop iteration does 32 bytes
  1146  		ctr := v.AuxInt / 32
  1147  
  1148  		// remainder bytes
  1149  		rem := v.AuxInt % 32
  1150  
  1151  		// only generate a loop if there is more
  1152  		// than 1 iteration.
  1153  		if ctr > 1 {
  1154  			// Set up VS32 (V0) to hold 0s
  1155  			p := s.Prog(ppc64.AXXLXOR)
  1156  			p.From.Type = obj.TYPE_REG
  1157  			p.From.Reg = ppc64.REG_VS32
  1158  			p.To.Type = obj.TYPE_REG
  1159  			p.To.Reg = ppc64.REG_VS32
  1160  			p.Reg = ppc64.REG_VS32
  1161  
  1162  			// Set up CTR loop counter
  1163  			p = s.Prog(ppc64.AMOVD)
  1164  			p.From.Type = obj.TYPE_CONST
  1165  			p.From.Offset = ctr
  1166  			p.To.Type = obj.TYPE_REG
  1167  			p.To.Reg = ppc64.REGTMP
  1168  
  1169  			p = s.Prog(ppc64.AMOVD)
  1170  			p.From.Type = obj.TYPE_REG
  1171  			p.From.Reg = ppc64.REGTMP
  1172  			p.To.Type = obj.TYPE_REG
  1173  			p.To.Reg = ppc64.REG_CTR
  1174  
  1175  			// Set up R31 to hold index value 16
  1176  			p = s.Prog(ppc64.AMOVD)
  1177  			p.From.Type = obj.TYPE_CONST
  1178  			p.From.Offset = 16
  1179  			p.To.Type = obj.TYPE_REG
  1180  			p.To.Reg = ppc64.REGTMP
  1181  
  1182  			// Don't add padding for alignment
  1183  			// with few loop iterations.
  1184  			if ctr > 3 {
  1185  				p = s.Prog(obj.APCALIGN)
  1186  				p.From.Type = obj.TYPE_CONST
  1187  				p.From.Offset = 16
  1188  			}
  1189  
  1190  			// generate 2 STXVD2Xs, each storing 16 bytes;
  1191  			// when this is a loop then the top must be saved
  1192  			var top *obj.Prog
  1193  			// This is the top of loop
  1194  
  1195  			p = s.Prog(ppc64.ASTXVD2X)
  1196  			p.From.Type = obj.TYPE_REG
  1197  			p.From.Reg = ppc64.REG_VS32
  1198  			p.To.Type = obj.TYPE_MEM
  1199  			p.To.Reg = v.Args[0].Reg()
  1200  			p.To.Index = ppc64.REGZERO
  1201  			// Save the top of loop
  1202  			if top == nil {
  1203  				top = p
  1204  			}
  1205  			p = s.Prog(ppc64.ASTXVD2X)
  1206  			p.From.Type = obj.TYPE_REG
  1207  			p.From.Reg = ppc64.REG_VS32
  1208  			p.To.Type = obj.TYPE_MEM
  1209  			p.To.Reg = v.Args[0].Reg()
  1210  			p.To.Index = ppc64.REGTMP
  1211  
  1212  			// Increment address for the
  1213  			// 4 doublewords just zeroed.
  1214  			p = s.Prog(ppc64.AADD)
  1215  			p.Reg = v.Args[0].Reg()
  1216  			p.From.Type = obj.TYPE_CONST
  1217  			p.From.Offset = 32
  1218  			p.To.Type = obj.TYPE_REG
  1219  			p.To.Reg = v.Args[0].Reg()
  1220  
  1221  			// Branch back to top of loop
  1222  			// based on CTR
  1223  			// BC with BO_BCTR generates bdnz
  1224  			p = s.Prog(ppc64.ABC)
  1225  			p.From.Type = obj.TYPE_CONST
  1226  			p.From.Offset = ppc64.BO_BCTR
  1227  			p.Reg = ppc64.REG_R0
  1228  			p.To.Type = obj.TYPE_BRANCH
  1229  			gc.Patch(p, top)
  1230  		}
  1231  
  1232  		// when ctr == 1 the loop was not generated but
  1233  		// there are at least 32 bytes to clear, so add
  1234  		// that to the remainder to generate the code
  1235  		// to clear those doublewords
  1236  		if ctr == 1 {
  1237  			rem += 32
  1238  		}
  1239  
  1240  		// clear the remainder starting at offset zero
  1241  		offset := int64(0)
  1242  
  1243  		// first clear as many doublewords as possible
  1244  		// then clear remaining sizes as available
  1245  		for rem > 0 {
  1246  			op, size := ppc64.AMOVB, int64(1)
  1247  			switch {
  1248  			case rem >= 8:
  1249  				op, size = ppc64.AMOVD, 8
  1250  			case rem >= 4:
  1251  				op, size = ppc64.AMOVW, 4
  1252  			case rem >= 2:
  1253  				op, size = ppc64.AMOVH, 2
  1254  			}
  1255  			p := s.Prog(op)
  1256  			p.From.Type = obj.TYPE_REG
  1257  			p.From.Reg = ppc64.REG_R0
  1258  			p.To.Type = obj.TYPE_MEM
  1259  			p.To.Reg = v.Args[0].Reg()
  1260  			p.To.Offset = offset
  1261  			rem -= size
  1262  			offset += size
  1263  		}
  1264  
  1265  	case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:
  1266  
  1267  		bytesPerLoop := int64(32)
  1268  		// This will be used when moving more
  1269  		// than 8 bytes.  Moves start with
  1270  		// as many 8 byte moves as possible, then
  1271  		// 4, 2, or 1 byte(s) as remaining.  This will
  1272  		// work and be efficient for power8 or later.
  1273  		// If there are 64 or more bytes, then a
  1274  		// loop is generated to move 32 bytes and
  1275  		// update the src and dst addresses on each
  1276  		// iteration. When < 64 bytes, the appropriate
  1277  		// number of moves are generated based on the
  1278  		// size.
  1279  		// When moving >= 64 bytes a loop is used
  1280  		//	MOVD len/32,REG_TMP
  1281  		//	MOVD REG_TMP,CTR
  1282  		//	MOVD $16,REG_TMP
  1283  		// top:
  1284  		//	LXVD2X (R0)(R21),VS32
  1285  		//	LXVD2X (R31)(R21),VS33
  1286  		//	ADD $32,R21
  1287  		//	STXVD2X VS32,(R0)(R20)
  1288  		//	STXVD2X VS33,(R31)(R20)
  1289  		//	ADD $32,R20
  1290  		//	BC 16,0,top
  1291  		// Bytes not moved by this loop are moved
  1292  		// with a combination of the following instructions,
  1293  		// starting with the largest sizes and generating as
  1294  		// many as needed, using the appropriate offset value.
  1295  		//	MOVD  n(R21),R31
  1296  		//	MOVD  R31,n(R20)
  1297  		//	MOVW  n1(R21),R31
  1298  		//	MOVW  R31,n1(R20)
  1299  		//	MOVH  n2(R21),R31
  1300  		//	MOVH  R31,n2(R20)
  1301  		//	MOVB  n3(R21),R31
  1302  		//	MOVB  R31,n3(R20)
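        		// e.g. a 100 byte move: ctr == 3, so the loop moves 96 bytes
        		// and the remaining 4 are moved with a MOVW load/store pair.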
  1303  
  1304  		// Each loop iteration moves 32 bytes
  1305  		ctr := v.AuxInt / bytesPerLoop
  1306  
  1307  		// Remainder after the loop
  1308  		rem := v.AuxInt % bytesPerLoop
  1309  
  1310  		dstReg := v.Args[0].Reg()
  1311  		srcReg := v.Args[1].Reg()
  1312  
  1313  		// The set of registers used here must match the clobbered reg list
  1314  		// in PPC64Ops.go.
  1315  		offset := int64(0)
  1316  
  1317  		// top of the loop
  1318  		var top *obj.Prog
  1319  		// Only generate looping code when loop counter is > 1 for >= 64 bytes
  1320  		if ctr > 1 {
  1321  			// Set up the CTR
  1322  			p := s.Prog(ppc64.AMOVD)
  1323  			p.From.Type = obj.TYPE_CONST
  1324  			p.From.Offset = ctr
  1325  			p.To.Type = obj.TYPE_REG
  1326  			p.To.Reg = ppc64.REGTMP
  1327  
  1328  			p = s.Prog(ppc64.AMOVD)
  1329  			p.From.Type = obj.TYPE_REG
  1330  			p.From.Reg = ppc64.REGTMP
  1331  			p.To.Type = obj.TYPE_REG
  1332  			p.To.Reg = ppc64.REG_CTR
  1333  
  1334  			// Use REGTMP as index reg
  1335  			p = s.Prog(ppc64.AMOVD)
  1336  			p.From.Type = obj.TYPE_CONST
  1337  			p.From.Offset = 16
  1338  			p.To.Type = obj.TYPE_REG
  1339  			p.To.Reg = ppc64.REGTMP
  1340  
  1341  			// Don't add padding for
  1342  			// alignment with small iteration
  1343  			// counts.
  1344  			if ctr > 3 {
  1345  				p = s.Prog(obj.APCALIGN)
  1346  				p.From.Type = obj.TYPE_CONST
  1347  				p.From.Offset = 16
  1348  			}
  1349  
  1350  			// Generate 16 byte loads and stores.
  1351  			// Use temp register for index (16)
  1352  			// on the second one.
  1353  
  1354  			p = s.Prog(ppc64.ALXVD2X)
  1355  			p.From.Type = obj.TYPE_MEM
  1356  			p.From.Reg = srcReg
  1357  			p.From.Index = ppc64.REGZERO
  1358  			p.To.Type = obj.TYPE_REG
  1359  			p.To.Reg = ppc64.REG_VS32
  1360  			if top == nil {
  1361  				top = p
  1362  			}
  1363  			p = s.Prog(ppc64.ALXVD2X)
  1364  			p.From.Type = obj.TYPE_MEM
  1365  			p.From.Reg = srcReg
  1366  			p.From.Index = ppc64.REGTMP
  1367  			p.To.Type = obj.TYPE_REG
  1368  			p.To.Reg = ppc64.REG_VS33
  1369  
  1370  			// increment the src reg for next iteration
  1371  			p = s.Prog(ppc64.AADD)
  1372  			p.Reg = srcReg
  1373  			p.From.Type = obj.TYPE_CONST
  1374  			p.From.Offset = bytesPerLoop
  1375  			p.To.Type = obj.TYPE_REG
  1376  			p.To.Reg = srcReg
  1377  
  1378  			// generate 16 byte stores
  1379  			p = s.Prog(ppc64.ASTXVD2X)
  1380  			p.From.Type = obj.TYPE_REG
  1381  			p.From.Reg = ppc64.REG_VS32
  1382  			p.To.Type = obj.TYPE_MEM
  1383  			p.To.Reg = dstReg
  1384  			p.To.Index = ppc64.REGZERO
  1385  
  1386  			p = s.Prog(ppc64.ASTXVD2X)
  1387  			p.From.Type = obj.TYPE_REG
  1388  			p.From.Reg = ppc64.REG_VS33
  1389  			p.To.Type = obj.TYPE_MEM
  1390  			p.To.Reg = dstReg
  1391  			p.To.Index = ppc64.REGTMP
  1392  
  1393  			// increment the dst reg for next iteration
  1394  			p = s.Prog(ppc64.AADD)
  1395  			p.Reg = dstReg
  1396  			p.From.Type = obj.TYPE_CONST
  1397  			p.From.Offset = bytesPerLoop
  1398  			p.To.Type = obj.TYPE_REG
  1399  			p.To.Reg = dstReg
  1400  
  1401  			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
  1402  			// to loop top.
  1403  			p = s.Prog(ppc64.ABC)
  1404  			p.From.Type = obj.TYPE_CONST
  1405  			p.From.Offset = ppc64.BO_BCTR
  1406  			p.Reg = ppc64.REG_R0
  1407  			p.To.Type = obj.TYPE_BRANCH
  1408  			gc.Patch(p, top)
  1409  
  1410  			// srcReg and dstReg were incremented in the loop, so
  1411  			// later instructions start with offset 0.
  1412  			offset = int64(0)
  1413  		}
  1414  
  1415  		// No loop was generated for one iteration, so
  1416  		// add 32 bytes to the remainder to move those bytes.
  1417  		if ctr == 1 {
  1418  			rem += bytesPerLoop
  1419  		}
  1420  
  1421  		if rem >= 16 {
  1422  			// Generate 16 byte loads and stores.
  1423  			// Use temp register for index (value 16)
  1424  			// on the second one.
  1425  			p := s.Prog(ppc64.ALXVD2X)
  1426  			p.From.Type = obj.TYPE_MEM
  1427  			p.From.Reg = srcReg
  1428  			p.From.Index = ppc64.REGZERO
  1429  			p.To.Type = obj.TYPE_REG
  1430  			p.To.Reg = ppc64.REG_VS32
  1431  
  1432  			p = s.Prog(ppc64.ASTXVD2X)
  1433  			p.From.Type = obj.TYPE_REG
  1434  			p.From.Reg = ppc64.REG_VS32
  1435  			p.To.Type = obj.TYPE_MEM
  1436  			p.To.Reg = dstReg
  1437  			p.To.Index = ppc64.REGZERO
  1438  
  1439  			offset = 16
  1440  			rem -= 16
  1441  
  1442  			if rem >= 16 {
  1443  				// Use REGTMP as index reg
  1444  				p := s.Prog(ppc64.AMOVD)
  1445  				p.From.Type = obj.TYPE_CONST
  1446  				p.From.Offset = 16
  1447  				p.To.Type = obj.TYPE_REG
  1448  				p.To.Reg = ppc64.REGTMP
  1449  
  1450  				p = s.Prog(ppc64.ALXVD2X)
  1451  				p.From.Type = obj.TYPE_MEM
  1452  				p.From.Reg = srcReg
  1453  				p.From.Index = ppc64.REGTMP
  1454  				p.To.Type = obj.TYPE_REG
  1455  				p.To.Reg = ppc64.REG_VS32
  1456  
  1457  				p = s.Prog(ppc64.ASTXVD2X)
  1458  				p.From.Type = obj.TYPE_REG
  1459  				p.From.Reg = ppc64.REG_VS32
  1460  				p.To.Type = obj.TYPE_MEM
  1461  				p.To.Reg = dstReg
  1462  				p.To.Index = ppc64.REGTMP
  1463  
  1464  				offset = 32
  1465  				rem -= 16
  1466  			}
  1467  		}
  1468  
  1469  		// Generate all the remaining load and store pairs, starting with
  1470  		// as many 8 byte moves as possible, then 4, 2, 1.
  1471  		for rem > 0 {
  1472  			op, size := ppc64.AMOVB, int64(1)
  1473  			switch {
  1474  			case rem >= 8:
  1475  				op, size = ppc64.AMOVD, 8
  1476  			case rem >= 4:
  1477  				op, size = ppc64.AMOVW, 4
  1478  			case rem >= 2:
  1479  				op, size = ppc64.AMOVH, 2
  1480  			}
  1481  			// Load
  1482  			p := s.Prog(op)
  1483  			p.To.Type = obj.TYPE_REG
  1484  			p.To.Reg = ppc64.REGTMP
  1485  			p.From.Type = obj.TYPE_MEM
  1486  			p.From.Reg = srcReg
  1487  			p.From.Offset = offset
  1488  
  1489  			// Store
  1490  			p = s.Prog(op)
  1491  			p.From.Type = obj.TYPE_REG
  1492  			p.From.Reg = ppc64.REGTMP
  1493  			p.To.Type = obj.TYPE_MEM
  1494  			p.To.Reg = dstReg
  1495  			p.To.Offset = offset
  1496  			rem -= size
  1497  			offset += size
  1498  		}
  1499  
  1500  	case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
  1501  		bytesPerLoop := int64(64)
  1502  		// This is used when moving more
  1503  		// than 8 bytes on power9.  Large moves use
  1504  		// 16 byte LXV/STXV pairs, then as many 8
  1505  		// byte moves as possible, then 4, 2, or 1
  1506  		// byte(s) as remaining; LXV/STXV require power9.
  1507  		// If there are 128 or more bytes, a loop is
  1508  		// generated to move 64 bytes and update the
  1509  		// src and dst addresses on each iteration.
  1510  		// When < 128 bytes, the appropriate number
  1511  		// of moves are generated based on the
  1512  		// size.
  1513  		// When moving >= 128 bytes a loop is used
  1514  		//      MOVD len/64,REG_TMP
  1515  		//      MOVD REG_TMP,CTR
  1516  		// top:
  1517  		//      LXV  0(R21),VS32;  LXV  16(R21),VS33
  1518  		//      STXV VS32,0(R20);  STXV VS33,16(R20)
  1519  		//      LXV  32(R21),VS32; LXV  48(R21),VS33
  1520  		//      STXV VS32,32(R20); STXV VS33,48(R20)
  1521  		//      ADD  $64,R21
  1522  		//      ADD  $64,R20
  1523  		//      BC   16,0,top
  1524  		// Bytes not moved by this loop are moved
  1525  		// with a combination of the following instructions,
  1526  		// starting with the largest sizes and generating as
  1527  		// many as needed, using the appropriate offset value.
  1528  		//      MOVD  n(R21),R31
  1529  		//      MOVD  R31,n(R20)
  1530  		//      MOVW  n1(R21),R31
  1531  		//      MOVW  R31,n1(R20)
  1532  		//      MOVH  n2(R21),R31
  1533  		//      MOVH  R31,n2(R20)
  1534  		//      MOVB  n3(R21),R31
  1535  		//      MOVB  R31,n3(R20)
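        		// e.g. a 72 byte move: ctr == 1, so no loop is generated and
        		// the full 72 bytes are moved below with LXV/STXV pairs
        		// (32+16+16) plus one MOVD load/store pair (8).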
  1536  
  1537  		// Each loop iteration moves 64 bytes
  1538  		ctr := v.AuxInt / bytesPerLoop
  1539  
  1540  		// Remainder after the loop
  1541  		rem := v.AuxInt % bytesPerLoop
  1542  
  1543  		dstReg := v.Args[0].Reg()
  1544  		srcReg := v.Args[1].Reg()
  1545  
  1546  		offset := int64(0)
  1547  
  1548  		// top of the loop
  1549  		var top *obj.Prog
  1550  
  1551  		// Only generate looping code when loop counter is > 1 for >= 128 bytes
  1552  		if ctr > 1 {
  1553  			// Set up the CTR
  1554  			p := s.Prog(ppc64.AMOVD)
  1555  			p.From.Type = obj.TYPE_CONST
  1556  			p.From.Offset = ctr
  1557  			p.To.Type = obj.TYPE_REG
  1558  			p.To.Reg = ppc64.REGTMP
  1559  
  1560  			p = s.Prog(ppc64.AMOVD)
  1561  			p.From.Type = obj.TYPE_REG
  1562  			p.From.Reg = ppc64.REGTMP
  1563  			p.To.Type = obj.TYPE_REG
  1564  			p.To.Reg = ppc64.REG_CTR
  1565  
  1566  			p = s.Prog(obj.APCALIGN)
  1567  			p.From.Type = obj.TYPE_CONST
  1568  			p.From.Offset = 16
  1569  
  1570  			// Generate 16 byte loads and stores.
  1571  			p = s.Prog(ppc64.ALXV)
  1572  			p.From.Type = obj.TYPE_MEM
  1573  			p.From.Reg = srcReg
  1574  			p.From.Offset = offset
  1575  			p.To.Type = obj.TYPE_REG
  1576  			p.To.Reg = ppc64.REG_VS32
  1577  			if top == nil {
  1578  				top = p
  1579  			}
  1580  			p = s.Prog(ppc64.ALXV)
  1581  			p.From.Type = obj.TYPE_MEM
  1582  			p.From.Reg = srcReg
  1583  			p.From.Offset = offset + 16
  1584  			p.To.Type = obj.TYPE_REG
  1585  			p.To.Reg = ppc64.REG_VS33
  1586  
  1587  			// generate 16 byte stores
  1588  			p = s.Prog(ppc64.ASTXV)
  1589  			p.From.Type = obj.TYPE_REG
  1590  			p.From.Reg = ppc64.REG_VS32
  1591  			p.To.Type = obj.TYPE_MEM
  1592  			p.To.Reg = dstReg
  1593  			p.To.Offset = offset
  1594  
  1595  			p = s.Prog(ppc64.ASTXV)
  1596  			p.From.Type = obj.TYPE_REG
  1597  			p.From.Reg = ppc64.REG_VS33
  1598  			p.To.Type = obj.TYPE_MEM
  1599  			p.To.Reg = dstReg
  1600  			p.To.Offset = offset + 16
  1601  
  1602  			// Generate 16 byte loads and stores.
  1603  			p = s.Prog(ppc64.ALXV)
  1604  			p.From.Type = obj.TYPE_MEM
  1605  			p.From.Reg = srcReg
  1606  			p.From.Offset = offset + 32
  1607  			p.To.Type = obj.TYPE_REG
  1608  			p.To.Reg = ppc64.REG_VS32
  1609  
  1610  			p = s.Prog(ppc64.ALXV)
  1611  			p.From.Type = obj.TYPE_MEM
  1612  			p.From.Reg = srcReg
  1613  			p.From.Offset = offset + 48
  1614  			p.To.Type = obj.TYPE_REG
  1615  			p.To.Reg = ppc64.REG_VS33
  1616  
  1617  			// generate 16 byte stores
  1618  			p = s.Prog(ppc64.ASTXV)
  1619  			p.From.Type = obj.TYPE_REG
  1620  			p.From.Reg = ppc64.REG_VS32
  1621  			p.To.Type = obj.TYPE_MEM
  1622  			p.To.Reg = dstReg
  1623  			p.To.Offset = offset + 32
  1624  
  1625  			p = s.Prog(ppc64.ASTXV)
  1626  			p.From.Type = obj.TYPE_REG
  1627  			p.From.Reg = ppc64.REG_VS33
  1628  			p.To.Type = obj.TYPE_MEM
  1629  			p.To.Reg = dstReg
  1630  			p.To.Offset = offset + 48
  1631  
  1632  			// increment the src reg for next iteration
  1633  			p = s.Prog(ppc64.AADD)
  1634  			p.Reg = srcReg
  1635  			p.From.Type = obj.TYPE_CONST
  1636  			p.From.Offset = bytesPerLoop
  1637  			p.To.Type = obj.TYPE_REG
  1638  			p.To.Reg = srcReg
  1639  
  1640  			// increment the dst reg for next iteration
  1641  			p = s.Prog(ppc64.AADD)
  1642  			p.Reg = dstReg
  1643  			p.From.Type = obj.TYPE_CONST
  1644  			p.From.Offset = bytesPerLoop
  1645  			p.To.Type = obj.TYPE_REG
  1646  			p.To.Reg = dstReg
  1647  
  1648  			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
  1649  			// to loop top.
  1650  			p = s.Prog(ppc64.ABC)
  1651  			p.From.Type = obj.TYPE_CONST
  1652  			p.From.Offset = ppc64.BO_BCTR
  1653  			p.Reg = ppc64.REG_R0
  1654  			p.To.Type = obj.TYPE_BRANCH
  1655  			gc.Patch(p, top)
  1656  
  1657  			// srcReg and dstReg were incremented in the loop, so
  1658  			// later instructions start with offset 0.
  1659  			offset = int64(0)
  1660  		}
  1661  
  1662  		// If ctr == 1, no loop was generated above, so add bytesPerLoop
  1663  		// back into the remainder and move those bytes inline below.
  1664  		if ctr == 1 {
  1665  			rem += bytesPerLoop
  1666  		}
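        		// E.g. a 64-byte move: ctr == 1 and rem == 0, so no loop is
        		// emitted; rem becomes 64 and is handled entirely by the
        		// straight-line 32- and 16-byte cases below.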
  1667  		if rem >= 32 {
  1668  			p := s.Prog(ppc64.ALXV)
  1669  			p.From.Type = obj.TYPE_MEM
  1670  			p.From.Reg = srcReg
  1671  			p.To.Type = obj.TYPE_REG
  1672  			p.To.Reg = ppc64.REG_VS32
  1673  
  1674  			p = s.Prog(ppc64.ALXV)
  1675  			p.From.Type = obj.TYPE_MEM
  1676  			p.From.Reg = srcReg
  1677  			p.From.Offset = 16
  1678  			p.To.Type = obj.TYPE_REG
  1679  			p.To.Reg = ppc64.REG_VS33
  1680  
  1681  			p = s.Prog(ppc64.ASTXV)
  1682  			p.From.Type = obj.TYPE_REG
  1683  			p.From.Reg = ppc64.REG_VS32
  1684  			p.To.Type = obj.TYPE_MEM
  1685  			p.To.Reg = dstReg
  1686  
  1687  			p = s.Prog(ppc64.ASTXV)
  1688  			p.From.Type = obj.TYPE_REG
  1689  			p.From.Reg = ppc64.REG_VS33
  1690  			p.To.Type = obj.TYPE_MEM
  1691  			p.To.Reg = dstReg
  1692  			p.To.Offset = 16
  1693  
  1694  			offset = 32
  1695  			rem -= 32
  1696  		}
  1697  
  1698  		if rem >= 16 {
  1699  			// Generate 16 byte loads and stores.
  1700  			p := s.Prog(ppc64.ALXV)
  1701  			p.From.Type = obj.TYPE_MEM
  1702  			p.From.Reg = srcReg
  1703  			p.From.Offset = offset
  1704  			p.To.Type = obj.TYPE_REG
  1705  			p.To.Reg = ppc64.REG_VS32
  1706  
  1707  			p = s.Prog(ppc64.ASTXV)
  1708  			p.From.Type = obj.TYPE_REG
  1709  			p.From.Reg = ppc64.REG_VS32
  1710  			p.To.Type = obj.TYPE_MEM
  1711  			p.To.Reg = dstReg
  1712  			p.To.Offset = offset
  1713  
  1714  			offset += 16
  1715  			rem -= 16
  1716  
  1717  			if rem >= 16 {
  1718  				p := s.Prog(ppc64.ALXV)
  1719  				p.From.Type = obj.TYPE_MEM
  1720  				p.From.Reg = srcReg
  1721  				p.From.Offset = offset
  1722  				p.To.Type = obj.TYPE_REG
  1723  				p.To.Reg = ppc64.REG_VS32
  1724  
  1725  				p = s.Prog(ppc64.ASTXV)
  1726  				p.From.Type = obj.TYPE_REG
  1727  				p.From.Reg = ppc64.REG_VS32
  1728  				p.To.Type = obj.TYPE_MEM
  1729  				p.To.Reg = dstReg
  1730  				p.To.Offset = offset
  1731  
  1732  				offset += 16
  1733  				rem -= 16
  1734  			}
  1735  		}
  1736  		// Generate all the remaining load and store pairs, starting with
  1737  		// as many 8 byte moves as possible, then 4, 2, 1.
  1738  		for rem > 0 {
  1739  			op, size := ppc64.AMOVB, int64(1)
  1740  			switch {
  1741  			case rem >= 8:
  1742  				op, size = ppc64.AMOVD, 8
  1743  			case rem >= 4:
  1744  				op, size = ppc64.AMOVW, 4
  1745  			case rem >= 2:
  1746  				op, size = ppc64.AMOVH, 2
  1747  			}
  1748  			// Load
  1749  			p := s.Prog(op)
  1750  			p.To.Type = obj.TYPE_REG
  1751  			p.To.Reg = ppc64.REGTMP
  1752  			p.From.Type = obj.TYPE_MEM
  1753  			p.From.Reg = srcReg
  1754  			p.From.Offset = offset
  1755  
  1756  			// Store
  1757  			p = s.Prog(op)
  1758  			p.From.Type = obj.TYPE_REG
  1759  			p.From.Reg = ppc64.REGTMP
  1760  			p.To.Type = obj.TYPE_MEM
  1761  			p.To.Reg = dstReg
  1762  			p.To.Offset = offset
  1763  			rem -= size
  1764  			offset += size
  1765  		}
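        		// A 7-byte tail, for instance, emits three load/store pairs
        		// through REGTMP: MOVW (4 bytes), then MOVH (2), then MOVB (1).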
  1766  
  1767  	case ssa.OpPPC64CALLstatic:
  1768  		s.Call(v)
  1769  
  1770  	case ssa.OpPPC64CALLclosure, ssa.OpPPC64CALLinter:
  1771  		p := s.Prog(ppc64.AMOVD)
  1772  		p.From.Type = obj.TYPE_REG
  1773  		p.From.Reg = v.Args[0].Reg()
  1774  		p.To.Type = obj.TYPE_REG
  1775  		p.To.Reg = ppc64.REG_LR
  1776  
  1777  		if v.Args[0].Reg() != ppc64.REG_R12 {
  1778  			v.Fatalf("function address for %v should be in R12 (%d), but is in %d", v.LongString(), ppc64.REG_R12, p.From.Reg)
  1779  		}
  1780  
  1781  		pp := s.Call(v)
  1782  		pp.To.Reg = ppc64.REG_LR
  1783  
  1784  		// Insert a hint that this is not a subroutine return.
  1785  		pp.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: 1})
  1786  
  1787  		if gc.Ctxt.Flag_shared {
  1788  			// When compiling position-independent Go code, the
  1789  			// function we just called through a pointer may be
  1790  			// implemented in a separate module and may therefore
  1791  			// have overwritten the TOC pointer in R2; reload it.
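        			// (24(SP) is the TOC save slot in the ELFv2 ABI
        			// stack frame layout.)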
  1792  			q := s.Prog(ppc64.AMOVD)
  1793  			q.From.Type = obj.TYPE_MEM
  1794  			q.From.Offset = 24
  1795  			q.From.Reg = ppc64.REGSP
  1796  			q.To.Type = obj.TYPE_REG
  1797  			q.To.Reg = ppc64.REG_R2
  1798  		}
  1799  
  1800  	case ssa.OpPPC64LoweredWB:
  1801  		p := s.Prog(obj.ACALL)
  1802  		p.To.Type = obj.TYPE_MEM
  1803  		p.To.Name = obj.NAME_EXTERN
  1804  		p.To.Sym = v.Aux.(*obj.LSym)
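        		// v.Aux carries the runtime write-barrier routine chosen
        		// during SSA lowering (e.g. runtime.gcWriteBarrier).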
  1805  
  1806  	case ssa.OpPPC64LoweredPanicBoundsA, ssa.OpPPC64LoweredPanicBoundsB, ssa.OpPPC64LoweredPanicBoundsC:
  1807  		p := s.Prog(obj.ACALL)
  1808  		p.To.Type = obj.TYPE_MEM
  1809  		p.To.Name = obj.NAME_EXTERN
  1810  		p.To.Sym = gc.BoundsCheckFunc[v.AuxInt]
  1811  		s.UseArgs(16) // space used in callee args area by assembly stubs
  1812  
  1813  	case ssa.OpPPC64LoweredNilCheck:
  1814  		if objabi.GOOS == "aix" {
  1815  			// CMP Rarg0, R0
  1816  			// BNE 2(PC)
  1817  			// STW R0, 0(R0)
  1818  			// NOP (so the BNE has somewhere to land)
  1819  
  1820  			// CMP Rarg0, R0
  1821  			p := s.Prog(ppc64.ACMP)
  1822  			p.From.Type = obj.TYPE_REG
  1823  			p.From.Reg = v.Args[0].Reg()
  1824  			p.To.Type = obj.TYPE_REG
  1825  			p.To.Reg = ppc64.REG_R0
  1826  
  1827  			// BNE 2(PC)
  1828  			p2 := s.Prog(ppc64.ABNE)
  1829  			p2.To.Type = obj.TYPE_BRANCH
  1830  
  1831  			// STW R0, 0(R0)
  1832  			// Write at 0 is forbidden and will trigger a SIGSEGV
  1833  			p = s.Prog(ppc64.AMOVW)
  1834  			p.From.Type = obj.TYPE_REG
  1835  			p.From.Reg = ppc64.REG_R0
  1836  			p.To.Type = obj.TYPE_MEM
  1837  			p.To.Reg = ppc64.REG_R0
  1838  
  1839  			// NOP (so the BNE has somewhere to land)
  1840  			nop := s.Prog(obj.ANOP)
  1841  			gc.Patch(p2, nop)
  1842  
  1843  		} else {
  1844  			// Issue a load which will fault if arg is nil.
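        			// The loaded byte lands in REGTMP and is discarded;
        			// only the potential fault matters.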
  1845  			p := s.Prog(ppc64.AMOVBZ)
  1846  			p.From.Type = obj.TYPE_MEM
  1847  			p.From.Reg = v.Args[0].Reg()
  1848  			gc.AddAux(&p.From, v)
  1849  			p.To.Type = obj.TYPE_REG
  1850  			p.To.Reg = ppc64.REGTMP
  1851  		}
  1852  		if logopt.Enabled() {
  1853  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1854  		}
  1855  		if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1856  			gc.Warnl(v.Pos, "generated nil check")
  1857  		}
  1858  
  1859  	// These should be resolved by rewrite rules and should never reach codegen.
  1860  	case ssa.OpPPC64Equal, ssa.OpPPC64NotEqual, ssa.OpPPC64LessThan, ssa.OpPPC64FLessThan,
  1861  		ssa.OpPPC64LessEqual, ssa.OpPPC64GreaterThan, ssa.OpPPC64FGreaterThan, ssa.OpPPC64GreaterEqual,
  1862  		ssa.OpPPC64FLessEqual, ssa.OpPPC64FGreaterEqual:
  1863  		v.Fatalf("Pseudo-op should not make it to codegen: %s", v.LongString())
  1864  	case ssa.OpPPC64InvertFlags:
  1865  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1866  	case ssa.OpPPC64FlagEQ, ssa.OpPPC64FlagLT, ssa.OpPPC64FlagGT:
  1867  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1868  	case ssa.OpClobber:
  1869  		// TODO: implement for clobberdead experiment. Nop is ok for now.
  1870  	default:
  1871  		v.Fatalf("genValue not implemented: %s", v.LongString())
  1872  	}
  1873  }
  1874  
  1875  var blockJump = [...]struct {
  1876  	asm, invasm     obj.As
  1877  	asmeq, invasmun bool
  1878  }{
  1879  	ssa.BlockPPC64EQ: {ppc64.ABEQ, ppc64.ABNE, false, false},
  1880  	ssa.BlockPPC64NE: {ppc64.ABNE, ppc64.ABEQ, false, false},
  1881  
  1882  	ssa.BlockPPC64LT: {ppc64.ABLT, ppc64.ABGE, false, false},
  1883  	ssa.BlockPPC64GE: {ppc64.ABGE, ppc64.ABLT, false, false},
  1884  	ssa.BlockPPC64LE: {ppc64.ABLE, ppc64.ABGT, false, false},
  1885  	ssa.BlockPPC64GT: {ppc64.ABGT, ppc64.ABLE, false, false},
  1886  
  1887  	// TODO: need to work FP comparisons into block jumps
  1888  	ssa.BlockPPC64FLT: {ppc64.ABLT, ppc64.ABGE, false, false},
  1889  	ssa.BlockPPC64FGE: {ppc64.ABGT, ppc64.ABLT, true, true}, // GE = GT or EQ; !GE = LT or UN
  1890  	ssa.BlockPPC64FLE: {ppc64.ABLT, ppc64.ABGT, true, true}, // LE = LT or EQ; !LE = GT or UN
  1891  	ssa.BlockPPC64FGT: {ppc64.ABGT, ppc64.ABLE, false, false},
  1892  }
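        // For example, BlockPPC64FLE sets asmeq and invasmun: branching to the
        // likely successor emits "BLT succ0; BEQ succ0" (LE is LT or EQ), while
        // the inverted form emits "BGT succ1; BVS succ1" (!LE is GT or unordered).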
  1893  
  1894  func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
  1895  	switch b.Kind {
  1896  	case ssa.BlockDefer:
  1897  		// defer returns in R3:
  1898  		// 0 if we should continue executing
  1899  		// 1 if we should jump to deferreturn call
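        		// R0 is maintained as zero in the Go ppc64 ABI, so the CMP
        		// below tests whether R3 is nonzero.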
  1900  		p := s.Prog(ppc64.ACMP)
  1901  		p.From.Type = obj.TYPE_REG
  1902  		p.From.Reg = ppc64.REG_R3
  1903  		p.To.Type = obj.TYPE_REG
  1904  		p.To.Reg = ppc64.REG_R0
  1905  
  1906  		p = s.Prog(ppc64.ABNE)
  1907  		p.To.Type = obj.TYPE_BRANCH
  1908  		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
  1909  		if b.Succs[0].Block() != next {
  1910  			p := s.Prog(obj.AJMP)
  1911  			p.To.Type = obj.TYPE_BRANCH
  1912  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1913  		}
  1914  
  1915  	case ssa.BlockPlain:
  1916  		if b.Succs[0].Block() != next {
  1917  			p := s.Prog(obj.AJMP)
  1918  			p.To.Type = obj.TYPE_BRANCH
  1919  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1920  		}
  1921  	case ssa.BlockExit:
  1922  	case ssa.BlockRet:
  1923  		s.Prog(obj.ARET)
  1924  	case ssa.BlockRetJmp:
  1925  		p := s.Prog(obj.AJMP)
  1926  		p.To.Type = obj.TYPE_MEM
  1927  		p.To.Name = obj.NAME_EXTERN
  1928  		p.To.Sym = b.Aux.(*obj.LSym)
  1929  
  1930  	case ssa.BlockPPC64EQ, ssa.BlockPPC64NE,
  1931  		ssa.BlockPPC64LT, ssa.BlockPPC64GE,
  1932  		ssa.BlockPPC64LE, ssa.BlockPPC64GT,
  1933  		ssa.BlockPPC64FLT, ssa.BlockPPC64FGE,
  1934  		ssa.BlockPPC64FLE, ssa.BlockPPC64FGT:
  1935  		jmp := blockJump[b.Kind]
  1936  		switch next {
  1937  		case b.Succs[0].Block():
  1938  			s.Br(jmp.invasm, b.Succs[1].Block())
  1939  			if jmp.invasmun {
  1940  				// TODO: The second branch is probably predict-not-taken since it is for FP unordered
  1941  				s.Br(ppc64.ABVS, b.Succs[1].Block())
  1942  			}
  1943  		case b.Succs[1].Block():
  1944  			s.Br(jmp.asm, b.Succs[0].Block())
  1945  			if jmp.asmeq {
  1946  				s.Br(ppc64.ABEQ, b.Succs[0].Block())
  1947  			}
  1948  		default:
  1949  			if b.Likely != ssa.BranchUnlikely {
  1950  				s.Br(jmp.asm, b.Succs[0].Block())
  1951  				if jmp.asmeq {
  1952  					s.Br(ppc64.ABEQ, b.Succs[0].Block())
  1953  				}
  1954  				s.Br(obj.AJMP, b.Succs[1].Block())
  1955  			} else {
  1956  				s.Br(jmp.invasm, b.Succs[1].Block())
  1957  				if jmp.invasmun {
  1958  					// TODO: The second branch is probably predict-not-taken since it is for FP unordered
  1959  					s.Br(ppc64.ABVS, b.Succs[1].Block())
  1960  				}
  1961  				s.Br(obj.AJMP, b.Succs[0].Block())
  1962  			}
  1963  		}
  1964  	default:
  1965  		b.Fatalf("branch not implemented: %s", b.LongString())
  1966  	}
  1967  }
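        // As a concrete illustration of the conditional cases above,
        // BlockPPC64LT (asm BLT, invasm BGE) lowers to:
        //	BGE succ1              when succ0 is the next block
        //	BLT succ0              when succ1 is the next block
        //	BLT succ0; JMP succ1   when neither is next and the branch is likely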
  1968  
