1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This file provides fast assembly versions for the elementary
6 // arithmetic operations on vectors implemented in arith.go.
7
8 // TODO(gri) - experiment with unrolled loops for faster execution
9
// func mulWW(x, y Word) (z1, z0 Word)
// Full-width unsigned multiply: the 128-bit product x*y is returned
// as its high word z1 and its low word z0.
TEXT ·mulWW(SB),7,$0
	MOVQ y+8(FP), AX
	MULQ x+0(FP)		// DX:AX = x * y
	MOVQ AX, z0+24(FP)	// low word
	MOVQ DX, z1+16(FP)	// high word
	RET
17
18
// func divWW(x1, x0, y Word) (q, r Word)
// Unsigned division of the two-word value x1:x0 by y, returning the
// quotient q and the remainder r. DIVQ raises a hardware divide error
// if y is zero or the quotient does not fit in a single word.
TEXT ·divWW(SB),7,$0
	MOVQ x0+8(FP), AX	// low word of the dividend
	MOVQ x1+0(FP), DX	// high word of the dividend
	DIVQ y+16(FP)		// AX = quotient, DX = remainder
	MOVQ DX, r+32(FP)
	MOVQ AX, q+24(FP)
	RET
27
28
// func addVV(z, x, y []Word) (c Word)
// Word-wise addition with carry propagation: for i = 0 .. n-1,
// z[i] = x[i] + y[i] + carry. The final carry (0 or 1) is returned in c.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
//
// Register use: R8 = x, R9 = y, R10 = z, CX = n, SI = i, DX = carry.
TEXT ·addVV(SB),7,$0
	MOVL n+8(FP), CX
	MOVQ y+32(FP), R9
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	XORQ DX, DX		// c = 0
	XORQ SI, SI		// i = 0
	CMPQ SI, CX
	JGE addVVdone		// !(i < n): nothing to add

addVVloop:
	MOVQ (R8)(SI*8), AX
	// The carry lives in bit 0 of DX between iterations because the
	// loop bookkeeping below clobbers CF. RCRQ moves it into CF (the
	// stale CF lands in bit 63 of DX and is shifted back out by RCLQ).
	RCRQ $1, DX
	ADCQ (R9)(SI*8), AX
	RCLQ $1, DX		// save the new carry in bit 0 of DX
	MOVQ AX, (R10)(SI*8)
	ADDL $1, SI		// i++
	CMPQ SI, CX
	JL addVVloop		// i < n

addVVdone:
	MOVQ DX, c+48(FP)
	RET
51
52
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
// Word-wise subtraction with borrow propagation: for i = 0 .. n-1,
// z[i] = x[i] - y[i] - borrow. The final borrow (0 or 1) is returned in c.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
//
// Register use: R8 = x, R9 = y, R10 = z, CX = n, SI = i, DX = borrow.
TEXT ·subVV(SB),7,$0
	MOVL n+8(FP), CX
	MOVQ y+32(FP), R9
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	XORQ DX, DX		// c = 0
	XORQ SI, SI		// i = 0
	CMPQ SI, CX
	JGE subVVdone		// !(i < n): nothing to subtract

subVVloop:
	MOVQ (R8)(SI*8), AX
	// The borrow lives in bit 0 of DX between iterations because the
	// loop bookkeeping below clobbers CF. RCRQ moves it into CF (the
	// stale CF lands in bit 63 of DX and is shifted back out by RCLQ).
	RCRQ $1, DX
	SBBQ (R9)(SI*8), AX
	RCLQ $1, DX		// save the new borrow in bit 0 of DX
	MOVQ AX, (R10)(SI*8)
	ADDL $1, SI		// i++
	CMPQ SI, CX
	JL subVVloop		// i < n

subVVdone:
	MOVQ DX, c+48(FP)
	RET
76
77
// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y to the vector x: starting with c = y, each
// iteration stores z[i] = x[i] + c and replaces c with the carry out of
// that addition. The final c is returned.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
//
// Register use: R8 = x, R10 = z, CX = n, SI = i, AX = c.
TEXT ·addVW(SB),7,$0
	MOVL n+8(FP), CX
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	MOVQ y+32(FP), AX	// c = y
	XORQ SI, SI		// i = 0
	CMPQ SI, CX
	JGE addVWdone		// !(i < n): c = y is returned unchanged

addVWloop:
	ADDQ (R8)(SI*8), AX	// AX = x[i] + c, CF = carry out
	MOVQ AX, (R10)(SI*8)	// z[i] = x[i] + c
	RCLQ $1, AX		// rotate CF into bit 0 ...
	ANDQ $1, AX		// ... and keep only that bit: c = carry
	ADDL $1, SI		// i++
	CMPQ SI, CX
	JL addVWloop		// i < n

addVWdone:
	MOVQ AX, c+40(FP)
	RET
98
99
// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from the vector x: starting with c = y,
// each iteration stores z[i] = x[i] - c and replaces c with the borrow
// out of that subtraction. The final c is returned.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
//
// Register use: R8 = x, R10 = z, CX = n, SI = i, AX = c, DX = scratch.
TEXT ·subVW(SB),7,$0
	MOVL n+8(FP), CX
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	MOVQ y+32(FP), AX	// c = y
	XORQ SI, SI		// i = 0
	CMPQ SI, CX
	JGE subVWdone		// !(i < n): c = y is returned unchanged

subVWloop:
	MOVQ (R8)(SI*8), DX	// TODO(gri) is there a reverse SUBQ?
	SUBQ AX, DX		// DX = x[i] - c, CF = borrow out
	MOVQ DX, (R10)(SI*8)	// z[i] = x[i] - c
	RCLQ $1, AX		// rotate CF into bit 0 ...
	ANDQ $1, AX		// ... and keep only that bit: c = borrow
	ADDL $1, SI		// i++
	CMPQ SI, CX
	JL subVWloop		// i < n

subVWdone:
	MOVQ AX, c+40(FP)
	RET
121
122
// func shlVU(z, x []Word, s uint) (c Word)
// Shifts the vector x left by s bits into z, walking from the most
// significant word down so bits move from x[i-1] into z[i]. The bits
// shifted out of x[n-1] are returned in c; c is 0 when n <= 0.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
// In the comments below ŝ denotes 64 - s, the complementary shift count.
//
// Register use: R8 = x, R10 = z, CX = s, SI = i, AX = w1, DX = w.
TEXT ·shlVU(SB),7,$0
	MOVL n+8(FP), SI	// i = n
	SUBL $1, SI		// i--
	JL shlVUempty		// i < 0 (n <= 0)

	// n > 0
	MOVL s+32(FP), CX
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	MOVQ (R8)(SI*8), AX	// w1 = x[n-1]
	XORQ DX, DX
	SHLQ CX, DX:AX		// DX = w1>>ŝ (top s bits of x[n-1])
	MOVQ DX, c+40(FP)

	TESTL SI, SI
	JLE shlVUlast		// i <= 0: only z[0] is left

	// i > 0
shlVUloop:
	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(SI*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(SI*8)	// z[i] = w<<s | w1>>ŝ
	SUBL $1, SI		// i--
	JG shlVUloop		// i > 0

	// i <= 0
shlVUlast:
	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

shlVUempty:
	MOVQ $0, c+40(FP)
	RET
156
157
// func shrVU(z, x []Word, s uint) (c Word)
// Shifts the vector x right by s bits into z, walking from the least
// significant word up so bits move from x[i+1] into z[i]. The bits
// shifted out of x[0] are returned in c; c is 0 when n <= 0.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
// In the comments below ŝ denotes 64 - s, the complementary shift count.
//
// Register use: R8 = x, R10 = z, CX = s, SI = i, R11 = n-1,
// AX = w1, DX = w.
TEXT ·shrVU(SB),7,$0
	MOVL n+8(FP), R11
	SUBL $1, R11		// R11 = n-1
	JL shrVUempty		// n-1 < 0 (n <= 0)

	// n > 0
	MOVL s+32(FP), CX
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	MOVQ (R8), AX		// w1 = x[0]
	XORQ DX, DX
	SHRQ CX, DX:AX		// DX = w1<<ŝ (low s bits of x[0])
	MOVQ DX, c+40(FP)

	XORQ SI, SI		// i = 0
	CMPQ SI, R11
	JGE shrVUlast		// !(i < n-1): only z[n-1] is left

	// i < n-1
shrVUloop:
	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(SI*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(SI*8)	// z[i] = w>>s | w1<<ŝ
	ADDL $1, SI		// i++
	CMPQ SI, R11
	JL shrVUloop		// i < n-1

	// i >= n-1
shrVUlast:
	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

shrVUempty:
	MOVQ $0, c+40(FP)
	RET
193
194
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// Multiplies the vector x by the word y and adds r: starting with c = r,
// each iteration forms the two-word value x[i]*y + c, stores its low
// word in z[i] and carries its high word into the next iteration as c.
// The final c is returned.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
//
// Register use: R8 = x, R9 = y, R10 = z, R11 = n, SI = i, CX = c.
TEXT ·mulAddVWW(SB),7,$0
	MOVL n+8(FP), R11
	MOVQ r+40(FP), CX	// c = r
	MOVQ y+32(FP), R9
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	XORQ SI, SI		// i = 0
	CMPQ SI, R11
	JGE mulAddVWWdone	// !(i < n): c = r is returned unchanged

mulAddVWWloop:
	MOVQ (R8)(SI*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add the incoming carry word ...
	ADCQ $0, DX		// ... propagating into the high word
	MOVQ AX, (R10)(SI*8)	// z[i] = low word
	MOVQ DX, CX		// c = high word
	ADDL $1, SI		// i++
	CMPQ SI, R11
	JL mulAddVWWloop	// i < n

mulAddVWWdone:
	MOVQ CX, c+48(FP)
	RET
218
219
// func addMulVVW(z, x []Word, y Word) (c Word)
// Multiply-accumulate: starting with c = 0, each iteration adds the low
// word of x[i]*y + c to z[i] in place; the high word, plus the carry out
// of the addition into z[i], becomes c for the next iteration.
// The final c is returned.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
//
// Register use: R8 = x, R9 = y, R10 = z, R11 = n, SI = i, CX = c.
TEXT ·addMulVVW(SB),7,$0
	MOVL n+8(FP), R11
	MOVQ y+32(FP), R9
	MOVQ x+16(FP), R8
	MOVQ z+0(FP), R10
	XORQ CX, CX		// c = 0
	XORQ SI, SI		// i = 0
	CMPQ SI, R11
	JGE addMulVVWdone	// !(i < n): c = 0 is returned

addMulVVWloop:
	MOVQ (R8)(SI*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add the incoming carry word ...
	ADCQ $0, DX		// ... propagating into the high word
	ADDQ AX, (R10)(SI*8)	// z[i] += low word
	ADCQ $0, DX		// carry out of z[i] goes to the high word
	MOVQ DX, CX		// c = high word
	ADDL $1, SI		// i++
	CMPQ SI, R11
	JL addMulVVWloop	// i < n

addMulVVWdone:
	MOVQ CX, c+40(FP)
	RET
244
245
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// Divides the vector x by the word y, from the most significant word
// down: starting with r = xn, each step for i = n-1 .. 0 divides the
// two-word value r:x[i] by y, stores the quotient in z[i] and keeps the
// remainder in r. The final remainder is returned.
// n is the 32-bit value at offset 8 of z's slice header (presumably len(z)).
// DIVQ raises a hardware divide error if y is zero or a quotient does
// not fit in a single word.
//
// Register use: R8 = x, R9 = y, R10 = z, SI = i, DX = r, AX = scratch.
TEXT ·divWVW(SB),7,$0
	MOVL n+8(FP), SI	// i = n
	MOVQ y+40(FP), R9
	MOVQ x+24(FP), R8
	MOVQ xn+16(FP), DX	// r = xn
	MOVQ z+0(FP), R10
	SUBL $1, SI		// i--
	JL divWVWdone		// i < 0: r = xn is returned unchanged

divWVWloop:
	MOVQ (R8)(SI*8), AX	// dividend is DX:AX = r:x[i]
	DIVQ R9			// AX = quotient, DX = remainder
	MOVQ AX, (R10)(SI*8)	// z[i] = quotient
	SUBL $1, SI		// i--
	JGE divWVWloop		// i >= 0

divWVWdone:
	MOVQ DX, r+48(FP)
	RET