linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/xor.h @ 11221:7c9d7fc3dce5 (ia64/xen-unstable)

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;

/* Doesn't use gcc to save the XMM registers, because there is no easy way to
   tell it to do a clts before the register saving.  If the FPU is not in use
   by the current task (TS_USEDFPU clear), CR0.TS may be set, so clts() clears
   it before %xmm0-3 are touched; XMMS_RESTORE sets it back with stts().  Note
   that the cr0 operand below is not referenced by the asm template; it is
   retained from the native (non-Xen) version, which reads %cr0 and performs
   the clts inside the asm itself. */
#define XMMS_SAVE do {                                          \
        preempt_disable();                                      \
        if (!(current_thread_info()->status & TS_USEDFPU))      \
                clts();                                         \
        __asm__ __volatile__ (                                  \
                "movups %%xmm0,(%1)     ;\n\t"                  \
                "movups %%xmm1,0x10(%1) ;\n\t"                  \
                "movups %%xmm2,0x20(%1) ;\n\t"                  \
                "movups %%xmm3,0x30(%1) ;\n\t"                  \
                : "=&r" (cr0)                                   \
                : "r" (xmm_save)                                \
                : "memory");                                    \
} while(0)

#define XMMS_RESTORE do {                                       \
        asm volatile (                                          \
                "sfence                 ;\n\t"                  \
                "movups (%1),%%xmm0     ;\n\t"                  \
                "movups 0x10(%1),%%xmm1 ;\n\t"                  \
                "movups 0x20(%1),%%xmm2 ;\n\t"                  \
                "movups 0x30(%1),%%xmm3 ;\n\t"                  \
                :                                               \
                : "r" (cr0), "r" (xmm_save)                     \
                : "memory");                                    \
        if (!(current_thread_info()->status & TS_USEDFPU))      \
                stts();                                         \
        preempt_enable();                                       \
} while(0)

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x,y)         " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x,y)         " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)          " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)          " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)          " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)          " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x)          " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x,y)        " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x,y)        " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x,y)        " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x,y)        " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x,y)        " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"

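/*
 * The macros above rely on string-literal concatenation: OFFS(x) and
 * PF_OFFS(x) paste an offset expression into the instruction text, so e.g.
 * XO1(3,2) becomes " xorps 16*(3)(%[p2]), %%xmm2 ;\n" and PF0(4) prefetches
 * 256+16*4 bytes ahead of the current position in p1.  As a rough scalar
 * sketch (illustrative only, not compiled), one LD/XO1/ST group is
 * equivalent to XORing a single 16-byte chunk of p2 into p1:
 */
#if 0
static inline void xor_one_chunk_2(unsigned long *p1, const unsigned long *p2)
{
        /* one 16-byte group == two unsigned longs on x86-64 */
        p1[0] ^= p2[0];
        p1[1] ^= p2[1];
}
#endif
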
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;
        unsigned long cr0;
        xmm_store_t xmm_save[4];

        XMMS_SAVE;

        asm volatile (
#undef BLOCK
#define BLOCK(i)        \
        LD(i,0)         \
        LD(i+1,1)       \
        PF1(i)          \
        PF1(i+2)        \
        LD(i+2,2)       \
        LD(i+3,3)       \
        PF0(i+4)        \
        PF0(i+6)        \
        XO1(i,0)        \
        XO1(i+1,1)      \
        XO1(i+2,2)      \
        XO1(i+3,3)      \
        ST(i,0)         \
        ST(i+1,1)       \
        ST(i+2,2)       \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

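/*
 * For reference, the routine above behaves like the plain C loop below
 * (illustrative sketch only, not compiled): each iteration of the asm loop
 * consumes one 256-byte "line" of both buffers, so when bytes is a multiple
 * of 256 the net effect is p1[i] ^= p2[i] for every word.  Since
 * lines = bytes >> 8 and the loop is "decl ; jnz", callers are expected to
 * pass a length that is a non-zero multiple of 256 bytes.
 */
#if 0
static void xor_scalar_2(unsigned long bytes, unsigned long *p1,
                         unsigned long *p2)
{
        unsigned long i, words = bytes / sizeof(unsigned long);

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i];
}
#endif
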
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i)        \
        PF1(i)          \
        PF1(i+2)        \
        LD(i,0)         \
        LD(i+1,1)       \
        LD(i+2,2)       \
        LD(i+3,3)       \
        PF2(i)          \
        PF2(i+2)        \
        PF0(i+4)        \
        PF0(i+6)        \
        XO1(i,0)        \
        XO1(i+1,1)      \
        XO1(i+2,2)      \
        XO1(i+3,3)      \
        XO2(i,0)        \
        XO2(i+1,1)      \
        XO2(i+2,2)      \
        XO2(i+3,3)      \
        ST(i,0)         \
        ST(i+1,1)       \
        ST(i+2,2)       \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i)        \
        PF1(i)          \
        PF1(i+2)        \
        LD(i,0)         \
        LD(i+1,1)       \
        LD(i+2,2)       \
        LD(i+3,3)       \
        PF2(i)          \
        PF2(i+2)        \
        XO1(i,0)        \
        XO1(i+1,1)      \
        XO1(i+2,2)      \
        XO1(i+3,3)      \
        PF3(i)          \
        PF3(i+2)        \
        PF0(i+4)        \
        PF0(i+6)        \
        XO2(i,0)        \
        XO2(i+1,1)      \
        XO2(i+2,2)      \
        XO2(i+3,3)      \
        XO3(i,0)        \
        XO3(i+1,1)      \
        XO3(i+2,2)      \
        XO3(i+3,3)      \
        ST(i,0)         \
        ST(i+1,1)       \
        ST(i+2,2)       \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " addq %[inc], %[p4] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i)        \
        PF1(i)          \
        PF1(i+2)        \
        LD(i,0)         \
        LD(i+1,1)       \
        LD(i+2,2)       \
        LD(i+3,3)       \
        PF2(i)          \
        PF2(i+2)        \
        XO1(i,0)        \
        XO1(i+1,1)      \
        XO1(i+2,2)      \
        XO1(i+3,3)      \
        PF3(i)          \
        PF3(i+2)        \
        XO2(i,0)        \
        XO2(i+1,1)      \
        XO2(i+2,2)      \
        XO2(i+3,3)      \
        PF4(i)          \
        PF4(i+2)        \
        PF0(i+4)        \
        PF0(i+6)        \
        XO3(i,0)        \
        XO3(i+1,1)      \
        XO3(i+2,2)      \
        XO3(i+3,3)      \
        XO4(i,0)        \
        XO4(i+1,1)      \
        XO4(i+2,2)      \
        XO4(i+3,3)      \
        ST(i,0)         \
        ST(i+1,1)       \
        ST(i+2,2)       \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " addq %[inc], %[p4] ;\n"
        " addq %[inc], %[p5] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
        .name = "generic_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

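/*
 * Sketch of how a caller dispatches through the template above (illustrative
 * only, not compiled; the real callers live in the generic xor code, not in
 * this header).  Each do_N hook XORs sources 2..N into the first buffer,
 * e.g. do_3 computes p1 ^= p2 ^ p3:
 */
#if 0
static void xor_dispatch_example(unsigned long bytes, unsigned long *dest,
                                 unsigned long *src1, unsigned long *src2)
{
        xor_block_sse.do_3(bytes, dest, src1, src2);
}
#endif
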
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
        do {                                    \
                xor_speed(&xor_block_sse);      \
        } while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
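
/*
 * Rough sketch of how the two macros above are meant to be consumed
 * (illustrative only, not compiled; the structure of the consuming code is
 * an assumption, as the real calibration logic lives in the generic xor
 * code): XOR_TRY_TEMPLATES benchmarks each candidate via xor_speed(), and
 * XOR_SELECT_TEMPLATE then overrides whatever was measured -- in this file
 * it always returns &xor_block_sse, for the reason given in the comment
 * above.
 */
#if 0
static struct xor_block_template *pick_xor_template(void)
{
        struct xor_block_template *fastest = NULL;

        XOR_TRY_TEMPLATES;                   /* here: xor_speed(&xor_block_sse) */
        /* ...generic code would record the fastest benchmarked template... */
        return XOR_SELECT_TEMPLATE(fastest); /* here: always &xor_block_sse */
}
#endif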