direct-io.hg: linux-2.4.30-xen-sparse/include/asm-xen/xor.h @ 5517:10e9028c8e3d

bitkeeper revision 1.1718.1.10 (42b7b19aqOS_1M8I4pIOFjiTPYWV-g)

Merge bk://xenbits.xensource.com/xen-unstable.bk
into spot.cl.cam.ac.uk:C:/Documents and Settings/iap10/xen-unstable.bk

author  iap10@spot.cl.cam.ac.uk
date    Tue Jun 21 06:20:10 2005 +0000 (2005-06-21)
/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
/*
 * Kernel use of MMX clobbers the task's x87/MMX state: save it with
 * fsave (108-byte save area) and restore it afterwards.  If the current
 * task was not using the FPU, TS must be cleared first with clts() so
 * fsave does not fault, and set again with stts() on the way out.  (In
 * this Xen-sparse tree, clts()/stts() are helpers from the asm-xen
 * headers rather than raw CR0 writes.)
 */
#define FPU_SAVE \
	do { \
		if (!(current->flags & PF_USEDFPU)) \
			clts(); \
		__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
	} while (0)

#define FPU_RESTORE \
	do { \
		__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
		if (!(current->flags & PF_USEDFPU)) \
			stts(); \
	} while (0)

#define LD(x,y)  " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y)  " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
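
/*
 * Example expansion (illustrative): LD(0,0) XO1(0,0) ST(0,0) emits
 *
 *      movq   0(%1), %mm0
 *      pxor   0(%2), %mm0
 *      movq   %mm0, 0(%1)
 *
 * i.e. one 8-byte quadword of p1 ^= p2 per LD/XO/ST triple; %1..%5 are
 * the p1..p5 operands of the asm statements below.
 */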

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	/* 128 bytes per iteration: 4 BLOCKs x 4 quadwords x 8 bytes. */
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	ST(i,0) \
	XO1(i+1,1) \
	ST(i+1,1) \
	XO1(i+2,2) \
	ST(i+2,2) \
	XO1(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	ST(i,0) \
	XO2(i+1,1) \
	ST(i+1,1) \
	XO2(i+2,2) \
	ST(i+2,2) \
	XO2(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	ST(i,0) \
	XO3(i+1,1) \
	ST(i+1,1) \
	XO3(i+2,2) \
	ST(i+2,2) \
	XO3(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	/* need to save/restore p4/p5 manually, otherwise gcc's 10-operand
	   limit would be exceeded (an operand marked '+' counts as two) */
	__asm__ __volatile__ (
	" pushl %4\n"
	" pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	XO4(i,0) \
	ST(i,0) \
	XO4(i+1,1) \
	ST(i+1,1) \
	XO4(i+2,2) \
	ST(i+2,2) \
	XO4(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	" popl %5\n"
	" popl %4\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	FPU_RESTORE;
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
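
/*
 * The xor_p5_mmx_* routines below compute the same results as the
 * xor_pII_mmx_* routines above, but over 64-byte lines with the loads,
 * pxors and stores interleaved by hand.  The scheduling appears tuned
 * for the original Pentium's in-order pairing pipelines (hence "p5"),
 * while the grouped form above suits out-of-order PPro/PII cores; the
 * boot-time benchmark at the end of this file picks whichever wins.
 */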

static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	/* need to save p4/p5 manually so as not to exceed gcc's
	   10-operand limit */
	__asm__ __volatile__ (
	" pushl %4\n"
	" pushl %5\n"
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	" popl %5\n"
	" popl %4\n"
	: "+g" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	FPU_RESTORE;
}

static struct xor_block_template xor_block_pII_mmx = {
	name: "pII_mmx",
	do_2: xor_pII_mmx_2,
	do_3: xor_pII_mmx_3,
	do_4: xor_pII_mmx_4,
	do_5: xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	name: "p5_mmx",
	do_2: xor_p5_mmx_2,
	do_3: xor_p5_mmx_3,
	do_4: xor_p5_mmx_4,
	do_5: xor_p5_mmx_5,
};
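
/*
 * Usage sketch (illustrative, not part of this file): a template's do_N
 * hook XORs N-1 source blocks into the first block.  For example, for a
 * 4 KiB stripe unit with two sources:
 *
 *      xor_block_p5_mmx.do_3(PAGE_SIZE, dst, src1, src2);
 *
 * after which dst[i] == old_dst[i] ^ src1[i] ^ src2[i] for every word.
 */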

#undef FPU_SAVE
#undef FPU_RESTORE

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * fsave/frstor do not cover SSE state, so the four %xmm registers used
 * below are saved to and restored from a 16-byte-aligned scratch area
 * by hand.  The 'cr0' dummy output looks like a leftover from the
 * native i386 version, which saved and restored %cr0 around the copy;
 * this port uses clts()/stts() instead.
 */
#define XMMS_SAVE \
	if (!(current->flags & PF_USEDFPU)) \
		clts(); \
	__asm__ __volatile__ ( \
		"movups %%xmm0,(%1) ;\n\t" \
		"movups %%xmm1,0x10(%1) ;\n\t" \
		"movups %%xmm2,0x20(%1) ;\n\t" \
		"movups %%xmm3,0x30(%1) ;\n\t" \
		: "=&r" (cr0) \
		: "r" (xmm_save) \
		: "memory")

#define XMMS_RESTORE \
	__asm__ __volatile__ ( \
		"sfence ;\n\t" \
		"movups (%1),%%xmm0 ;\n\t" \
		"movups 0x10(%1),%%xmm1 ;\n\t" \
		"movups 0x20(%1),%%xmm2 ;\n\t" \
		"movups 0x30(%1),%%xmm3 ;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory"); \
	if (!(current->flags & PF_USEDFPU)) \
		stts()

#define ALIGN16 __attribute__((aligned(16)))

#define OFFS(x)    "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x)   " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x,y)  " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y)  " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x)   " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x)   " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x)   " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x)   " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x)   " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
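
/*
 * Example expansion (illustrative): PF0(0) emits
 *
 *      prefetchnta 256(%1)
 *
 * i.e. each operand is prefetched one full 256-byte loop iteration
 * ahead with the non-temporal hint, which on the Pentium III pulls the
 * line towards L1 while bypassing L2; this is the "cache avoiding"
 * strategy the comment above refers to.
 */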

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	/* 256 bytes per iteration: 4 BLOCKs x 4 16-byte movaps; the
	   buffers must therefore be 16-byte aligned. */
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	PF1(i) \
	PF1(i+2) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF0(i+4) \
	PF0(i+6) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i+2) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF2(i+2) \
	PF0(i+4) \
	PF0(i+6) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i+2) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF2(i+2) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	PF3(i) \
	PF3(i+2) \
	PF0(i+4) \
	PF0(i+6) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" addl $256, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	/* need to save p4/p5 manually so as not to exceed gcc's
	   10-operand limit */
	__asm__ __volatile__ (
	" pushl %4\n"
	" pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i+2) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF2(i+2) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	PF3(i) \
	PF3(i+2) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	PF4(i) \
	PF4(i+2) \
	PF0(i+4) \
	PF0(i+6) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	XO4(i,0) \
	XO4(i+1,1) \
	XO4(i+2,2) \
	XO4(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" addl $256, %4 ;\n"
	" addl $256, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	" popl %5\n"
	" popl %4\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_pIII_sse = {
	name: "pIII_sse",
	do_2: xor_sse_2,
	do_3: xor_sse_3,
	do_4: xor_sse_4,
	do_5: xor_sse_5,
};

/* Also try the generic routines. */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
	do { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_32regs); \
		if (cpu_has_xmm) \
			xor_speed(&xor_block_pIII_sse); \
		if (md_cpu_has_mmx()) { \
			xor_speed(&xor_block_pII_mmx); \
			xor_speed(&xor_block_p5_mmx); \
		} \
	} while (0)
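
/*
 * Illustrative context (a sketch, not in this file): in 2.4 the md
 * driver (drivers/md/xor.c) expands XOR_TRY_TEMPLATES during boot-time
 * calibration.  Each xor_speed() call benchmarks one template over a
 * scratch buffer and remembers the fastest, which XOR_SELECT_TEMPLATE()
 * below may then override:
 *
 *	XOR_TRY_TEMPLATES;
 *	fastest = (best template measured by xor_speed);
 *	fastest = XOR_SELECT_TEMPLATE(fastest);
 */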

/* We force use of the SSE xor block because it can write around the L2
   cache.  We may also be able to keep loads in L1 only, depending on how
   the CPU handles a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)