xen/crypto/vmac.c @ 19568:4dd8ed253ee0
author Keir Fraser <keir.fraser@citrix.com>
date   Thu Apr 23 10:10:11 2009 +0100 (2009-04-23)
/* --------------------------------------------------------------------------
 * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
 * This implementation is hereby placed in the public domain.
 * The authors offer no warranty. Use at your own risk.
 * Please send bug reports to the authors.
 * Last modified: 17 APR 08, 1700 PDT
 * ----------------------------------------------------------------------- */

/* start for Xen */
#include <xen/config.h>
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <crypto/vmac.h>
#define UINT64_C(x) x##ULL
/* end for Xen */

/* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
#ifndef VMAC_ARCH_64
#define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
#endif

/* Enable code tuned for Intel SSE2 instruction set */
#if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
#define VMAC_USE_SSE2 1
#include <emmintrin.h>
#endif

/* Native word reads. Update (or define via compiler) if incorrect */
#ifndef VMAC_ARCH_BIG_ENDIAN /* Assume big-endian unless on the list */
#define VMAC_ARCH_BIG_ENDIAN \
    (!(__x86_64__ || __i386__ || _M_IX86 || \
       _M_X64 || __ARMEL__ || __MIPSEL__))
#endif

/* ----------------------------------------------------------------------- */
/* Constants and masks */

const uint64_t p64   = UINT64_C(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
const uint64_t m62   = UINT64_C(0x3fffffffffffffff);  /* 62-bit mask       */
const uint64_t m63   = UINT64_C(0x7fffffffffffffff);  /* 63-bit mask       */
const uint64_t m64   = UINT64_C(0xffffffffffffffff);  /* 64-bit mask       */
const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff);  /* Poly key mask     */

/* ----------------------------------------------------------------------- *
 * The following routines are used in this implementation. They are
 * written via macros to simulate zero-overhead call-by-reference.
 * All have default implementations for when they are not defined in an
 * architecture-specific manner.
 *
 * MUL64: 64x64->128-bit multiplication
 * PMUL64: assumes top bits cleared on inputs
 * ADD128: 128x128->128-bit addition
 * GET_REVERSED_64: load and byte-reverse 64-bit word
 * ----------------------------------------------------------------------- */

/* ----------------------------------------------------------------------- */
#if (__GNUC__ && (__x86_64__ || __amd64__))
/* ----------------------------------------------------------------------- */

#define ADD128(rh,rl,ih,il) \
    asm ("addq %3, %1 \n\t" \
         "adcq %2, %0" \
         : "+r"(rh),"+r"(rl) \
         : "r"(ih),"r"(il) : "cc");

#define MUL64(rh,rl,i1,i2) \
    asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")

#define PMUL64 MUL64

#define GET_REVERSED_64(p) \
    ({uint64_t x; \
      asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;})

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && __i386__)
/* ----------------------------------------------------------------------- */

#define GET_REVERSED_64(p) \
    ({ uint64_t x; \
       uint32_t *tp = (uint32_t *)(p); \
       asm ("bswap %%edx\n\t" \
            "bswap %%eax" \
            : "=A"(x) \
            : "a"(tp[1]), "d"(tp[0])); \
       x; })

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && __ppc64__)
/* ----------------------------------------------------------------------- */

#define ADD128(rh,rl,ih,il) \
    asm volatile ( "addc %1, %1, %3 \n\t" \
                   "adde %0, %0, %2" \
                   : "+r"(rh),"+r"(rl) \
                   : "r"(ih),"r"(il));

#define MUL64(rh,rl,i1,i2) \
    { uint64_t _i1 = (i1), _i2 = (i2); \
      rl = _i1 * _i2; \
      asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
    }

#define PMUL64 MUL64

#define GET_REVERSED_64(p) \
    ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \
       ((uint64_t)hi << 32) | (uint64_t)lo; } )

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && (__ppc__ || __PPC__))
/* ----------------------------------------------------------------------- */

#define GET_REVERSED_64(p) \
    ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \
       ((uint64_t)hi << 32) | (uint64_t)lo; } )

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && (__ARMEL__ || __ARM__))
/* ----------------------------------------------------------------------- */

#define bswap32(v) \
    ({ uint32_t tmp,out; \
       asm volatile( \
           "eor %1, %2, %2, ror #16\n" \
           "bic %1, %1, #0x00ff0000\n" \
           "mov %0, %2, ror #8\n" \
           "eor %0, %0, %1, lsr #8" \
           : "=r" (out), "=&r" (tmp) \
           : "r" (v)); \
       out;})

/* ----------------------------------------------------------------------- */
#elif _MSC_VER
/* ----------------------------------------------------------------------- */

#include <intrin.h>

#if (_M_IA64 || _M_X64) && \
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
#define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
#pragma intrinsic(_umul128)
#define PMUL64 MUL64
#endif

/* MSVC uses add, adc in this version */
#define ADD128(rh,rl,ih,il) \
    { uint64_t _il = (il); \
      (rl) += (_il); \
      (rh) += (ih) + ((rl) < (_il)); \
    }

#if _MSC_VER >= 1300
#define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p))
#pragma intrinsic(_byteswap_uint64)
#endif

#if _MSC_VER >= 1400 && \
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
#define MUL32(i1,i2) (__emulu((uint32_t)(i1),(uint32_t)(i2)))
#pragma intrinsic(__emulu)
#endif

/* ----------------------------------------------------------------------- */
#endif
/* ----------------------------------------------------------------------- */

#if __GNUC__
#define ALIGN(n) __attribute__ ((aligned(n)))
#define NOINLINE __attribute__ ((noinline))
#define FASTCALL
#elif _MSC_VER
#define ALIGN(n) __declspec(align(n))
#define NOINLINE __declspec(noinline)
#define FASTCALL __fastcall
#else
#define ALIGN(n)
#define NOINLINE
#define FASTCALL
#endif

/* ----------------------------------------------------------------------- */
/* Default implementations, if not defined above */
/* ----------------------------------------------------------------------- */

#ifndef ADD128
#define ADD128(rh,rl,ih,il) \
    { uint64_t _il = (il); \
      (rl) += (_il); \
      if ((rl) < (_il)) (rh)++; \
      (rh) += (ih); \
    }
#endif

#ifndef MUL32
#define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
#endif

#ifndef PMUL64 /* rh may not be same as i1 or i2 */
#define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow */ \
    { uint64_t _i1 = (i1), _i2 = (i2); \
      uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2); \
      rh = MUL32(_i1>>32,_i2>>32); \
      rl = MUL32(_i1,_i2); \
      ADD128(rh,rl,(m >> 32),(m << 32)); \
    }
#endif

#ifndef MUL64
#define MUL64(rh,rl,i1,i2) \
    { uint64_t _i1 = (i1), _i2 = (i2); \
      uint64_t m1= MUL32(_i1,_i2>>32); \
      uint64_t m2= MUL32(_i1>>32,_i2); \
      rh = MUL32(_i1>>32,_i2>>32); \
      rl = MUL32(_i1,_i2); \
      ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
      ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
    }
#endif
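
/* Illustrative sketch, kept out of the build with #if 0: how the macros
 * above compose.  It accumulates two 64x64->128-bit products into a 128-bit
 * (high,low) pair, which is exactly the pattern the NH hash below relies on.
 * The helper name is illustrative and not part of the original code. */
#if 0
static void example_mul_accumulate(uint64_t a, uint64_t b,
                                   uint64_t c, uint64_t d,
                                   uint64_t *hi, uint64_t *lo)
{
    uint64_t th, tl;
    MUL64(th, tl, a, b);        /* (th,tl)   = a*b                   */
    *hi = th;
    *lo = tl;
    MUL64(th, tl, c, d);        /* (th,tl)   = c*d                   */
    ADD128(*hi, *lo, th, tl);   /* (hi,lo)  += c*d, carry propagated */
}
#endif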

#ifndef GET_REVERSED_64
#ifndef bswap64
#ifndef bswap32
#define bswap32(x) \
    ({ uint32_t bsx = (x); \
       ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >> 8) | \
        (((bsx) & 0x0000ff00u) << 8) | (((bsx) & 0x000000ffu) << 24)); })
#endif
#define bswap64(x) \
    ({ union { uint64_t ll; uint32_t l[2]; } w, r; \
       w.ll = (x); \
       r.l[0] = bswap32 (w.l[1]); \
       r.l[1] = bswap32 (w.l[0]); \
       r.ll; })
#endif
#define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p))
#endif

/* ----------------------------------------------------------------------- */

#if (VMAC_PREFER_BIG_ENDIAN)
# define get64PE get64BE
#else
# define get64PE get64LE
#endif

#if (VMAC_ARCH_BIG_ENDIAN)
# define get64BE(ptr) (*(uint64_t *)(ptr))
# define get64LE(ptr) GET_REVERSED_64(ptr)
#else /* assume little-endian */
# define get64BE(ptr) GET_REVERSED_64(ptr)
# define get64LE(ptr) (*(uint64_t *)(ptr))
#endif
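
/* Illustrative sketch, kept out of the build with #if 0: get64PE() reads a
 * 64-bit word in whichever byte order the implementation prefers for NH
 * hashing, while get64BE() always yields the big-endian interpretation (used
 * for the AES-derived material).  On a little-endian build the former is a
 * plain load and the latter byte-reverses; on big-endian the roles swap.
 * The helper name is illustrative and not part of the original code. */
#if 0
static uint64_t example_read_word(unsigned char buf[8])
{
    uint64_t native = get64PE(buf);  /* order fed to the NH hash        */
    uint64_t bigend = get64BE(buf);  /* canonical big-endian value      */
    return native ^ bigend;          /* zero iff the two orders agree   */
}
#endif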

/* --------------------------------------------------------------------- *
 * For highest performance the L1 NH and L2 polynomial hashes should be
 * carefully implemented to take advantage of one's target architecture.
 * Here these two hash functions are defined multiple times; once for
 * 64-bit architectures, once for 32-bit SSE2 architectures, and once
 * for all other (32-bit) architectures.
 * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
 * Optionally, nh_vmac_nhbytes can be defined (for multiples of
 * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
 * NH computations at once).
 * --------------------------------------------------------------------- */
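
/* Illustrative plain-C reference, kept out of the build with #if 0: what
 * every nh_16() variant below computes.  NH sums, over consecutive word
 * pairs, the 128-bit products (m[i]+k[i]) * (m[i+1]+k[i+1]), where the
 * additions wrap modulo 2^64 and the products accumulate into a 128-bit
 * (rh,rl) pair.  The function name is illustrative and not part of the
 * original code. */
#if 0
static void nh_16_reference(const uint64_t *mp, const uint64_t *kp, int nw,
                            uint64_t *rh, uint64_t *rl)
{
    int i;
    uint64_t th, tl;
    *rh = *rl = 0;
    for (i = 0; i < nw; i += 2) {
        MUL64(th, tl, get64PE(mp+i) + kp[i], get64PE(mp+i+1) + kp[i+1]);
        ADD128(*rh, *rl, th, tl);
    }
}
#endif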

/* ----------------------------------------------------------------------- */
#if VMAC_ARCH_64
/* ----------------------------------------------------------------------- */

#define nh_16(mp, kp, nw, rh, rl) \
    { int i; uint64_t th, tl; \
      rh = rl = 0; \
      for (i = 0; i < nw; i+= 2) { \
        MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl); \
      } \
    }
#define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1) \
    { int i; uint64_t th, tl; \
      rh1 = rl1 = rh = rl = 0; \
      for (i = 0; i < nw; i+= 2) { \
        MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
        ADD128(rh1,rl1,th,tl); \
      } \
    }

#if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
    { int i; uint64_t th, tl; \
      rh = rl = 0; \
      for (i = 0; i < nw; i+= 8) { \
        MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
        ADD128(rh,rl,th,tl); \
      } \
    }
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1) \
    { int i; uint64_t th, tl; \
      rh1 = rl1 = rh = rl = 0; \
      for (i = 0; i < nw; i+= 8) { \
        MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
        ADD128(rh1,rl1,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
        ADD128(rh1,rl1,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
        ADD128(rh1,rl1,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
        ADD128(rh,rl,th,tl); \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
        ADD128(rh1,rl1,th,tl); \
      } \
    }
#endif

#define poly_step(ah, al, kh, kl, mh, ml) \
    { uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
      /* compute ab*cd, put bd into result registers */ \
      PMUL64(t3h,t3l,al,kh); \
      PMUL64(t2h,t2l,ah,kl); \
      PMUL64(t1h,t1l,ah,2*kh); \
      PMUL64(ah,al,al,kl); \
      /* add 2 * ac to result */ \
      ADD128(ah,al,t1h,t1l); \
      /* add together ad + bc */ \
      ADD128(t2h,t2l,t3h,t3l); \
      /* now (ah,al), (t2l,2*t2h) need summing */ \
      /* first add the high registers, carrying into t2h */ \
      ADD128(t2h,ah,z,t2l); \
      /* double t2h and add top bit of ah */ \
      t2h = 2 * t2h + (ah >> 63); \
      ah &= m63; \
      /* now add the low registers */ \
      ADD128(ah,al,mh,ml); \
      ADD128(ah,al,z,t2h); \
    }
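
/* Illustrative sketch, kept out of the build with #if 0: poly_step() above
 * is one Horner step of the L2 polynomial hash, computing roughly
 *     (ah,al) = ((ah,al) * (kh,kl) mod (2^127 - 1)) + (mh,ml),
 * with the accumulator left only partially reduced.  The sketch below
 * isolates the reduction trick it uses: since 2^127 == 1 (mod 2^127 - 1),
 * bits of weight 2^127 and above can simply be folded back into the low end
 * (a carry of weight 2^128 counts double).  The helper name is illustrative
 * and not part of the original code. */
#if 0
static void fold_mod_p127(uint64_t *hi, uint64_t *lo, uint64_t carry128)
{
    uint64_t z = 0;
    /* bit 63 of *hi is bit 127 of the value: weight 1 after folding;
     * carry128 holds bits of weight 2^128: weight 2 after folding.   */
    uint64_t fold = 2 * carry128 + (*hi >> 63);
    *hi &= m63;
    ADD128(*hi, *lo, z, fold);
}
#endif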

/* ----------------------------------------------------------------------- */
#elif VMAC_USE_SSE2
/* ----------------------------------------------------------------------- */

// macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
#if defined(__GNUC__)
// define these in two steps to allow arguments to be expanded
#define GNU_AS2(x, y) #x ", " #y ";"
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
#define GNU_ASL(x) "\n" #x ":"
#define GNU_ASJ(x, y, z) #x " " #y #z ";"
#define AS2(x, y) GNU_AS2(x, y)
#define AS3(x, y, z) GNU_AS3(x, y, z)
#define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
#define ASL(x) GNU_ASL(x)
#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#else
#define AS2(x, y) __asm {x, y}
#define AS3(x, y, z) __asm {x, y, z}
#define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
#define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y}
#endif

static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
{
    // This assembly version, using MMX registers, is just as fast as the
    // intrinsics version (which uses XMM registers) on the Intel Core 2,
    // but is much faster on the Pentium 4. In order to schedule multiplies
    // as early as possible, the loop interleaves operations for the current
    // block and the next block. To mask out high 32-bits, we use "movd"
    // to move the lower 32-bits to the stack and then back. Surprisingly,
    // this is faster than any other method.
#ifdef __GNUC__
    __asm__ __volatile__
    (
    ".intel_syntax noprefix;"
#else
    AS2( mov esi, mp)
    AS2( mov edi, kp)
    AS2( mov ecx, nw)
    AS2( mov eax, rl)
    AS2( mov edx, rh)
#endif
    AS2( sub esp, 12)
    AS2( movq mm6, [esi])
    AS2( paddq mm6, [edi])
    AS2( movq mm5, [esi+8])
    AS2( paddq mm5, [edi+8])
    AS2( add esi, 16)
    AS2( add edi, 16)
    AS2( movq mm4, mm6)
    ASS( pshufw mm2, mm6, 1, 0, 3, 2)
    AS2( pmuludq mm6, mm5)
    ASS( pshufw mm3, mm5, 1, 0, 3, 2)
    AS2( pmuludq mm5, mm2)
    AS2( pmuludq mm2, mm3)
    AS2( pmuludq mm3, mm4)
    AS2( pxor mm7, mm7)
    AS2( movd [esp], mm6)
    AS2( psrlq mm6, 32)
    AS2( movd [esp+4], mm5)
    AS2( psrlq mm5, 32)
    AS2( sub ecx, 2)
    ASJ( jz, 1, f)
    ASL(0)
    AS2( movq mm0, [esi])
    AS2( paddq mm0, [edi])
    AS2( movq mm1, [esi+8])
    AS2( paddq mm1, [edi+8])
    AS2( add esi, 16)
    AS2( add edi, 16)
    AS2( movq mm4, mm0)
    AS2( paddq mm5, mm2)
    ASS( pshufw mm2, mm0, 1, 0, 3, 2)
    AS2( pmuludq mm0, mm1)
    AS2( movd [esp+8], mm3)
    AS2( psrlq mm3, 32)
    AS2( paddq mm5, mm3)
    ASS( pshufw mm3, mm1, 1, 0, 3, 2)
    AS2( pmuludq mm1, mm2)
    AS2( pmuludq mm2, mm3)
    AS2( pmuludq mm3, mm4)
    AS2( movd mm4, [esp])
    AS2( paddq mm7, mm4)
    AS2( movd mm4, [esp+4])
    AS2( paddq mm6, mm4)
    AS2( movd mm4, [esp+8])
    AS2( paddq mm6, mm4)
    AS2( movd [esp], mm0)
    AS2( psrlq mm0, 32)
    AS2( paddq mm6, mm0)
    AS2( movd [esp+4], mm1)
    AS2( psrlq mm1, 32)
    AS2( paddq mm5, mm1)
    AS2( sub ecx, 2)
    ASJ( jnz, 0, b)
    ASL(1)
    AS2( paddq mm5, mm2)
    AS2( movd [esp+8], mm3)
    AS2( psrlq mm3, 32)
    AS2( paddq mm5, mm3)
    AS2( movd mm4, [esp])
    AS2( paddq mm7, mm4)
    AS2( movd mm4, [esp+4])
    AS2( paddq mm6, mm4)
    AS2( movd mm4, [esp+8])
    AS2( paddq mm6, mm4)

    ASS( pshufw mm0, mm7, 3, 2, 1, 0)
    AS2( psrlq mm7, 32)
    AS2( paddq mm6, mm7)
    AS2( punpckldq mm0, mm6)
    AS2( psrlq mm6, 32)
    AS2( paddq mm5, mm6)
    AS2( movq [eax], mm0)
    AS2( movq [edx], mm5)
    AS2( add esp, 12)
#ifdef __GNUC__
    ".att_syntax prefix;"
    :
    : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
    : "memory", "cc"
    );
#endif
}
#define nh_16(mp, kp, nw, rh, rl) nh_16_func(mp, kp, nw, &(rh), &(rl));

static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
                           const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
{
    // This code tries to schedule the multiplies as early as possible to overcome
    // the long latencies on the Pentium 4. It also minimizes "movq" instructions
    // which are very expensive on the P4.

#define a0 [eax+0]
#define a1 [eax+4]
#define a2 [ebx+0]
#define a3 [ebx+4]
#define k0 [ecx+0]
#define k1 [ecx+4]
#define k2 [edx+0]
#define k3 [edx+4]

#ifdef __GNUC__
    uint32_t temp;
    __asm__ __volatile__
    (
    "mov %%ebx, %0;"
    "mov %1, %%ebx;"
    ".intel_syntax noprefix;"
#else
    AS2( mov ebx, ahi)
    AS2( mov edx, kh)
    AS2( mov eax, alo)
    AS2( mov ecx, kl)
    AS2( mov esi, mh)
    AS2( mov edi, ml)
#endif

    AS2( movd mm0, a3)
    AS2( movq mm4, mm0)
    AS2( pmuludq mm0, k3) // a3*k3
    AS2( movd mm1, a0)
    AS2( pmuludq mm1, k2) // a0*k2
    AS2( movd mm2, a1)
    AS2( movd mm6, k1)
    AS2( pmuludq mm2, mm6) // a1*k1
    AS2( movd mm3, a2)
    AS2( movq mm5, mm3)
    AS2( movd mm7, k0)
    AS2( pmuludq mm3, mm7) // a2*k0
    AS2( pmuludq mm4, mm7) // a3*k0
    AS2( pmuludq mm5, mm6) // a2*k1
    AS2( psllq mm0, 1)
    AS2( paddq mm0, [esi])
    AS2( paddq mm0, mm1)
    AS2( movd mm1, a1)
    AS2( paddq mm4, mm5)
    AS2( movq mm5, mm1)
    AS2( pmuludq mm1, k2) // a1*k2
    AS2( paddq mm0, mm2)
    AS2( movd mm2, a0)
    AS2( paddq mm0, mm3)
    AS2( movq mm3, mm2)
    AS2( pmuludq mm2, k3) // a0*k3
    AS2( pmuludq mm3, mm7) // a0*k0
    AS2( movd esi, mm0)
    AS2( psrlq mm0, 32)
    AS2( pmuludq mm7, mm5) // a1*k0
    AS2( pmuludq mm5, k3) // a1*k3
    AS2( paddq mm0, mm1)
    AS2( movd mm1, a2)
    AS2( pmuludq mm1, k2) // a2*k2
    AS2( paddq mm0, mm2)
    AS2( paddq mm0, mm4)
    AS2( movq mm4, mm0)
    AS2( movd mm2, a3)
    AS2( pmuludq mm2, mm6) // a3*k1
    AS2( pmuludq mm6, a0) // a0*k1
    AS2( psrlq mm0, 31)
    AS2( paddq mm0, mm3)
    AS2( movd mm3, [edi])
    AS2( paddq mm0, mm3)
    AS2( movd mm3, a2)
    AS2( pmuludq mm3, k3) // a2*k3
    AS2( paddq mm5, mm1)
    AS2( movd mm1, a3)
    AS2( pmuludq mm1, k2) // a3*k2
    AS2( paddq mm5, mm2)
    AS2( movd mm2, [edi+4])
    AS2( psllq mm5, 1)
    AS2( paddq mm0, mm5)
    AS2( movq mm5, mm0)
    AS2( psllq mm4, 33)
    AS2( psrlq mm0, 32)
    AS2( paddq mm6, mm7)
    AS2( movd mm7, esi)
    AS2( paddq mm0, mm6)
    AS2( paddq mm0, mm2)
    AS2( paddq mm3, mm1)
    AS2( psllq mm3, 1)
    AS2( paddq mm0, mm3)
    AS2( psrlq mm4, 1)
    AS2( punpckldq mm5, mm0)
    AS2( psrlq mm0, 32)
    AS2( por mm4, mm7)
    AS2( paddq mm0, mm4)
    AS2( movq a0, mm5)
    AS2( movq a2, mm0)
#ifdef __GNUC__
    ".att_syntax prefix;"
    "mov %0, %%ebx;"
    : "=m" (temp)
    : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
    : "memory", "cc"
    );
#endif

#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
}

#define poly_step(ah, al, kh, kl, mh, ml) \
    poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))

/* ----------------------------------------------------------------------- */
#else /* not VMAC_ARCH_64 and not SSE2 */
/* ----------------------------------------------------------------------- */

#ifndef nh_16
#define nh_16(mp, kp, nw, rh, rl) \
    { uint64_t t1,t2,m1,m2,t; \
      int i; \
      rh = rl = t = 0; \
      for (i = 0; i < nw; i+=2) { \
        t1 = get64PE(mp+i) + kp[i]; \
        t2 = get64PE(mp+i+1) + kp[i+1]; \
        m2 = MUL32(t1 >> 32, t2); \
        m1 = MUL32(t1, t2 >> 32); \
        ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2)); \
        rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32); \
        t += (uint64_t)(uint32_t)m1 + (uint32_t)m2; \
      } \
      ADD128(rh,rl,(t >> 32),(t << 32)); \
    }
#endif

static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
                           const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
{

#if VMAC_ARCH_BIG_ENDIAN
#define INDEX_HIGH 0
#define INDEX_LOW 1
#else
#define INDEX_HIGH 1
#define INDEX_LOW 0
#endif

#define a0 *(((uint32_t*)alo)+INDEX_LOW)
#define a1 *(((uint32_t*)alo)+INDEX_HIGH)
#define a2 *(((uint32_t*)ahi)+INDEX_LOW)
#define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
#define k0 *(((uint32_t*)kl)+INDEX_LOW)
#define k1 *(((uint32_t*)kl)+INDEX_HIGH)
#define k2 *(((uint32_t*)kh)+INDEX_LOW)
#define k3 *(((uint32_t*)kh)+INDEX_HIGH)

    uint64_t p, q, t;
    uint32_t t2;

    p = MUL32(a3, k3);
    p += p;
    p += *(uint64_t *)mh;
    p += MUL32(a0, k2);
    p += MUL32(a1, k1);
    p += MUL32(a2, k0);
    t = (uint32_t)(p);
    p >>= 32;
    p += MUL32(a0, k3);
    p += MUL32(a1, k2);
    p += MUL32(a2, k1);
    p += MUL32(a3, k0);
    t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
    p >>= 31;
    p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
    p += MUL32(a0, k0);
    q = MUL32(a1, k3);
    q += MUL32(a2, k2);
    q += MUL32(a3, k1);
    q += q;
    p += q;
    t2 = (uint32_t)(p);
    p >>= 32;
    p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
    p += MUL32(a0, k1);
    p += MUL32(a1, k0);
    q = MUL32(a2, k3);
    q += MUL32(a3, k2);
    q += q;
    p += q;
    *(uint64_t *)(alo) = (p << 32) | t2;
    p >>= 32;
    *(uint64_t *)(ahi) = p + t;

#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
}

#define poly_step(ah, al, kh, kl, mh, ml) \
    poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))

/* ----------------------------------------------------------------------- */
#endif /* end of specialized NH and poly definitions */
/* ----------------------------------------------------------------------- */

/* At least nh_16 is defined. Define others as needed here */
#ifndef nh_16_2
#define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2) \
    nh_16(mp, kp, nw, rh, rl); \
    nh_16(mp, ((kp)+2), nw, rh2, rl2);
#endif
#ifndef nh_vmac_nhbytes
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
    nh_16(mp, kp, nw, rh, rl)
#endif
#ifndef nh_vmac_nhbytes_2
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2) \
    nh_vmac_nhbytes(mp, kp, nw, rh, rl); \
    nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
#endif

/* ----------------------------------------------------------------------- */

void vhash_abort(vmac_ctx_t *ctx)
{
    ctx->polytmp[0] = ctx->polykey[0] ;
    ctx->polytmp[1] = ctx->polykey[1] ;
#if (VMAC_TAG_LEN == 128)
    ctx->polytmp[2] = ctx->polykey[2] ;
    ctx->polytmp[3] = ctx->polykey[3] ;
#endif
    ctx->first_block_processed = 0;
}

/* ----------------------------------------------------------------------- */
static uint64_t l3hash(uint64_t p1, uint64_t p2,
                       uint64_t k1, uint64_t k2, uint64_t len)
{
    uint64_t rh, rl, t, z=0;

    /* fully reduce (p1,p2)+(len,0) mod p127 */
    t = p1 >> 63;
    p1 &= m63;
    ADD128(p1, p2, len, t);
    /* At this point, (p1,p2) is at most 2^127+(len<<64) */
    t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
    ADD128(p1, p2, z, t);
    p1 &= m63;

    /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
    t = p1 + (p2 >> 32);
    t += (t >> 32);
    t += (uint32_t)t > 0xfffffffeu;
    p1 += (t >> 32);
    p2 += (p1 << 32);

    /* compute (p1+k1)%p64 and (p2+k2)%p64 */
    p1 += k1;
    p1 += (0 - (p1 < k1)) & 257;
    p2 += k2;
    p2 += (0 - (p2 < k2)) & 257;

    /* compute (p1+k1)*(p2+k2)%p64 */
    MUL64(rh, rl, p1, p2);
    t = rh >> 56;
    ADD128(t, rl, z, rh);
    rh <<= 8;
    ADD128(t, rl, z, rh);
    t += t << 8;
    rl += t;
    rl += (0 - (rl < t)) & 257;
    rl += (0 - (rl > p64-1)) & 257;
    return rl;
}
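
/* Illustrative sketch, kept out of the build with #if 0: l3hash() compresses
 * the 128-bit L2 accumulator (plus the message bit length) into one 64-bit
 * word; after the preliminary reductions it amounts to
 * ((p1 + k1) * (p2 + k2)) mod p64 with p64 = 2^64 - 257.  Assuming a compiler
 * that provides the __uint128_t extension (which the original code does not
 * rely on), the final multiply-and-reduce could be written as below; the
 * MUL64/ADD128 sequence above computes essentially the same value without a
 * 128-bit type.  The helper name is illustrative and not part of the
 * original code. */
#if 0
static uint64_t l3_final_mul_reference(uint64_t p1, uint64_t p2)
{
    __uint128_t prod = (__uint128_t)p1 * p2;   /* full 128-bit product   */
    return (uint64_t)(prod % p64);             /* reduce mod 2^64 - 257  */
}
#endif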

/* ----------------------------------------------------------------------- */

void vhash_update(unsigned char *m,
                  unsigned int mbytes, /* Positive multiple of VMAC_NHBYTES */
                  vmac_ctx_t *ctx)
{
    uint64_t rh, rl, *mptr;
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i;
    uint64_t ch, cl;
    uint64_t pkh = ctx->polykey[0];
    uint64_t pkl = ctx->polykey[1];
#if (VMAC_TAG_LEN == 128)
    uint64_t ch2, cl2, rh2, rl2;
    uint64_t pkh2 = ctx->polykey[2];
    uint64_t pkl2 = ctx->polykey[3];
#endif

    mptr = (uint64_t *)m;
    i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */

    ch = ctx->polytmp[0];
    cl = ctx->polytmp[1];
#if (VMAC_TAG_LEN == 128)
    ch2 = ctx->polytmp[2];
    cl2 = ctx->polytmp[3];
#endif

    if ( ! ctx->first_block_processed) {
        ctx->first_block_processed = 1;
#if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
#else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        ADD128(ch2,cl2,rh2,rl2);
#endif
        rh &= m62;
        ADD128(ch,cl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        i--;
    }

    while (i--) {
#if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
#else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
#endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
    }

    ctx->polytmp[0] = ch;
    ctx->polytmp[1] = cl;
#if (VMAC_TAG_LEN == 128)
    ctx->polytmp[2] = ch2;
    ctx->polytmp[3] = cl2;
#endif
#if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
#endif
}
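
/* Illustrative sketch, kept out of the build with #if 0: a minimal use of
 * the incremental interface, assuming a caller that already holds a
 * key-initialised vmac_ctx_t.  vhash_update() only accepts whole multiples
 * of VMAC_NHBYTES; the remaining tail is handed to vhash() (defined below),
 * which also finalises the hash.  The names here are illustrative and not
 * part of the original code. */
#if 0
static uint64_t example_incremental_hash(unsigned char *msg,
                                         unsigned int mbytes,
                                         vmac_ctx_t *ctx)
{
    unsigned int whole = (mbytes / VMAC_NHBYTES) * VMAC_NHBYTES;
    uint64_t tagl;                    /* written only for 128-bit tags */
    if (whole)
        vhash_update(msg, whole, ctx);                  /* full blocks */
    return vhash(msg + whole, mbytes - whole, &tagl, ctx);  /* the rest */
}
#endif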

/* ----------------------------------------------------------------------- */

uint64_t xvhash(unsigned char m[],
                unsigned int mbytes,
                uint64_t *tagl,
                vmac_ctx_t *ctx)
{
    uint64_t ch, cl, rh, rl, *mptr;
#if (VMAC_TAG_LEN == 128)
    uint64_t ch2, cl2, rh2, rl2;
#endif
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i, remaining;

    remaining = mbytes % VMAC_NHBYTES;
    i = mbytes-remaining;
    mptr = (uint64_t *)(m+i);
    if (i) vhash_update(m,i,ctx);

    ch = ctx->polytmp[0];
    cl = ctx->polytmp[1];
#if (VMAC_TAG_LEN == 128)
    ch2 = ctx->polytmp[2];
    cl2 = ctx->polytmp[3];
#endif

    if (remaining) {
#if (VMAC_TAG_LEN == 128)
        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
        rh2 &= m62;
#else
        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
#endif
        rh &= m62;
        if (i) {
            poly_step(ch,cl,ctx->polykey[0],ctx->polykey[1],rh,rl);
#if (VMAC_TAG_LEN == 128)
            poly_step(ch2,cl2,ctx->polykey[2],ctx->polykey[3],rh2,rl2);
#endif
        } else {
            ADD128(ch,cl,rh,rl);
#if (VMAC_TAG_LEN == 128)
            ADD128(ch2,cl2,rh2,rl2);
#endif
        }
    }

#if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
#endif
    vhash_abort(ctx);
    remaining *= 8;
#if (VMAC_TAG_LEN == 128)
    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
#endif
    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
}

uint64_t vhash(unsigned char m[],
               unsigned int mbytes,
               uint64_t *tagl,
               vmac_ctx_t *ctx)
{
    uint64_t rh, rl, *mptr;
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i, remaining;
    uint64_t ch, cl;
    uint64_t pkh = ctx->polykey[0];
    uint64_t pkl = ctx->polykey[1];
#if (VMAC_TAG_LEN == 128)
    uint64_t ch2, cl2, rh2, rl2;
    uint64_t pkh2 = ctx->polykey[2];
    uint64_t pkl2 = ctx->polykey[3];
#endif

    mptr = (uint64_t *)m;
    i = mbytes / VMAC_NHBYTES;
    remaining = mbytes % VMAC_NHBYTES;

    if (ctx->first_block_processed)
    {
        ch = ctx->polytmp[0];
        cl = ctx->polytmp[1];
#if (VMAC_TAG_LEN == 128)
        ch2 = ctx->polytmp[2];
        cl2 = ctx->polytmp[3];
#endif
    }
    else if (i)
    {
#if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
#else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
        ch2 &= m62;
        ADD128(ch2,cl2,pkh2,pkl2);
#endif
        ch &= m62;
        ADD128(ch,cl,pkh,pkl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        i--;
    }
    else if (remaining)
    {
#if (VMAC_TAG_LEN == 64)
        nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
#else
        nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
        ch2 &= m62;
        ADD128(ch2,cl2,pkh2,pkl2);
#endif
        ch &= m62;
        ADD128(ch,cl,pkh,pkl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        goto do_l3;
    }
    else /* Empty String */
    {
        ch = pkh; cl = pkl;
#if (VMAC_TAG_LEN == 128)
        ch2 = pkh2; cl2 = pkl2;
#endif
        goto do_l3;
    }

    while (i--) {
#if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
#else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
#endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
    }
    if (remaining) {
#if (VMAC_TAG_LEN == 64)
        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
#else
        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
#endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
    }

do_l3:
#if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
#endif
    vhash_abort(ctx);
    remaining *= 8;
#if (VMAC_TAG_LEN == 128)
    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
#endif
    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
}

/* ----------------------------------------------------------------------- */

uint64_t vmac(unsigned char m[],
              unsigned int mbytes,
              unsigned char n[16],
              uint64_t *tagl,
              vmac_ctx_t *ctx)
{
#if (VMAC_TAG_LEN == 64)
    uint64_t *in_n, *out_p;
    uint64_t p, h;
    int i;

#if VMAC_CACHE_NONCES
    in_n = ctx->cached_nonce;
    out_p = ctx->cached_aes;
#else
    uint64_t tmp[2];
    in_n = out_p = tmp;
#endif

    i = n[15] & 1;
#if VMAC_CACHE_NONCES
    if ((*(uint64_t *)(n+8) != in_n[1]) ||
        (*(uint64_t *)(n ) != in_n[0])) {
#endif

        in_n[0] = *(uint64_t *)(n );
        in_n[1] = *(uint64_t *)(n+8);
        ((unsigned char *)in_n)[15] &= 0xFE;
        aes_encryption(in_n, out_p, &ctx->cipher_key);

#if VMAC_CACHE_NONCES
        ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
    }
#endif
    p = get64BE(out_p + i);
    h = vhash(m, mbytes, (uint64_t *)0, ctx);
    return p + h;
#else
    uint64_t tmp[2];
    uint64_t th,tl;
    aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
    th = vhash(m, mbytes, &tl, ctx);
    th += get64BE(tmp);
    *tagl = tl + get64BE(tmp+1);
    return th;
#endif
}
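
/* Illustrative sketch, kept out of the build with #if 0: a minimal
 * end-to-end use of the all-in-one interface for a 64-bit tag build,
 * assuming the caller provides the 16-byte AES key and a nonce that is
 * unique per message under that key.  The names here are illustrative and
 * not part of the original code. */
#if 0
static uint64_t example_vmac_tag(unsigned char key[16],
                                 unsigned char nonce[16],
                                 unsigned char *msg, unsigned int mbytes)
{
    static vmac_ctx_t ctx;            /* re-keyed on every call in this toy */
    uint64_t tagl;                    /* written only for 128-bit tags      */
    vmac_set_key(key, &ctx);          /* see definition below               */
    return vmac(msg, mbytes, nonce, &tagl, &ctx);
}
#endif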

/* ----------------------------------------------------------------------- */

void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
{
    uint64_t in[2] = {0}, out[2];
    unsigned i;
    aes_key_setup(user_key, &ctx->cipher_key);

    /* Fill nh key */
    ((unsigned char *)in)[0] = 0x80;
    for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
        aes_encryption((unsigned char *)in, (unsigned char *)out,
                       &ctx->cipher_key);
        ctx->nhkey[i ] = get64BE(out);
        ctx->nhkey[i+1] = get64BE(out+1);
        ((unsigned char *)in)[15] += 1;
    }

    /* Fill poly key */
    ((unsigned char *)in)[0] = 0xC0;
    in[1] = 0;
    for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
        aes_encryption((unsigned char *)in, (unsigned char *)out,
                       &ctx->cipher_key);
        ctx->polytmp[i ] = ctx->polykey[i ] = get64BE(out) & mpoly;
        ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
        ((unsigned char *)in)[15] += 1;
    }

    /* Fill ip key */
    ((unsigned char *)in)[0] = 0xE0;
    in[1] = 0;
    for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
        do {
            aes_encryption((unsigned char *)in, (unsigned char *)out,
                           &ctx->cipher_key);
            ctx->l3key[i ] = get64BE(out);
            ctx->l3key[i+1] = get64BE(out+1);
            ((unsigned char *)in)[15] += 1;
        } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
    }

    /* Invalidate nonce/aes cache and reset other elements */
#if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
    ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
    ctx->cached_nonce[1] = (uint64_t)0;  /* Ensure illegal nonce */
#endif
    ctx->first_block_processed = 0;
}

/* ----------------------------------------------------------------------- */


#if VMAC_RUN_TESTS

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

unsigned prime(void)  /* Wake variable speed cpu, get rough speed estimate */
{
    volatile uint64_t i;
    volatile uint64_t j=1;
    unsigned cnt=0;
    volatile clock_t ticks = clock();
    do {
        for (i = 0; i < 500000; i++) {
            uint64_t x = get64PE(&j);
            j = x * x + (uint64_t)ticks;
        }
        cnt++;
    } while (clock() - ticks < (CLOCKS_PER_SEC/2));
    return cnt;  /* cnt is millions of iterations per second */
}

int main(void)
{
    ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
    uint64_t res, tagl;
    void *p;
    unsigned char *m;
    ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
    ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
    unsigned int vector_lengths[] = {0,3,48,300,3000000};
#if (VMAC_TAG_LEN == 64)
    ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
                                  "E8421F61D573D298","4492DF6C5CAC1BBE",
                                  "09BA597DD7601113"};
#else
    ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
                                  "4EE815A06A1D71EDD36FC75D51188A42",
                                  "09F2C80C8E1007A0C12FAE19FE4504AE",
                                  "66438817154850C61D8A412164803BCB",
                                  "2B6B02288FFC461B75485DE893C629DC"};
#endif
    unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
    unsigned i, j, *speed_iters;
    clock_t ticks;
    double cpb;
    const unsigned int buf_len = 3 * (1 << 20);

    j = prime();
    i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
    speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
    speed_iters[i-1] = j * (1 << 12);
    while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);

    /* Initialize context and message buffer, all 16-byte aligned */
    p = malloc(buf_len + 32);
    m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
    memset(m, 0, buf_len + 16);
    vmac_set_key(key, &ctx);

    /* Test incremental and all-in-one interfaces for correctness */
    vmac_set_key(key, &ctx_aio);
    vmac_set_key(key, &ctx_inc1);
    vmac_set_key(key, &ctx_inc2);


    /*
    for (i = 0; i <= 512; i++) {
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
        tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
                    nonce, &tagl, &ctx);
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
        for (j = 0; j < vector_lengths[i]; j++)
            m[j] = (unsigned char)('a'+j%3);
    }
    */

    /* Generate vectors */
    for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
        for (j = 0; j < vector_lengths[i]; j++)
            m[j] = (unsigned char)('a'+j%3);
        res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
#if (VMAC_TAG_LEN == 64)
        printf("\'abc\' * %7u: %016llX Should be: %s\n",
               vector_lengths[i]/3,res,should_be[i]);
#else
        printf("\'abc\' * %7u: %016llX%016llX\nShould be : %s\n",
               vector_lengths[i]/3,res,tagl,should_be[i]);
#endif
    }

    /* Speed test */
    for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
        ticks = clock();
        for (j = 0; j < speed_iters[i]; j++) {
#if HASH_ONLY
            res = vhash(m, speed_lengths[i], &tagl, &ctx);
#else
            res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
            nonce[7]++;
#endif
        }
        ticks = clock() - ticks;
        cpb = ((ticks*VMAC_HZ)/
               ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
        printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
    }
    return 1;
}

#endif