ia64/xen-unstable

changeset 18791:8de4b4e9a435

x86: add SSE-based copy_page()

On top of the highmem assistance hypercalls added earlier, this provides
a performance improvement of another 12% (measured on Xeon E5345) for
the page copying case.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Nov 12 12:04:15 2008 +0000 (2008-11-12)
parents 8e18dd41c6c7
children 9f68b6ae6243
files xen/arch/x86/Makefile xen/arch/x86/copy_page.S xen/arch/x86/domain.c xen/arch/x86/domain_build.c xen/include/asm-x86/page.h
line diff
     1.1 --- a/xen/arch/x86/Makefile	Wed Nov 12 12:01:35 2008 +0000
     1.2 +++ b/xen/arch/x86/Makefile	Wed Nov 12 12:04:15 2008 +0000
     1.3 @@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
     1.4  obj-y += apic.o
     1.5  obj-y += bitops.o
     1.6  obj-y += clear_page.o
     1.7 +obj-y += copy_page.o
     1.8  obj-y += compat.o
     1.9  obj-y += delay.o
    1.10  obj-y += dmi_scan.o
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/copy_page.S	Wed Nov 12 12:04:15 2008 +0000
     2.3 @@ -0,0 +1,66 @@
     2.4 +#include <xen/config.h>
     2.5 +#include <asm/page.h>
     2.6 +
     2.7 +#ifdef __i386__
     2.8 +#define src_reg %esi
     2.9 +#define dst_reg %edi
    2.10 +#define WORD_SIZE 4
    2.11 +#define tmp1_reg %eax
    2.12 +#define tmp2_reg %edx
    2.13 +#define tmp3_reg %ebx
    2.14 +#define tmp4_reg %ebp
    2.15 +#else
    2.16 +#define src_reg %rsi
    2.17 +#define dst_reg %rdi
    2.18 +#define WORD_SIZE 8
    2.19 +#define tmp1_reg %r8
    2.20 +#define tmp2_reg %r9
    2.21 +#define tmp3_reg %r10
    2.22 +#define tmp4_reg %r11
    2.23 +#endif
    2.24 +
    2.25 +ENTRY(copy_page_sse2) /* Copy PAGE_SIZE bytes from src_reg to dst_reg in chunks of 4 machine words, using non-temporal (movnti) stores. */
    2.26 +#ifdef __i386__
    2.27 +        push    %ebx /* save the registers used below as tmp3/tmp4/src/dst */
    2.28 +        push    %ebp
    2.29 +        push    %esi
    2.30 +        push    %edi
    2.31 +        mov     6*4(%esp), src_reg /* 2nd stack argument: 4 saved registers + return address sit below the arguments */
    2.32 +        mov     5*4(%esp), dst_reg /* 1st stack argument */
    2.33 +#endif /* 64-bit: no argument setup here; presumably dst/src already arrive in %rdi/%rsi -- confirm against the calling convention */
    2.34 +        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx /* loop counter: number of 4-word chunks in a page, minus 3 (see the jg/jpe pair below) */
    2.35 +
    2.36 +        prefetchnta 2*4*WORD_SIZE(src_reg) /* prefetch the chunk two ahead of the first one */
    2.37 +        mov     (src_reg), tmp1_reg /* preload chunk 0 into tmp1..tmp4 */
    2.38 +        mov     WORD_SIZE(src_reg), tmp2_reg
    2.39 +        mov     2*WORD_SIZE(src_reg), tmp3_reg
    2.40 +        mov     3*WORD_SIZE(src_reg), tmp4_reg
    2.41 +
    2.42 +0:      prefetchnta 3*4*WORD_SIZE(src_reg) /* prefetch three chunks ahead of the chunk currently held in tmp1..tmp4 */
    2.43 +1:      add     $4*WORD_SIZE, src_reg /* advance src to the next chunk; label 1 is the loop entry that skips the prefetch */
    2.44 +        movnti  tmp1_reg, (dst_reg) /* each pair: store the previously loaded word, then reload the register from the next chunk */
    2.45 +        mov     (src_reg), tmp1_reg
    2.46 +        dec     %ecx /* sets the flags tested by jg/jpe below; the movnti/mov/lea in between do not modify flags */
    2.47 +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
    2.48 +        mov     WORD_SIZE(src_reg), tmp2_reg
    2.49 +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
    2.50 +        mov     2*WORD_SIZE(src_reg), tmp3_reg
    2.51 +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
    2.52 +        lea     4*WORD_SIZE(dst_reg), dst_reg /* advance dst; lea (unlike add) leaves the flags from dec intact */
    2.53 +        mov     3*WORD_SIZE(src_reg), tmp4_reg
    2.54 +        jg      0b /* counter still > 0: next iteration with prefetch */
    2.55 +        jpe     1b /* counter 0 or -1 (low byte 0x00/0xff has even parity): two more iterations without prefetch, whose prefetch target would lie past the page end; -2 (0xfe, odd parity) falls through */
    2.56 +
    2.57 +        movnti  tmp1_reg, (dst_reg) /* store the last chunk, already loaded into tmp1..tmp4 by the final loop iteration */
    2.58 +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
    2.59 +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
    2.60 +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
    2.61 +
    2.62 +#ifdef __i386__
    2.63 +        pop     %edi /* restore saved registers in reverse push order */
    2.64 +        pop     %esi
    2.65 +        pop     %ebp
    2.66 +        pop     %ebx
    2.67 +#endif
    2.68 +        sfence /* order the preceding non-temporal stores ahead of any later stores before returning */
    2.69 +        ret
     3.1 --- a/xen/arch/x86/domain.c	Wed Nov 12 12:01:35 2008 +0000
     3.2 +++ b/xen/arch/x86/domain.c	Wed Nov 12 12:04:15 2008 +0000
     3.3 @@ -184,7 +184,8 @@ static int setup_compat_l4(struct vcpu *
     3.4      /* This page needs to look like a pagetable so that it can be shadowed */
     3.5      pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
     3.6  
     3.7 -    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
     3.8 +    l4tab = page_to_virt(pg);
     3.9 +    copy_page(l4tab, idle_pg_table);
    3.10      l4tab[0] = l4e_empty();
    3.11      l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
    3.12          l4e_from_page(pg, __PAGE_HYPERVISOR);
     4.1 --- a/xen/arch/x86/domain_build.c	Wed Nov 12 12:01:35 2008 +0000
     4.2 +++ b/xen/arch/x86/domain_build.c	Wed Nov 12 12:04:15 2008 +0000
     4.3 @@ -455,8 +455,9 @@ int __init construct_dom0(
     4.4      /* WARNING: The new domain must have its 'processor' field filled in! */
     4.5      l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
     4.6      l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
     4.7 -    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
     4.8 -    for (i = 0; i < 4; i++) {
     4.9 +    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
    4.10 +        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
    4.11 +                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
    4.12          l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
    4.13          l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
    4.14              l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
     5.1 --- a/xen/include/asm-x86/page.h	Wed Nov 12 12:01:35 2008 +0000
     5.2 +++ b/xen/include/asm-x86/page.h	Wed Nov 12 12:04:15 2008 +0000
     5.3 @@ -215,7 +215,10 @@ void clear_page_sse2(void *);
     5.4  #define clear_page(_p)      (cpu_has_xmm2 ?                             \
     5.5                               clear_page_sse2((void *)(_p)) :            \
     5.6                               (void)memset((void *)(_p), 0, PAGE_SIZE))
     5.7 -#define copy_page(_t,_f)    memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
     5.8 +void copy_page_sse2(void *, const void *);
     5.9 +#define copy_page(_t,_f)    (cpu_has_xmm2 ?                             \
    5.10 +                             copy_page_sse2(_t, _f) :                   \
    5.11 +                             (void)memcpy(_t, _f, PAGE_SIZE))
    5.12  
    5.13  #define mfn_valid(mfn)      ((mfn) < max_page)
    5.14