direct-io.hg

changeset 6836:b7276814008c

Begin updating to 2.6.13 base
author djm@kirby.fc.hp.com
date Wed Aug 31 14:32:27 2005 -0600 (2005-08-31)
parents 44316ce83277
children 4e4f1db8ea94
files
	xen/arch/ia64/Makefile
	xen/arch/ia64/linux-xen/minstate.h
	xen/arch/ia64/linux-xen/setup.c
	xen/arch/ia64/linux-xen/sort.c
	xen/arch/ia64/linux/README.origin
	xen/arch/ia64/linux/extable.c
	xen/arch/ia64/linux/ia64_ksyms.c
	xen/arch/ia64/linux/irq_lsapic.c
	xen/arch/ia64/linux/lib/flush.S
	xen/arch/ia64/linux/lib/memcpy_mck.S
	xen/arch/ia64/linux/lib/memset.S
	xen/arch/ia64/linux/pcdp.h
	xen/include/asm-ia64/linux/sort.h
line diff
     1.1 --- a/xen/arch/ia64/Makefile	Tue Aug 30 17:51:51 2005 -0600
     1.2 +++ b/xen/arch/ia64/Makefile	Wed Aug 31 14:32:27 2005 -0600
     1.3 @@ -1,19 +1,22 @@
     1.4  include $(BASEDIR)/Rules.mk
     1.5  
     1.6 -VPATH = linux linux-xen
     1.7 +VPATH = linux linux-xen linux/lib
     1.8 +#VPATH = linux-xen linux/lib
     1.9  
    1.10  # libs-y	+= arch/ia64/lib/lib.a
    1.11  
    1.12  OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \
    1.13 -	xenmisc.o pdb-stub.o acpi.o hypercall.o \
    1.14 +	xenmisc.o acpi.o hypercall.o \
    1.15  	machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \
    1.16  	idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \
    1.17  	xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \
    1.18 -	extable.o linuxextable.o xenirq.o xentime.o \
    1.19 +	extable.o linuxextable.o sort.o xenirq.o xentime.o \
    1.20  	regionreg.o entry.o unaligned.o privop.o vcpu.o \
    1.21  	irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \
    1.22  	grant_table.o sn_console.o
    1.23  
    1.24 +#OBJS += idiv64.o idiv32.o			\
    1.25 +
    1.26  # TMP holder to contain *.0 moved out of CONFIG_VTI
    1.27  OBJS += vmx_init.o
    1.28  
    1.29 @@ -22,6 +25,13 @@ OBJS += vmx_virt.o vmx_vcpu.o vmx_proces
    1.30  	vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \
    1.31  	vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o
    1.32  endif
    1.33 +
    1.34 +# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
    1.35 +OBJS +=	bitop.o clear_page.o flush.o copy_page_mck.o			\
    1.36 +	memset.o strlen.o memcpy_mck.o 					\
    1.37 +	__divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
    1.38 +	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
    1.39 +
    1.40  # perfmon.o
    1.41  # unwind.o needed for kernel unwinding (rare)
    1.42  
    1.43 @@ -30,8 +40,8 @@ OBJS := $(subst $(TARGET_ARCH)/asm-offse
    1.44  # remove following line if not privifying in memory
    1.45  # OBJS += privify.o
    1.46  
    1.47 -default: $(OBJS) head.o ia64lib.o xen.lds.s
    1.48 -	$(LD) -r -o arch.o $(OBJS) ia64lib.o
    1.49 +default: $(OBJS) head.o xen.lds.s
    1.50 +	$(LD) -r -o arch.o $(OBJS)
    1.51  	$(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \
    1.52  		-Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms
    1.53  	$(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET)
    1.54 @@ -79,12 +89,29 @@ xen.lds.s: xen.lds.S
    1.55  	$(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \
    1.56  		-o xen.lds.s xen.lds.S
    1.57  
    1.58 -ia64lib.o:
    1.59 -	$(MAKE) -C linux/lib && cp linux/lib/ia64lib.o .
    1.60 +# variants of divide/modulo
    1.61 +# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
    1.62 +__divdi3.o: idiv64.S
    1.63 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    1.64 +__udivdi3.o: idiv64.S
    1.65 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    1.66 +__moddi3.o: idiv64.S
    1.67 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    1.68 +__umoddi3.o: idiv64.S
    1.69 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    1.70 +__divsi3.o: idiv32.S
    1.71 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    1.72 +__udivsi3.o: idiv32.S
    1.73 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    1.74 +__modsi3.o: idiv32.S
    1.75 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    1.76 +__umodsi3.o: idiv32.S
    1.77 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    1.78 +
    1.79  
    1.80  clean:
    1.81  	rm -f *.o *~ core  xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s
    1.82  	rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h
    1.83 -	rm -f lib/*.o
    1.84 +	rm -f linux/lib/*.o
    1.85  
    1.86  .PHONY: default clean
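
The eight pattern rules added above compile idiv64.S and idiv32.S four times each, with -DMODULO and -DUNSIGNED selecting which division helper the resulting object provides. As a minimal sketch, assuming the standard libgcc semantics for these helpers (the _model names are illustrative, not from the patch), the four 64-bit variants reduce to:

/* Model of the four helpers built from idiv64.S; each -D combination
 * selects one operation (libgcc semantics assumed). */
long long divdi3_model(long long a, long long b)                              { return a / b; }  /* __divdi3:  no flags            */
unsigned long long udivdi3_model(unsigned long long a, unsigned long long b) { return a / b; }  /* __udivdi3: -DUNSIGNED          */
long long moddi3_model(long long a, long long b)                             { return a % b; }  /* __moddi3:  -DMODULO            */
unsigned long long umoddi3_model(unsigned long long a, unsigned long long b) { return a % b; }  /* __umoddi3: -DMODULO -DUNSIGNED */
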
     2.1 --- a/xen/arch/ia64/lib/Makefile	Tue Aug 30 17:51:51 2005 -0600
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,44 +0,0 @@
     2.4 -#
     2.5 -# Makefile for ia64-specific library routines..
     2.6 -#
     2.7 -
     2.8 -include $(BASEDIR)/Rules.mk
     2.9 -
    2.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
    2.11 -	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
    2.12 -	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
    2.13 -	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
    2.14 -	flush.o ip_fast_csum.o do_csum.o copy_user.o			\
    2.15 -	memset.o strlen.o memcpy.o 
    2.16 -
    2.17 -default: $(OBJS)
    2.18 -	$(LD) -r -o ia64lib.o $(OBJS)
    2.19 -
    2.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
    2.21 -
    2.22 -__divdi3.o: idiv64.S
    2.23 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    2.24 -
    2.25 -__udivdi3.o: idiv64.S
    2.26 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    2.27 -
    2.28 -__moddi3.o: idiv64.S
    2.29 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    2.30 -
    2.31 -__umoddi3.o: idiv64.S
    2.32 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    2.33 -
    2.34 -__divsi3.o: idiv32.S
    2.35 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    2.36 -
    2.37 -__udivsi3.o: idiv32.S
    2.38 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    2.39 -
    2.40 -__modsi3.o: idiv32.S
    2.41 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    2.42 -
    2.43 -__umodsi3.o: idiv32.S
    2.44 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    2.45 -
    2.46 -clean:
    2.47 -	rm -f *.o *~
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/ia64/linux-xen/minstate.h	Wed Aug 31 14:32:27 2005 -0600
     3.3 @@ -0,0 +1,254 @@
     3.4 +#include <linux/config.h>
     3.5 +
     3.6 +#include <asm/cache.h>
     3.7 +
     3.8 +#include "entry.h"
     3.9 +
    3.10 +/*
    3.11 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
    3.12 + * on interrupts.
    3.13 + *
    3.14 + *  On entry:
    3.15 + *	r1:	pointer to current task (ar.k6)
    3.16 + */
    3.17 +#define MINSTATE_START_SAVE_MIN_VIRT								\
    3.18 +(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
    3.19 +	;;											\
    3.20 +(pUStk)	mov.m r24=ar.rnat;									\
    3.21 +(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
    3.22 +(pKStk) mov r1=sp;					/* get sp  */				\
    3.23 +	;;											\
    3.24 +(pUStk) lfetch.fault.excl.nt1 [r22];								\
    3.25 +(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
    3.26 +(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
    3.27 +	;;											\
    3.28 +(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
    3.29 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
    3.30 +	;;											\
    3.31 +(pUStk)	mov r18=ar.bsp;										\
    3.32 +(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
    3.33 +
    3.34 +#define MINSTATE_END_SAVE_MIN_VIRT								\
    3.35 +	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
    3.36 +	;;
    3.37 +
    3.38 +/*
    3.39 + * For mca_asm.S we want to access the stack physically since the state is saved before we
    3.40 + * go virtual and don't want to destroy the iip or ipsr.
    3.41 + */
    3.42 +#define MINSTATE_START_SAVE_MIN_PHYS								\
    3.43 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;								\
    3.44 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;							\
    3.45 +(pKStk) ld8 r3 = [r3];;										\
    3.46 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;						\
    3.47 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;						\
    3.48 +(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
    3.49 +(pUStk)	addl r22=IA64_RBS_OFFSET,r1;		/* compute base of register backing store */	\
    3.50 +	;;											\
    3.51 +(pUStk)	mov r24=ar.rnat;									\
    3.52 +(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
    3.53 +(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
    3.54 +(pUStk)	dep r22=-1,r22,61,3;			/* compute kernel virtual addr of RBS */	\
    3.55 +	;;											\
    3.56 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
    3.57 +(pUStk)	mov ar.bspstore=r22;			/* switch to kernel RBS */			\
    3.58 +	;;											\
    3.59 +(pUStk)	mov r18=ar.bsp;										\
    3.60 +(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
    3.61 +
    3.62 +#define MINSTATE_END_SAVE_MIN_PHYS								\
    3.63 +	dep r12=-1,r12,61,3;		/* make sp a kernel virtual address */			\
    3.64 +	;;
    3.65 +
    3.66 +#ifdef MINSTATE_VIRT
    3.67 +# define MINSTATE_GET_CURRENT(reg)	\
    3.68 +		movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
    3.69 +		ld8 reg=[reg]
    3.70 +# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_VIRT
    3.71 +# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_VIRT
    3.72 +#endif
    3.73 +
    3.74 +#ifdef MINSTATE_PHYS
    3.75 +# define MINSTATE_GET_CURRENT(reg)	mov reg=IA64_KR(CURRENT);; tpa reg=reg
    3.76 +# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_PHYS
    3.77 +# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_PHYS
    3.78 +#endif
    3.79 +
    3.80 +/*
    3.81 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
    3.82 + * the minimum state necessary that allows us to turn psr.ic back
    3.83 + * on.
    3.84 + *
    3.85 + * Assumed state upon entry:
    3.86 + *	psr.ic: off
    3.87 + *	r31:	contains saved predicates (pr)
    3.88 + *
    3.89 + * Upon exit, the state is as follows:
    3.90 + *	psr.ic: off
    3.91 + *	 r2 = points to &pt_regs.r16
    3.92 + *	 r8 = contents of ar.ccv
    3.93 + *	 r9 = contents of ar.csd
    3.94 + *	r10 = contents of ar.ssd
    3.95 + *	r11 = FPSR_DEFAULT
    3.96 + *	r12 = kernel sp (kernel virtual address)
    3.97 + *	r13 = points to current task_struct (kernel virtual address)
    3.98 + *	p15 = TRUE if psr.i is set in cr.ipsr
    3.99 + *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
   3.100 + *		preserved
   3.101 + *
   3.102 + * Note that psr.ic is NOT turned on by this macro.  This is so that
   3.103 + * we can pass interruption state as arguments to a handler.
   3.104 + */
   3.105 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
   3.106 +	MINSTATE_GET_CURRENT(r16);	/* M (or M;;I) */					\
   3.107 +	mov r27=ar.rsc;			/* M */							\
   3.108 +	mov r20=r1;			/* A */							\
   3.109 +	mov r25=ar.unat;		/* M */							\
   3.110 +	mov r29=cr.ipsr;		/* M */							\
   3.111 +	mov r26=ar.pfs;			/* I */							\
   3.112 +	mov r28=cr.iip;			/* M */							\
   3.113 +	mov r21=ar.fpsr;		/* M */							\
   3.114 +	COVER;				/* B;; (or nothing) */					\
   3.115 +	;;											\
   3.116 +	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
   3.117 +	;;											\
   3.118 +	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
   3.119 +	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
   3.120 +	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
   3.121 +	/* switch from user to kernel RBS: */							\
   3.122 +	;;											\
   3.123 +	invala;				/* M */							\
   3.124 +	SAVE_IFS;										\
   3.125 +	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
   3.126 +	;;											\
   3.127 +	MINSTATE_START_SAVE_MIN									\
   3.128 +	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
   3.129 +	adds r16=PT(CR_IPSR),r1;								\
   3.130 +	;;											\
   3.131 +	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
   3.132 +	st8 [r16]=r29;		/* save cr.ipsr */						\
   3.133 +	;;											\
   3.134 +	lfetch.fault.excl.nt1 [r17];								\
   3.135 +	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
   3.136 +	mov r29=b0										\
   3.137 +	;;											\
   3.138 +	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
   3.139 +	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
   3.140 +(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
   3.141 +	;;											\
   3.142 +.mem.offset 0,0; st8.spill [r16]=r8,16;								\
   3.143 +.mem.offset 8,0; st8.spill [r17]=r9,16;								\
   3.144 +        ;;											\
   3.145 +.mem.offset 0,0; st8.spill [r16]=r10,24;							\
   3.146 +.mem.offset 8,0; st8.spill [r17]=r11,24;							\
   3.147 +        ;;											\
   3.148 +	st8 [r16]=r28,16;	/* save cr.iip */						\
   3.149 +	st8 [r17]=r30,16;	/* save cr.ifs */						\
   3.150 +(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
   3.151 +	mov r8=ar.ccv;										\
   3.152 +	mov r9=ar.csd;										\
   3.153 +	mov r10=ar.ssd;										\
   3.154 +	movl r11=FPSR_DEFAULT;   /* L-unit */							\
   3.155 +	;;											\
   3.156 +	st8 [r16]=r25,16;	/* save ar.unat */						\
   3.157 +	st8 [r17]=r26,16;	/* save ar.pfs */						\
   3.158 +	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
   3.159 +	;;											\
   3.160 +	st8 [r16]=r27,16;	/* save ar.rsc */						\
   3.161 +(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
   3.162 +(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
   3.163 +	;;			/* avoid RAW on r16 & r17 */					\
   3.164 +(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
   3.165 +	st8 [r17]=r31,16;	/* save predicates */						\
   3.166 +(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
   3.167 +	;;											\
   3.168 +	st8 [r16]=r29,16;	/* save b0 */							\
   3.169 +	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
   3.170 +	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
   3.171 +	;;											\
   3.172 +.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
   3.173 +.mem.offset 8,0; st8.spill [r17]=r12,16;							\
   3.174 +	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
   3.175 +	;;											\
   3.176 +.mem.offset 0,0; st8.spill [r16]=r13,16;							\
   3.177 +.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
   3.178 +	movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;					\
   3.179 +	ld8 r13=[r13];			/* establish 'current' */				\
   3.180 +	;;											\
   3.181 +.mem.offset 0,0; st8.spill [r16]=r15,16;							\
   3.182 +.mem.offset 8,0; st8.spill [r17]=r14,16;							\
   3.183 +	;;											\
   3.184 +.mem.offset 0,0; st8.spill [r16]=r2,16;								\
   3.185 +.mem.offset 8,0; st8.spill [r17]=r3,16;								\
   3.186 +	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
   3.187 +	;;											\
   3.188 +	EXTRA;											\
   3.189 +	movl r1=__gp;		/* establish kernel global pointer */				\
   3.190 +	;;											\
   3.191 +	MINSTATE_END_SAVE_MIN
   3.192 +
   3.193 +/*
   3.194 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
   3.195 + *
   3.196 + * Assumed state upon entry:
   3.197 + *	psr.ic: on
   3.198 + *	r2:	points to &pt_regs.r16
   3.199 + *	r3:	points to &pt_regs.r17
   3.200 + *	r8:	contents of ar.ccv
   3.201 + *	r9:	contents of ar.csd
   3.202 + *	r10:	contents of ar.ssd
   3.203 + *	r11:	FPSR_DEFAULT
   3.204 + *
   3.205 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
   3.206 + */
   3.207 +#define SAVE_REST				\
   3.208 +.mem.offset 0,0; st8.spill [r2]=r16,16;		\
   3.209 +.mem.offset 8,0; st8.spill [r3]=r17,16;		\
   3.210 +	;;					\
   3.211 +.mem.offset 0,0; st8.spill [r2]=r18,16;		\
   3.212 +.mem.offset 8,0; st8.spill [r3]=r19,16;		\
   3.213 +	;;					\
   3.214 +.mem.offset 0,0; st8.spill [r2]=r20,16;		\
   3.215 +.mem.offset 8,0; st8.spill [r3]=r21,16;		\
   3.216 +	mov r18=b6;				\
   3.217 +	;;					\
   3.218 +.mem.offset 0,0; st8.spill [r2]=r22,16;		\
   3.219 +.mem.offset 8,0; st8.spill [r3]=r23,16;		\
   3.220 +	mov r19=b7;				\
   3.221 +	;;					\
   3.222 +.mem.offset 0,0; st8.spill [r2]=r24,16;		\
   3.223 +.mem.offset 8,0; st8.spill [r3]=r25,16;		\
   3.224 +	;;					\
   3.225 +.mem.offset 0,0; st8.spill [r2]=r26,16;		\
   3.226 +.mem.offset 8,0; st8.spill [r3]=r27,16;		\
   3.227 +	;;					\
   3.228 +.mem.offset 0,0; st8.spill [r2]=r28,16;		\
   3.229 +.mem.offset 8,0; st8.spill [r3]=r29,16;		\
   3.230 +	;;					\
   3.231 +.mem.offset 0,0; st8.spill [r2]=r30,16;		\
   3.232 +.mem.offset 8,0; st8.spill [r3]=r31,32;		\
   3.233 +	;;					\
   3.234 +	mov ar.fpsr=r11;	/* M-unit */	\
   3.235 +	st8 [r2]=r8,8;		/* ar.ccv */	\
   3.236 +	adds r24=PT(B6)-PT(F7),r3;		\
   3.237 +	;;					\
   3.238 +	stf.spill [r2]=f6,32;			\
   3.239 +	stf.spill [r3]=f7,32;			\
   3.240 +	;;					\
   3.241 +	stf.spill [r2]=f8,32;			\
   3.242 +	stf.spill [r3]=f9,32;			\
   3.243 +	;;					\
   3.244 +	stf.spill [r2]=f10;			\
   3.245 +	stf.spill [r3]=f11;			\
   3.246 +	adds r25=PT(B7)-PT(F11),r3;		\
   3.247 +	;;					\
   3.248 +	st8 [r24]=r18,16;       /* b6 */	\
   3.249 +	st8 [r25]=r19,16;       /* b7 */	\
   3.250 +	;;					\
   3.251 +	st8 [r24]=r9;        	/* ar.csd */	\
   3.252 +	st8 [r25]=r10;      	/* ar.ssd */	\
   3.253 +	;;
   3.254 +
   3.255 +#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(cover, mov r30=cr.ifs,)
   3.256 +#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
   3.257 +#define SAVE_MIN		DO_SAVE_MIN(     , mov r30=r0, )
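
The pivot of DO_SAVE_MIN above is the on_ustack handshake: the flag is loaded and cleared in one step, and its old value drives the pKStk/pUStk predicates that choose between reusing sp and switching to the kernel register backing store. A small standalone C model of just that decision (the struct and the strings are illustrative, not the kernel layout):

#include <stdio.h>

struct thread_model { int on_ustack; };

static const char *save_min_path(struct thread_model *t)
{
    int was_on_ustack = t->on_ustack;   /* ld1 r17=[r16]  */
    t->on_ustack = 0;                   /* st1 [r16]=r0   */
    /* cmp.eq pKStk,pUStk=r0,r17: kernel stack iff the flag was clear */
    return was_on_ustack ? "pUStk: switch to kernel RBS and stack"
                         : "pKStk: already in kernel mode, reuse sp";
}

int main(void)
{
    struct thread_model t = { .on_ustack = 1 };
    puts(save_min_path(&t));   /* interruption from user level   */
    puts(save_min_path(&t));   /* nested interruption, in kernel */
    return 0;
}
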
     4.1 --- a/xen/arch/ia64/linux-xen/setup.c	Tue Aug 30 17:51:51 2005 -0600
     4.2 +++ b/xen/arch/ia64/linux-xen/setup.c	Wed Aug 31 14:32:27 2005 -0600
     4.3 @@ -4,10 +4,15 @@
     4.4   * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
     4.5   *	David Mosberger-Tang <davidm@hpl.hp.com>
     4.6   *	Stephane Eranian <eranian@hpl.hp.com>
     4.7 - * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
     4.8 + * Copyright (C) 2000, 2004 Intel Corp
     4.9 + * 	Rohit Seth <rohit.seth@intel.com>
    4.10 + * 	Suresh Siddha <suresh.b.siddha@intel.com>
    4.11 + * 	Gordon Jin <gordon.jin@intel.com>
    4.12   * Copyright (C) 1999 VA Linux Systems
    4.13   * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
    4.14   *
    4.15 + * 12/26/04 S.Siddha, G.Jin, R.Seth
    4.16 + *			Add multi-threading and multi-core detection
    4.17   * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
    4.18   * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
    4.19   * 03/31/00 R.Seth	cpu_initialized and current->processor fixes
    4.20 @@ -15,6 +20,7 @@
    4.21   * 02/01/00 R.Seth	fixed get_cpuinfo for SMP
    4.22   * 01/07/99 S.Eranian	added the support for command line argument
    4.23   * 06/24/99 W.Drummond	added boot_cpu_data.
    4.24 + * 05/28/05 Z. Menyhart	Dynamic stride size for "flush_icache_range()"
    4.25   */
    4.26  #include <linux/config.h>
    4.27  #include <linux/module.h>
    4.28 @@ -35,6 +41,10 @@
    4.29  #include <linux/serial_core.h>
    4.30  #include <linux/efi.h>
    4.31  #include <linux/initrd.h>
    4.32 +#ifndef XEN
    4.33 +#include <linux/platform.h>
    4.34 +#include <linux/pm.h>
    4.35 +#endif
    4.36  
    4.37  #include <asm/ia32.h>
    4.38  #include <asm/machvec.h>
    4.39 @@ -51,8 +61,10 @@
    4.40  #include <asm/smp.h>
    4.41  #include <asm/system.h>
    4.42  #include <asm/unistd.h>
    4.43 +#ifdef XEN
    4.44  #include <asm/vmx.h>
    4.45  #include <asm/io.h>
    4.46 +#endif
    4.47  
    4.48  #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
    4.49  # error "struct cpuinfo_ia64 too big!"
    4.50 @@ -64,12 +76,16 @@ EXPORT_SYMBOL(__per_cpu_offset);
    4.51  #endif
    4.52  
    4.53  DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
    4.54 +#ifdef XEN
    4.55  DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr);
    4.56 +#endif
    4.57  DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
    4.58  DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
    4.59  unsigned long ia64_cycles_per_usec;
    4.60  struct ia64_boot_param *ia64_boot_param;
    4.61  struct screen_info screen_info;
    4.62 +unsigned long vga_console_iobase;
    4.63 +unsigned long vga_console_membase;
    4.64  
    4.65  unsigned long ia64_max_cacheline_size;
    4.66  unsigned long ia64_iobase;	/* virtual address for I/O accesses */
    4.67 @@ -78,7 +94,12 @@ struct io_space io_space[MAX_IO_SPACES];
    4.68  EXPORT_SYMBOL(io_space);
    4.69  unsigned int num_io_spaces;
    4.70  
    4.71 -unsigned char aux_device_present = 0xaa;        /* XXX remove this when legacy I/O is gone */
    4.72 +/*
    4.73 + * "flush_icache_range()" needs to know what processor dependent stride size to use
    4.74 + * when it makes i-cache(s) coherent with d-caches.
    4.75 + */
    4.76 +#define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
    4.77 +unsigned long ia64_i_cache_stride_shift = ~0;
    4.78  
    4.79  /*
    4.80   * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
    4.81 @@ -287,23 +308,25 @@ io_port_init (void)
    4.82  static inline int __init
    4.83  early_console_setup (char *cmdline)
    4.84  {
    4.85 +	int earlycons = 0;
    4.86 +
    4.87  #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
    4.88  	{
    4.89  		extern int sn_serial_console_early_setup(void);
    4.90  		if (!sn_serial_console_early_setup())
    4.91 -			return 0;
    4.92 +			earlycons++;
    4.93  	}
    4.94  #endif
    4.95  #ifdef CONFIG_EFI_PCDP
    4.96  	if (!efi_setup_pcdp_console(cmdline))
    4.97 -		return 0;
    4.98 +		earlycons++;
    4.99  #endif
   4.100  #ifdef CONFIG_SERIAL_8250_CONSOLE
   4.101  	if (!early_serial_console_init(cmdline))
   4.102 -		return 0;
   4.103 +		earlycons++;
   4.104  #endif
   4.105  
   4.106 -	return -1;
   4.107 +	return (earlycons) ? 0 : -1;
   4.108  }
   4.109  
   4.110  static inline void
   4.111 @@ -315,7 +338,34 @@ mark_bsp_online (void)
   4.112  #endif
   4.113  }
   4.114  
   4.115 -void __init
   4.116 +#ifdef CONFIG_SMP
   4.117 +static void
   4.118 +check_for_logical_procs (void)
   4.119 +{
   4.120 +	pal_logical_to_physical_t info;
   4.121 +	s64 status;
   4.122 +
   4.123 +	status = ia64_pal_logical_to_phys(0, &info);
   4.124 +	if (status == -1) {
   4.125 +		printk(KERN_INFO "No logical to physical processor mapping "
   4.126 +		       "available\n");
   4.127 +		return;
   4.128 +	}
   4.129 +	if (status) {
   4.130 +		printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
   4.131 +		       status);
   4.132 +		return;
   4.133 +	}
   4.134 +	/*
   4.135 +	 * Total number of siblings that BSP has.  Though not all of them 
   4.136 +	 * may have booted successfully. The correct number of siblings 
   4.137 +	 * booted is in info.overview_num_log.
   4.138 +	 */
   4.139 +	smp_num_siblings = info.overview_tpc;
   4.140 +	smp_num_cpucores = info.overview_cpp;
   4.141 +}
   4.142 +#endif
   4.143 +
   4.144  #ifdef XEN
   4.145  early_setup_arch (char **cmdline_p)
   4.146  #else
   4.147 @@ -398,6 +448,19 @@ late_setup_arch (char **cmdline_p)
   4.148  
   4.149  #ifdef CONFIG_SMP
   4.150  	cpu_physical_id(0) = hard_smp_processor_id();
   4.151 +
   4.152 +	cpu_set(0, cpu_sibling_map[0]);
   4.153 +	cpu_set(0, cpu_core_map[0]);
   4.154 +
   4.155 +	check_for_logical_procs();
   4.156 +	if (smp_num_cpucores > 1)
   4.157 +		printk(KERN_INFO
   4.158 +		       "cpu package is Multi-Core capable: number of cores=%d\n",
   4.159 +		       smp_num_cpucores);
   4.160 +	if (smp_num_siblings > 1)
   4.161 +		printk(KERN_INFO
   4.162 +		       "cpu package is Multi-Threading capable: number of siblings=%d\n",
   4.163 +		       smp_num_siblings);
   4.164  #endif
   4.165  
   4.166  #ifdef XEN
   4.167 @@ -505,12 +568,23 @@ show_cpuinfo (struct seq_file *m, void *
   4.168  		   "cpu regs   : %u\n"
   4.169  		   "cpu MHz    : %lu.%06lu\n"
   4.170  		   "itc MHz    : %lu.%06lu\n"
   4.171 -		   "BogoMIPS   : %lu.%02lu\n\n",
   4.172 +		   "BogoMIPS   : %lu.%02lu\n",
   4.173  		   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
   4.174  		   features, c->ppn, c->number,
   4.175  		   c->proc_freq / 1000000, c->proc_freq % 1000000,
   4.176  		   c->itc_freq / 1000000, c->itc_freq % 1000000,
   4.177  		   lpj*HZ/500000, (lpj*HZ/5000) % 100);
   4.178 +#ifdef CONFIG_SMP
   4.179 +	seq_printf(m, "siblings   : %u\n", c->num_log);
   4.180 +	if (c->threads_per_core > 1 || c->cores_per_socket > 1)
   4.181 +		seq_printf(m,
   4.182 +		   	   "physical id: %u\n"
   4.183 +		   	   "core id    : %u\n"
   4.184 +		   	   "thread id  : %u\n",
   4.185 +		   	   c->socket_id, c->core_id, c->thread_id);
   4.186 +#endif
   4.187 +	seq_printf(m,"\n");
   4.188 +
   4.189  	return 0;
   4.190  }
   4.191  
   4.192 @@ -581,6 +655,14 @@ identify_cpu (struct cpuinfo_ia64 *c)
   4.193  	memcpy(c->vendor, cpuid.field.vendor, 16);
   4.194  #ifdef CONFIG_SMP
   4.195  	c->cpu = smp_processor_id();
   4.196 +
   4.197 +	/* below default values will be overwritten  by identify_siblings() 
   4.198 +	 * for Multi-Threading/Multi-Core capable cpu's
   4.199 +	 */
   4.200 +	c->threads_per_core = c->cores_per_socket = c->num_log = 1;
   4.201 +	c->socket_id = -1;
   4.202 +
   4.203 +	identify_siblings(c);
   4.204  #endif
   4.205  	c->ppn = cpuid.field.ppn;
   4.206  	c->number = cpuid.field.number;
   4.207 @@ -611,6 +693,12 @@ setup_per_cpu_areas (void)
   4.208  	/* start_kernel() requires this... */
   4.209  }
   4.210  
   4.211 +/*
   4.212 + * Calculate the max. cache line size.
   4.213 + *
   4.214 + * In addition, the minimum of the i-cache stride sizes is calculated for
   4.215 + * "flush_icache_range()".
   4.216 + */
   4.217  static void
   4.218  get_max_cacheline_size (void)
   4.219  {
   4.220 @@ -624,6 +712,8 @@ get_max_cacheline_size (void)
   4.221                  printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
   4.222                         __FUNCTION__, status);
   4.223                  max = SMP_CACHE_BYTES;
   4.224 +		/* Safest setup for "flush_icache_range()" */
   4.225 +		ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
   4.226  		goto out;
   4.227          }
   4.228  
   4.229 @@ -632,14 +722,31 @@ get_max_cacheline_size (void)
   4.230  						    &cci);
   4.231  		if (status != 0) {
   4.232  			printk(KERN_ERR
   4.233 -			       "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n",
   4.234 +			       "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
   4.235  			       __FUNCTION__, l, status);
   4.236  			max = SMP_CACHE_BYTES;
   4.237 +			/* The safest setup for "flush_icache_range()" */
   4.238 +			cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
   4.239 +			cci.pcci_unified = 1;
   4.240  		}
   4.241  		line_size = 1 << cci.pcci_line_size;
   4.242  		if (line_size > max)
   4.243  			max = line_size;
   4.244 -        }
   4.245 +		if (!cci.pcci_unified) {
   4.246 +			status = ia64_pal_cache_config_info(l,
   4.247 +						    /* cache_type (instruction)= */ 1,
   4.248 +						    &cci);
   4.249 +			if (status != 0) {
   4.250 +				printk(KERN_ERR
   4.251 +				"%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
   4.252 +					__FUNCTION__, l, status);
   4.253 +				/* The safest setup for "flush_icache_range()" */
   4.254 +				cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
   4.255 +			}
   4.256 +		}
   4.257 +		if (cci.pcci_stride < ia64_i_cache_stride_shift)
   4.258 +			ia64_i_cache_stride_shift = cci.pcci_stride;
   4.259 +	}
   4.260    out:
   4.261  	if (max > ia64_max_cacheline_size)
   4.262  		ia64_max_cacheline_size = max;
   4.263 @@ -700,7 +807,17 @@ cpu_init (void)
   4.264  	ia64_set_kr(IA64_KR_FPU_OWNER, 0);
   4.265  
   4.266  	/*
   4.267 -	 * Initialize default control register to defer all speculative faults.  The
   4.268 +	 * Initialize the page-table base register to a global
   4.269 +	 * directory with all zeroes.  This ensure that we can handle
   4.270 +	 * TLB-misses to user address-space even before we created the
   4.271 +	 * first user address-space.  This may happen, e.g., due to
   4.272 +	 * aggressive use of lfetch.fault.
   4.273 +	 */
   4.274 +	ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
   4.275 +
   4.276 +	/*
   4.277 +	 * Initialize default control register to defer speculative faults except
   4.278 +	 * for those arising from TLB misses, which are not deferred.  The
   4.279  	 * kernel MUST NOT depend on a particular setting of these bits (in other words,
   4.280  	 * the kernel must have recovery code for all speculative accesses).  Turn on
   4.281  	 * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
   4.282 @@ -762,6 +879,9 @@ cpu_init (void)
   4.283  	/* size of physical stacked register partition plus 8 bytes: */
   4.284  	__get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
   4.285  	platform_cpu_init();
   4.286 +#ifndef XEN
   4.287 +	pm_idle = default_idle;
   4.288 +#endif
   4.289  }
   4.290  
   4.291  void
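
The early_console_setup() change above replaces the early returns with an earlycons counter, so every configured early console gets a chance to initialize, and the function reports success if at least one came up. A standalone sketch of that control flow (the stub setup functions are hypothetical):

#include <stdio.h>

static int console_a(void) { return -1; }  /* fails to probe */
static int console_b(void) { return 0;  }  /* initializes    */

/* New behavior: try every console, succeed if any did. */
static int early_console_setup_model(int (*setups[])(void), int n)
{
    int earlycons = 0;
    for (int i = 0; i < n; i++)
        if (!setups[i]())            /* 0 == success, Linux convention */
            earlycons++;
    return earlycons ? 0 : -1;
}

int main(void)
{
    int (*setups[])(void) = { console_a, console_b };
    printf("%d\n", early_console_setup_model(setups, 2));  /* prints 0 */
    return 0;
}
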
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/xen/arch/ia64/linux-xen/sort.c	Wed Aug 31 14:32:27 2005 -0600
     5.3 @@ -0,0 +1,122 @@
     5.4 +/*
     5.5 + * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
     5.6 + *
     5.7 + * Jan 23 2005  Matt Mackall <mpm@selenic.com>
     5.8 + */
     5.9 +
    5.10 +#include <linux/kernel.h>
    5.11 +#include <linux/module.h>
    5.12 +#ifdef XEN
    5.13 +#include <linux/types.h>
    5.14 +#endif
    5.15 +
    5.16 +void u32_swap(void *a, void *b, int size)
    5.17 +{
    5.18 +	u32 t = *(u32 *)a;
    5.19 +	*(u32 *)a = *(u32 *)b;
    5.20 +	*(u32 *)b = t;
    5.21 +}
    5.22 +
    5.23 +void generic_swap(void *a, void *b, int size)
    5.24 +{
    5.25 +	char t;
    5.26 +
    5.27 +	do {
    5.28 +		t = *(char *)a;
    5.29 +		*(char *)a++ = *(char *)b;
    5.30 +		*(char *)b++ = t;
    5.31 +	} while (--size > 0);
    5.32 +}
    5.33 +
    5.34 +/*
    5.35 + * sort - sort an array of elements
    5.36 + * @base: pointer to data to sort
    5.37 + * @num: number of elements
    5.38 + * @size: size of each element
    5.39 + * @cmp: pointer to comparison function
    5.40 + * @swap: pointer to swap function or NULL
    5.41 + *
    5.42 + * This function does a heapsort on the given array. You may provide a
    5.43 + * swap function optimized to your element type.
    5.44 + *
    5.45 + * Sorting time is O(n log n) both on average and worst-case. While
    5.46 + * qsort is about 20% faster on average, it suffers from exploitable
    5.47 + * O(n*n) worst-case behavior and extra memory requirements that make
    5.48 + * it less suitable for kernel use.
    5.49 + */
    5.50 +
    5.51 +void sort(void *base, size_t num, size_t size,
    5.52 +	  int (*cmp)(const void *, const void *),
    5.53 +	  void (*swap)(void *, void *, int size))
    5.54 +{
    5.55 +	/* pre-scale counters for performance */
    5.56 +	int i = (num/2) * size, n = num * size, c, r;
    5.57 +
    5.58 +	if (!swap)
    5.59 +		swap = (size == 4 ? u32_swap : generic_swap);
    5.60 +
    5.61 +	/* heapify */
    5.62 +	for ( ; i >= 0; i -= size) {
    5.63 +		for (r = i; r * 2 < n; r  = c) {
    5.64 +			c = r * 2;
    5.65 +			if (c < n - size && cmp(base + c, base + c + size) < 0)
    5.66 +				c += size;
    5.67 +			if (cmp(base + r, base + c) >= 0)
    5.68 +				break;
    5.69 +			swap(base + r, base + c, size);
    5.70 +		}
    5.71 +	}
    5.72 +
    5.73 +	/* sort */
    5.74 +	for (i = n - size; i >= 0; i -= size) {
    5.75 +		swap(base, base + i, size);
    5.76 +		for (r = 0; r * 2 < i; r = c) {
    5.77 +			c = r * 2;
    5.78 +			if (c < i - size && cmp(base + c, base + c + size) < 0)
    5.79 +				c += size;
    5.80 +			if (cmp(base + r, base + c) >= 0)
    5.81 +				break;
    5.82 +			swap(base + r, base + c, size);
    5.83 +		}
    5.84 +	}
    5.85 +}
    5.86 +
    5.87 +EXPORT_SYMBOL(sort);
    5.88 +
    5.89 +#if 0
    5.90 +/* a simple boot-time regression test */
    5.91 +
    5.92 +int cmpint(const void *a, const void *b)
    5.93 +{
    5.94 +	return *(int *)a - *(int *)b;
    5.95 +}
    5.96 +
    5.97 +static int sort_test(void)
    5.98 +{
    5.99 +	int *a, i, r = 1;
   5.100 +
   5.101 +	a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
   5.102 +	BUG_ON(!a);
   5.103 +
   5.104 +	printk("testing sort()\n");
   5.105 +
   5.106 +	for (i = 0; i < 1000; i++) {
   5.107 +		r = (r * 725861) % 6599;
   5.108 +		a[i] = r;
   5.109 +	}
   5.110 +
   5.111 +	sort(a, 1000, sizeof(int), cmpint, NULL);
   5.112 +
   5.113 +	for (i = 0; i < 999; i++)
   5.114 +		if (a[i] > a[i+1]) {
   5.115 +			printk("sort() failed!\n");
   5.116 +			break;
   5.117 +		}
   5.118 +
   5.119 +	kfree(a);
   5.120 +
   5.121 +	return 0;
   5.122 +}
   5.123 +
   5.124 +module_init(sort_test);
   5.125 +#endif
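
Because sort() takes its element size in bytes, all of the heap indexing above is pre-scaled: i, n, c and r are byte offsets, and the children of the node at offset r sit at 2*r and 2*r+size. A standalone copy of the same algorithm (generic_swap path only, structure kept deliberately close to the original) together with the usage pattern from the #if 0 self-test:

#include <stdio.h>
#include <stddef.h>

static void heapsort_model(void *base, size_t num, size_t size,
                           int (*cmp)(const void *, const void *))
{
    /* counters pre-scaled to byte offsets, as in sort() above */
    int i = (num / 2) * size, n = num * size, c, r;
    char *b = base, t;

    for ( ; i >= 0; i -= size)                     /* heapify */
        for (r = i; r * 2 < n; r = c) {
            c = r * 2;
            if (c < n - (int)size && cmp(b + c, b + c + size) < 0)
                c += size;
            if (cmp(b + r, b + c) >= 0)
                break;
            for (size_t k = 0; k < size; k++)      /* generic_swap */
                t = b[r+k], b[r+k] = b[c+k], b[c+k] = t;
        }

    for (i = n - size; i >= 0; i -= size) {        /* sort */
        for (size_t k = 0; k < size; k++)          /* pop max to slot i */
            t = b[k], b[k] = b[i+k], b[i+k] = t;
        for (r = 0; r * 2 < i; r = c) {
            c = r * 2;
            if (c < i - (int)size && cmp(b + c, b + c + size) < 0)
                c += size;
            if (cmp(b + r, b + c) >= 0)
                break;
            for (size_t k = 0; k < size; k++)
                t = b[r+k], b[r+k] = b[c+k], b[c+k] = t;
        }
    }
}

static int cmpint(const void *a, const void *b)
{
    return *(const int *)a - *(const int *)b;
}

int main(void)
{
    int a[] = { 5, 2, 9, 1, 7 };
    heapsort_model(a, 5, sizeof(int), cmpint);
    for (int i = 0; i < 5; i++)
        printf("%d ", a[i]);                       /* 1 2 5 7 9 */
    printf("\n");
    return 0;
}
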
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/xen/arch/ia64/linux/README.origin	Wed Aug 31 14:32:27 2005 -0600
     6.3 @@ -0,0 +1,20 @@
     6.4 +Source files in this directory are identical copies of linux-2.6.13 files:
     6.5 +
     6.6 +cmdline.c		-> linux/lib/cmdline.c
     6.7 +efi_stub.S		-> linux/arch/ia64/efi_stub.S
     6.8 +extable.c		-> linux/arch/ia64/mm/extable.c
     6.9 +hpsim.S			-> linux/arch/ia64/hp/sim/hpsim.S
    6.10 +ia64_ksyms.c		-> linux/arch/ia64/kernel/ia64_ksyms.c
    6.11 +linuxextable.c		-> linux/kernel/extable.c
    6.12 +machvec.c		-> linux/arch/ia64/kernel/machvec.c
    6.13 +patch.c			-> linux/arch/ia64/kernel/patch.c
    6.14 +pcdp.h			-> drivers/firmware/pcdp.h
    6.15 +lib/bitop.c		-> linux/arch/ia64/lib/bitop.c
    6.16 +lib/clear_page.S	-> linux/arch/ia64/lib/clear_page.S
    6.17 +lib/copy_page_mck.S	-> linux/arch/ia64/lib/copy_page_mck.S
    6.18 +lib/flush.S		-> linux/arch/ia64/lib/flush.S
    6.19 +lib/idiv32.S		-> linux/arch/ia64/lib/idiv32.S
    6.20 +lib/idiv64.S		-> linux/arch/ia64/lib/idiv64.S
    6.21 +lib/memcpy_mck.S	-> linux/arch/ia64/lib/memcpy_mck.S
    6.22 +lib/memset.S		-> linux/arch/ia64/lib/memset.S
    6.23 +lib/strlen.S		-> linux/arch/ia64/lib/strlen.S
     7.1 --- a/xen/arch/ia64/linux/extable.c	Tue Aug 30 17:51:51 2005 -0600
     7.2 +++ b/xen/arch/ia64/linux/extable.c	Wed Aug 31 14:32:27 2005 -0600
     7.3 @@ -6,29 +6,29 @@
     7.4   */
     7.5  
     7.6  #include <linux/config.h>
     7.7 +#include <linux/sort.h>
     7.8  
     7.9  #include <asm/uaccess.h>
    7.10  #include <asm/module.h>
    7.11  
    7.12 -static inline int
    7.13 -compare_entries (struct exception_table_entry *l, struct exception_table_entry *r)
    7.14 +static int cmp_ex(const void *a, const void *b)
    7.15  {
    7.16 +	const struct exception_table_entry *l = a, *r = b;
    7.17  	u64 lip = (u64) &l->addr + l->addr;
    7.18  	u64 rip = (u64) &r->addr + r->addr;
    7.19  
    7.20 +	/* avoid overflow */
    7.21 +	if (lip > rip)
    7.22 +		return 1;
    7.23  	if (lip < rip)
    7.24  		return -1;
    7.25 -	if (lip == rip)
    7.26 -		return 0;
    7.27 -	else
    7.28 -		return 1;
    7.29 +	return 0;
    7.30  }
    7.31  
    7.32 -static inline void
    7.33 -swap_entries (struct exception_table_entry *l, struct exception_table_entry *r)
    7.34 +static void swap_ex(void *a, void *b, int size)
    7.35  {
    7.36 +	struct exception_table_entry *l = a, *r = b, tmp;
    7.37  	u64 delta = (u64) r - (u64) l;
    7.38 -	struct exception_table_entry tmp;
    7.39  
    7.40  	tmp = *l;
    7.41  	l->addr = r->addr + delta;
    7.42 @@ -38,23 +38,20 @@ swap_entries (struct exception_table_ent
    7.43  }
    7.44  
    7.45  /*
    7.46 - * Sort the exception table.  It's usually already sorted, but there may be unordered
    7.47 - * entries due to multiple text sections (such as the .init text section).  Note that the
    7.48 - * exception-table-entries contain location-relative addresses, which requires a bit of
    7.49 - * care during sorting to avoid overflows in the offset members (e.g., it would not be
    7.50 - * safe to make a temporary copy of an exception-table entry on the stack, because the
    7.51 - * stack may be more than 2GB away from the exception-table).
    7.52 + * Sort the exception table. It's usually already sorted, but there
    7.53 + * may be unordered entries due to multiple text sections (such as the
    7.54 + * .init text section). Note that the exception-table-entries contain
    7.55 + * location-relative addresses, which requires a bit of care during
    7.56 + * sorting to avoid overflows in the offset members (e.g., it would
    7.57 + * not be safe to make a temporary copy of an exception-table entry on
    7.58 + * the stack, because the stack may be more than 2GB away from the
    7.59 + * exception-table).
    7.60   */
    7.61 -void
    7.62 -sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish)
    7.63 +void sort_extable (struct exception_table_entry *start,
    7.64 +		   struct exception_table_entry *finish)
    7.65  {
    7.66 -	struct exception_table_entry *p, *q;
    7.67 -
    7.68 - 	/* insertion sort */
    7.69 -	for (p = start + 1; p < finish; ++p)
    7.70 -		/* start .. p-1 is sorted; push p down to it's proper place */
    7.71 -		for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q)
    7.72 -			swap_entries(&q[0], &q[-1]);
    7.73 +	sort(start, finish - start, sizeof(struct exception_table_entry),
    7.74 +	     cmp_ex, swap_ex);
    7.75  }
    7.76  
    7.77  const struct exception_table_entry *
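
cmp_ex() and swap_ex() above deal in location-relative entries: each addr field stores the target minus the field's own address, which is why swapping two entries must add the distance between them back into the fields. A runnable illustration of that invariant (the struct is simplified to one widened field; the real entries use 32-bit offsets):

#include <stdio.h>
#include <stdint.h>

struct ent { int64_t addr; };   /* self-relative: target - &field */

static uint64_t resolve(const struct ent *e)
{
    return (uint64_t)(uintptr_t)&e->addr + e->addr;
}

static void set(struct ent *e, uint64_t target)
{
    e->addr = (int64_t)(target - (uint64_t)(uintptr_t)&e->addr);
}

/* swap_ex()'s trick: an entry moved by `delta` bytes must have `delta`
 * folded into its self-relative field to keep resolving to the same
 * target. */
static void swap_ents(struct ent *l, struct ent *r)
{
    int64_t delta = (int64_t)((char *)r - (char *)l);
    struct ent tmp = *l;
    l->addr = r->addr + delta;   /* r's payload now lives at l */
    r->addr = tmp.addr - delta;  /* l's payload now lives at r */
}

int main(void)
{
    struct ent t[2];
    set(&t[0], 0x1000);
    set(&t[1], 0x2000);
    swap_ents(&t[0], &t[1]);
    printf("%llx %llx\n",        /* 2000 1000: targets swapped intact */
           (unsigned long long)resolve(&t[0]),
           (unsigned long long)resolve(&t[1]));
    return 0;
}
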
     8.1 --- a/xen/arch/ia64/linux/ia64_ksyms.c	Tue Aug 30 17:51:51 2005 -0600
     8.2 +++ b/xen/arch/ia64/linux/ia64_ksyms.c	Wed Aug 31 14:32:27 2005 -0600
     8.3 @@ -58,9 +58,6 @@ EXPORT_SYMBOL(__strlen_user);
     8.4  EXPORT_SYMBOL(__strncpy_from_user);
     8.5  EXPORT_SYMBOL(__strnlen_user);
     8.6  
     8.7 -#include <asm/unistd.h>
     8.8 -EXPORT_SYMBOL(__ia64_syscall);
     8.9 -
    8.10  /* from arch/ia64/lib */
    8.11  extern void __divsi3(void);
    8.12  extern void __udivsi3(void);
    10.1 --- a/xen/arch/ia64/linux/lib/Makefile	Tue Aug 30 17:51:51 2005 -0600
    10.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.3 @@ -1,44 +0,0 @@
    10.4 -#
    10.5 -# Makefile for ia64-specific library routines..
    10.6 -#
    10.7 -
    10.8 -include $(BASEDIR)/Rules.mk
    10.9 -
   10.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
   10.11 -	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
   10.12 -	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
   10.13 -	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
   10.14 -	flush.o ip_fast_csum.o do_csum.o copy_user.o			\
   10.15 -	memset.o strlen.o memcpy.o 
   10.16 -
   10.17 -default: $(OBJS)
   10.18 -	$(LD) -r -o ia64lib.o $(OBJS)
   10.19 -
   10.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
   10.21 -
   10.22 -__divdi3.o: idiv64.S
   10.23 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
   10.24 -
   10.25 -__udivdi3.o: idiv64.S
   10.26 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
   10.27 -
   10.28 -__moddi3.o: idiv64.S
   10.29 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
   10.30 -
   10.31 -__umoddi3.o: idiv64.S
   10.32 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
   10.33 -
   10.34 -__divsi3.o: idiv32.S
   10.35 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
   10.36 -
   10.37 -__udivsi3.o: idiv32.S
   10.38 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
   10.39 -
   10.40 -__modsi3.o: idiv32.S
   10.41 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
   10.42 -
   10.43 -__umodsi3.o: idiv32.S
   10.44 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
   10.45 -
   10.46 -clean:
   10.47 -	rm -f *.o *~
    11.1 --- a/xen/arch/ia64/linux/lib/carta_random.S	Tue Aug 30 17:51:51 2005 -0600
    11.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.3 @@ -1,54 +0,0 @@
    11.4 -/*
    11.5 - * Fast, simple, yet decent quality random number generator based on
    11.6 - * a paper by David G. Carta ("Two Fast Implementations of the
    11.7 - * `Minimal Standard' Random Number Generator," Communications of the
    11.8 - * ACM, January, 1990).
    11.9 - *
   11.10 - * Copyright (C) 2002 Hewlett-Packard Co
   11.11 - *	David Mosberger-Tang <davidm@hpl.hp.com>
   11.12 - */
   11.13 -
   11.14 -#include <asm/asmmacro.h>
   11.15 -
   11.16 -#define a	r2
   11.17 -#define m	r3
   11.18 -#define lo	r8
   11.19 -#define hi	r9
   11.20 -#define t0	r16
   11.21 -#define t1	r17
   11.22 -#define	seed	r32
   11.23 -
   11.24 -GLOBAL_ENTRY(carta_random32)
   11.25 -	movl	a = (16807 << 16) | 16807
   11.26 -	;;
   11.27 -	pmpyshr2.u t0 = a, seed, 0
   11.28 -	pmpyshr2.u t1 = a, seed, 16
   11.29 -	;;
   11.30 -	unpack2.l t0 = t1, t0
   11.31 -	dep	m = -1, r0, 0, 31
   11.32 -	;;
   11.33 -	zxt4	lo = t0
   11.34 -	shr.u	hi = t0, 32
   11.35 -	;;
   11.36 -	dep	t0 = 0, hi, 15, 49	// t0 = (hi & 0x7fff)
   11.37 -	;;
   11.38 -	shl	t0 = t0, 16		// t0 = (hi & 0x7fff) << 16
   11.39 -	shr	t1 = hi, 15		// t1 = (hi >> 15)
   11.40 -	;;
   11.41 -	add	lo = lo, t0
   11.42 -	;;
   11.43 -	cmp.gtu	p6, p0 = lo, m
   11.44 -	;;
   11.45 -(p6)	and	lo = lo, m
   11.46 -	;;
   11.47 -(p6)	add	lo = 1, lo
   11.48 -	;;
   11.49 -	add	lo = lo, t1
   11.50 -	;;
   11.51 -	cmp.gtu p6, p0 = lo, m
   11.52 -	;;
   11.53 -(p6)	and	lo = lo, m
   11.54 -	;;
   11.55 -(p6)	add	lo = 1, lo
   11.56 -	br.ret.sptk.many rp
   11.57 -END(carta_random32)
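
The deleted carta_random32 implements the minimal-standard generator seed' = 16807 * seed mod (2^31 - 1) using Carta's no-division trick: split the 46-bit product at bit 31 and fold the high part back in, since 2^31 is congruent to 1 modulo the Mersenne prime. A C rendering of the same computation (a model of the arithmetic, not of the pmpyshr2 register choreography above):

#include <stdio.h>
#include <stdint.h>

static uint32_t carta_random32_model(uint32_t seed)
{
    const uint32_t m = 0x7fffffff;        /* 2^31 - 1, the modulus     */
    uint64_t p = (uint64_t)16807 * seed;  /* 46-bit product            */
    uint32_t lo = (uint32_t)(p & m);      /* low 31 bits               */
    uint32_t hi = (uint32_t)(p >> 31);    /* high 15 bits              */
    uint32_t r = lo + hi;                 /* 2^31 == 1 (mod m): fold   */
    if (r > m)
        r -= m;                           /* single correction step    */
    return r;
}

int main(void)
{
    uint32_t s = 1;
    for (int i = 0; i < 3; i++) {
        s = carta_random32_model(s);
        printf("%u\n", s);  /* 16807, 282475249, 1622650073 */
    }
    return 0;
}
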
    12.1 --- a/xen/arch/ia64/linux/lib/checksum.c	Tue Aug 30 17:51:51 2005 -0600
    12.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.3 @@ -1,102 +0,0 @@
    12.4 -/*
    12.5 - * Network checksum routines
    12.6 - *
    12.7 - * Copyright (C) 1999, 2003 Hewlett-Packard Co
    12.8 - *	Stephane Eranian <eranian@hpl.hp.com>
    12.9 - *
   12.10 - * Most of the code coming from arch/alpha/lib/checksum.c
   12.11 - *
   12.12 - * This file contains network checksum routines that are better done
   12.13 - * in an architecture-specific manner due to speed..
   12.14 - */
   12.15 -
   12.16 -#include <linux/module.h>
   12.17 -#include <linux/string.h>
   12.18 -
   12.19 -#include <asm/byteorder.h>
   12.20 -
   12.21 -static inline unsigned short
   12.22 -from64to16 (unsigned long x)
   12.23 -{
   12.24 -	/* add up 32-bit words for 33 bits */
   12.25 -	x = (x & 0xffffffff) + (x >> 32);
   12.26 -	/* add up 16-bit and 17-bit words for 17+c bits */
   12.27 -	x = (x & 0xffff) + (x >> 16);
   12.28 -	/* add up 16-bit and 2-bit for 16+c bit */
   12.29 -	x = (x & 0xffff) + (x >> 16);
   12.30 -	/* add up carry.. */
   12.31 -	x = (x & 0xffff) + (x >> 16);
   12.32 -	return x;
   12.33 -}
   12.34 -
   12.35 -/*
   12.36 - * computes the checksum of the TCP/UDP pseudo-header
   12.37 - * returns a 16-bit checksum, already complemented.
   12.38 - */
   12.39 -unsigned short int
   12.40 -csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
   12.41 -		   unsigned short proto, unsigned int sum)
   12.42 -{
   12.43 -	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
   12.44 -			   ((unsigned long) proto << 8));
   12.45 -}
   12.46 -
   12.47 -EXPORT_SYMBOL(csum_tcpudp_magic);
   12.48 -
   12.49 -unsigned int
   12.50 -csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
   12.51 -		    unsigned short proto, unsigned int sum)
   12.52 -{
   12.53 -	unsigned long result;
   12.54 -
   12.55 -	result = (saddr + daddr + sum +
   12.56 -		  ((unsigned long) ntohs(len) << 16) +
   12.57 -		  ((unsigned long) proto << 8));
   12.58 -
   12.59 -	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
   12.60 -	/* 64 to 33 */
   12.61 -	result = (result & 0xffffffff) + (result >> 32);
   12.62 -	/* 33 to 32 */
   12.63 -	result = (result & 0xffffffff) + (result >> 32);
   12.64 -	return result;
   12.65 -}
   12.66 -
   12.67 -extern unsigned long do_csum (const unsigned char *, long);
   12.68 -
   12.69 -/*
   12.70 - * computes the checksum of a memory block at buff, length len,
   12.71 - * and adds in "sum" (32-bit)
   12.72 - *
   12.73 - * returns a 32-bit number suitable for feeding into itself
   12.74 - * or csum_tcpudp_magic
   12.75 - *
   12.76 - * this function must be called with even lengths, except
   12.77 - * for the last fragment, which may be odd
   12.78 - *
   12.79 - * it's best to have buff aligned on a 32-bit boundary
   12.80 - */
   12.81 -unsigned int
   12.82 -csum_partial (const unsigned char * buff, int len, unsigned int sum)
   12.83 -{
   12.84 -	unsigned long result = do_csum(buff, len);
   12.85 -
   12.86 -	/* add in old sum, and carry.. */
   12.87 -	result += sum;
   12.88 -	/* 32+c bits -> 32 bits */
   12.89 -	result = (result & 0xffffffff) + (result >> 32);
   12.90 -	return result;
   12.91 -}
   12.92 -
   12.93 -EXPORT_SYMBOL(csum_partial);
   12.94 -
   12.95 -/*
   12.96 - * this routine is used for miscellaneous IP-like checksums, mainly
   12.97 - * in icmp.c
   12.98 - */
   12.99 -unsigned short
  12.100 -ip_compute_csum (unsigned char * buff, int len)
  12.101 -{
  12.102 -	return ~do_csum(buff,len);
  12.103 -}
  12.104 -
  12.105 -EXPORT_SYMBOL(ip_compute_csum);
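
The folding helper at the heart of the deleted checksum routines deserves a quick standalone demonstration: from64to16() repeatedly adds the carry bits back into the low half until the ones-complement sum fits in 16 bits.

#include <stdio.h>

static unsigned short from64to16(unsigned long long x)
{
    x = (x & 0xffffffff) + (x >> 32);  /* 64 -> 33-bit partial sum */
    x = (x & 0xffff) + (x >> 16);      /* -> 17+c bits             */
    x = (x & 0xffff) + (x >> 16);      /* -> 16+c bits             */
    x = (x & 0xffff) + (x >> 16);      /* absorb the final carry   */
    return (unsigned short)x;
}

int main(void)
{
    /* 0xffff + 0xffff = 0x1fffe; folding yields 0xffff */
    printf("0x%x\n", from64to16(0x1fffeULL));
    return 0;
}
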
    13.1 --- a/xen/arch/ia64/linux/lib/clear_user.S	Tue Aug 30 17:51:51 2005 -0600
    13.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.3 @@ -1,209 +0,0 @@
    13.4 -/*
    13.5 - * This routine clears to zero a linear memory buffer in user space.
    13.6 - *
    13.7 - * Inputs:
    13.8 - *	in0:	address of buffer
    13.9 - *	in1:	length of buffer in bytes
   13.10 - * Outputs:
   13.11 - *	r8:	number of bytes that didn't get cleared due to a fault
   13.12 - *
   13.13 - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
   13.14 - *	Stephane Eranian <eranian@hpl.hp.com>
   13.15 - */
   13.16 -
   13.17 -#include <asm/asmmacro.h>
   13.18 -
   13.19 -//
   13.20 -// arguments
   13.21 -//
   13.22 -#define buf		r32
   13.23 -#define len		r33
   13.24 -
   13.25 -//
   13.26 -// local registers
   13.27 -//
   13.28 -#define cnt		r16
   13.29 -#define buf2		r17
   13.30 -#define saved_lc	r18
   13.31 -#define saved_pfs	r19
   13.32 -#define tmp		r20
   13.33 -#define len2		r21
   13.34 -#define len3		r22
   13.35 -
   13.36 -//
   13.37 -// Theory of operations:
   13.38 -//	- we check whether or not the buffer is small, i.e., less than 17
   13.39 -//	  in which case we do the byte by byte loop.
   13.40 -//
   13.41 -//	- Otherwise we go progressively from 1 byte store to 8byte store in
   13.42 -//	  the head part, the body is a 16byte store loop and we finish we the
   13.43 -//	  tail for the last 15 bytes.
   13.44 -//	  The good point about this breakdown is that the long buffer handling
   13.45 -//	  contains only 2 branches.
   13.46 -//
   13.47 -//	The reason for not using shifting & masking for both the head and the
   13.48 -//	tail is to stay semantically correct. This routine is not supposed
   13.49 -//	to write bytes outside of the buffer. While most of the time this would
   13.50 -//	be ok, we can't tolerate a mistake. A classical example is the case
   13.51 -//	of multithreaded code were to the extra bytes touched is actually owned
   13.52 -//	by another thread which runs concurrently to ours. Another, less likely,
   13.53 -//	example is with device drivers where reading an I/O mapped location may
   13.54 -//	have side effects (same thing for writing).
   13.55 -//
   13.56 -
   13.57 -GLOBAL_ENTRY(__do_clear_user)
   13.58 -	.prologue
   13.59 -	.save ar.pfs, saved_pfs
   13.60 -	alloc	saved_pfs=ar.pfs,2,0,0,0
   13.61 -	cmp.eq p6,p0=r0,len		// check for zero length
   13.62 -	.save ar.lc, saved_lc
   13.63 -	mov saved_lc=ar.lc		// preserve ar.lc (slow)
   13.64 -	.body
   13.65 -	;;				// avoid WAW on CFM
   13.66 -	adds tmp=-1,len			// br.ctop is repeat/until
   13.67 -	mov ret0=len			// return value is length at this point
   13.68 -(p6)	br.ret.spnt.many rp
   13.69 -	;;
   13.70 -	cmp.lt p6,p0=16,len		// if len > 16 then long memset
   13.71 -	mov ar.lc=tmp			// initialize lc for small count
   13.72 -(p6)	br.cond.dptk .long_do_clear
   13.73 -	;;				// WAR on ar.lc
   13.74 -	//
   13.75 -	// worst case 16 iterations, avg 8 iterations
   13.76 -	//
   13.77 -	// We could have played with the predicates to use the extra
   13.78 -	// M slot for 2 stores/iteration but the cost the initialization
   13.79 -	// the various counters compared to how long the loop is supposed
   13.80 -	// to last on average does not make this solution viable.
   13.81 -	//
   13.82 -1:
   13.83 -	EX( .Lexit1, st1 [buf]=r0,1 )
   13.84 -	adds len=-1,len			// countdown length using len
   13.85 -	br.cloop.dptk 1b
   13.86 -	;;				// avoid RAW on ar.lc
   13.87 -	//
   13.88 -	// .Lexit4: comes from byte by byte loop
   13.89 -	//	    len contains bytes left
   13.90 -.Lexit1:
   13.91 -	mov ret0=len			// faster than using ar.lc
   13.92 -	mov ar.lc=saved_lc
   13.93 -	br.ret.sptk.many rp		// end of short clear_user
   13.94 -
   13.95 -
   13.96 -	//
   13.97 -	// At this point we know we have more than 16 bytes to copy
   13.98 -	// so we focus on alignment (no branches required)
   13.99 -	//
  13.100 -	// The use of len/len2 for countdown of the number of bytes left
  13.101 -	// instead of ret0 is due to the fact that the exception code
  13.102 -	// changes the values of r8.
  13.103 -	//
  13.104 -.long_do_clear:
  13.105 -	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
  13.106 -	;;
  13.107 -	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
  13.108 -(p6)	adds len=-1,len;;		// sync because buf is modified
  13.109 -	tbit.nz p6,p0=buf,1
  13.110 -	;;
  13.111 -	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
  13.112 -(p6)	adds len=-2,len;;
  13.113 -	tbit.nz p6,p0=buf,2
  13.114 -	;;
  13.115 -	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
  13.116 -(p6)	adds len=-4,len;;
  13.117 -	tbit.nz p6,p0=buf,3
  13.118 -	;;
  13.119 -	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
  13.120 -(p6)	adds len=-8,len;;
  13.121 -	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
  13.122 -	;;
  13.123 -	cmp.eq p6,p0=r0,cnt
  13.124 -	adds tmp=-1,cnt
  13.125 -(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
  13.126 -	;;
  13.127 -	adds buf2=8,buf			// setup second base pointer
  13.128 -	mov ar.lc=tmp
  13.129 -	;;
  13.130 -
  13.131 -	//
  13.132 -	// 16bytes/iteration core loop
  13.133 -	//
  13.134 -	// The second store can never generate a fault because
  13.135 -	// we come into the loop only when we are 16-byte aligned.
  13.136 -	// This means that if we cross a page then it will always be
  13.137 -	// in the first store and never in the second.
  13.138 -	//
  13.139 -	//
  13.140 -	// We need to keep track of the remaining length. A possible (optimistic)
  13.141 -	// way would be to use ar.lc and derive how many byte were left by
  13.142 -	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
  13.143 -	// every iteration.
  13.144 -	// However we need to keep the synchronization point. A template
  13.145 -	// M;;MB does not exist and thus we can keep the addition at no
  13.146 -	// extra cycle cost (use a nop slot anyway). It also simplifies the
  13.147 -	// (unlikely)  error recovery code
  13.148 -	//
  13.149 -
  13.150 -2:	EX(.Lexit3, st8 [buf]=r0,16 )
  13.151 -	;;				// needed to get len correct when error
  13.152 -	st8 [buf2]=r0,16
  13.153 -	adds len=-16,len
  13.154 -	br.cloop.dptk 2b
  13.155 -	;;
  13.156 -	mov ar.lc=saved_lc
  13.157 -	//
  13.158 -	// tail correction based on len only
  13.159 -	//
  13.160 -	// We alternate the use of len3,len2 to allow parallelism and correct
  13.161 -	// error handling. We also reuse p6/p7 to return correct value.
  13.162 -	// The addition of len2/len3 does not cost anything more compared to
  13.163 -	// the regular memset as we had empty slots.
  13.164 -	//
  13.165 -.dotail:
  13.166 -	mov len2=len			// for parallelization of error handling
  13.167 -	mov len3=len
  13.168 -	tbit.nz p6,p0=len,3
  13.169 -	;;
  13.170 -	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
  13.171 -(p6)	adds len3=-8,len2
  13.172 -	tbit.nz p7,p6=len,2
  13.173 -	;;
  13.174 -	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
  13.175 -(p7)	adds len2=-4,len3
  13.176 -	tbit.nz p6,p7=len,1
  13.177 -	;;
  13.178 -	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
  13.179 -(p6)	adds len3=-2,len2
  13.180 -	tbit.nz p7,p6=len,0
  13.181 -	;;
  13.182 -	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
  13.183 -	mov ret0=r0				// success
  13.184 -	br.ret.sptk.many rp			// end of most likely path
  13.185 -
  13.186 -	//
  13.187 -	// Outlined error handling code
  13.188 -	//
  13.189 -
  13.190 -	//
  13.191 -	// .Lexit3: comes from core loop, need restore pr/lc
  13.192 -	//	    len contains bytes left
  13.193 -	//
  13.194 -	//
  13.195 -	// .Lexit2:
  13.196 -	//	if p6 -> coming from st8 or st2 : len2 contains what's left
  13.197 -	//	if p7 -> coming from st4 or st1 : len3 contains what's left
  13.198 -	// We must restore lc/pr even though might not have been used.
  13.199 -.Lexit2:
  13.200 -	.pred.rel "mutex", p6, p7
  13.201 -(p6)	mov len=len2
  13.202 -(p7)	mov len=len3
  13.203 -	;;
  13.204 -	//
  13.205 -	// .Lexit4: comes from head, need not restore pr/lc
  13.206 -	//	    len contains bytes left
  13.207 -	//
  13.208 -.Lexit3:
  13.209 -	mov ret0=len
  13.210 -	mov ar.lc=saved_lc
  13.211 -	br.ret.sptk.many rp
  13.212 -END(__do_clear_user)
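
The "theory of operations" comment above describes the structure of the deleted routine: a byte loop for buffers of 16 bytes or less, otherwise a head that raises alignment with 1/2/4/8-byte stores, a core loop clearing 16 bytes per iteration, and a tail of descending store sizes for the last 15 bytes. The same breakdown in plain C, as an illustrative model with the user-space fault handling omitted:

#include <stdint.h>
#include <string.h>

static void clear_model(void *dst, size_t len)
{
    char *p = dst;

    if (len <= 16) {                       /* small: byte-by-byte loop */
        while (len--)
            *p++ = 0;
        return;
    }
    /* head: raise alignment one power of two at a time */
    if ((uintptr_t)p & 1) { *p = 0;          p += 1; len -= 1; }
    if ((uintptr_t)p & 2) { memset(p, 0, 2); p += 2; len -= 2; }
    if ((uintptr_t)p & 4) { memset(p, 0, 4); p += 4; len -= 4; }
    if ((uintptr_t)p & 8) { memset(p, 0, 8); p += 8; len -= 8; }

    /* body: 16 bytes (two 8-byte stores) per iteration */
    for (size_t n = len >> 4; n; n--) {
        memset(p, 0, 16);
        p += 16;
        len -= 16;
    }
    /* tail: at most 15 bytes, descending store sizes */
    if (len & 8) { memset(p, 0, 8); p += 8; }
    if (len & 4) { memset(p, 0, 4); p += 4; }
    if (len & 2) { memset(p, 0, 2); p += 2; }
    if (len & 1) { *p = 0; }
}
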
    14.1 --- a/xen/arch/ia64/linux/lib/copy_page.S	Tue Aug 30 17:51:51 2005 -0600
    14.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.3 @@ -1,98 +0,0 @@
    14.4 -/*
    14.5 - *
    14.6 - * Optimized version of the standard copy_page() function
    14.7 - *
    14.8 - * Inputs:
    14.9 - *	in0:	address of target page
   14.10 - *	in1:	address of source page
   14.11 - * Output:
   14.12 - *	no return value
   14.13 - *
   14.14 - * Copyright (C) 1999, 2001 Hewlett-Packard Co
   14.15 - *	Stephane Eranian <eranian@hpl.hp.com>
   14.16 - *	David Mosberger <davidm@hpl.hp.com>
   14.17 - *
   14.18 - * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
   14.19 - */
   14.20 -#include <asm/asmmacro.h>
   14.21 -#include <asm/page.h>
   14.22 -
   14.23 -#define PIPE_DEPTH	3
   14.24 -#define EPI		p[PIPE_DEPTH-1]
   14.25 -
   14.26 -#define lcount		r16
   14.27 -#define saved_pr	r17
   14.28 -#define saved_lc	r18
   14.29 -#define saved_pfs	r19
   14.30 -#define src1		r20
   14.31 -#define src2		r21
   14.32 -#define tgt1		r22
   14.33 -#define tgt2		r23
   14.34 -#define srcf		r24
   14.35 -#define tgtf		r25
   14.36 -#define tgt_last	r26
   14.37 -
   14.38 -#define Nrot		((8*PIPE_DEPTH+7)&~7)
   14.39 -
   14.40 -GLOBAL_ENTRY(copy_page)
   14.41 -	.prologue
   14.42 -	.save ar.pfs, saved_pfs
   14.43 -	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
   14.44 -
   14.45 -	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
   14.46 -	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
   14.47 -	.rotp p[PIPE_DEPTH]
   14.48 -
   14.49 -	.save ar.lc, saved_lc
   14.50 -	mov saved_lc=ar.lc
   14.51 -	mov ar.ec=PIPE_DEPTH
   14.52 -
   14.53 -	mov lcount=PAGE_SIZE/64-1
   14.54 -	.save pr, saved_pr
   14.55 -	mov saved_pr=pr
   14.56 -	mov pr.rot=1<<16
   14.57 -
   14.58 -	.body
   14.59 -
   14.60 -	mov src1=in1
   14.61 -	adds src2=8,in1
   14.62 -	mov tgt_last = PAGE_SIZE
   14.63 -	;;
   14.64 -	adds tgt2=8,in0
   14.65 -	add srcf=512,in1
   14.66 -	mov ar.lc=lcount
   14.67 -	mov tgt1=in0
   14.68 -	add tgtf=512,in0
   14.69 -	add tgt_last = tgt_last, in0
   14.70 -	;;
   14.71 -1:
   14.72 -(p[0])	ld8 t1[0]=[src1],16
   14.73 -(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
   14.74 -(p[0])	ld8 t2[0]=[src2],16
   14.75 -(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
   14.76 -	cmp.ltu p6,p0 = tgtf, tgt_last
   14.77 -	;;
   14.78 -(p[0])	ld8 t3[0]=[src1],16
   14.79 -(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
   14.80 -(p[0])	ld8 t4[0]=[src2],16
   14.81 -(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
   14.82 -	;;
   14.83 -(p[0])	ld8 t5[0]=[src1],16
   14.84 -(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
   14.85 -(p[0])	ld8 t6[0]=[src2],16
   14.86 -(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
   14.87 -	;;
   14.88 -(p[0])	ld8 t7[0]=[src1],16
   14.89 -(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
   14.90 -(p[0])	ld8 t8[0]=[src2],16
   14.91 -(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
   14.92 -
   14.93 -(p6)	lfetch [srcf], 64
   14.94 -(p6)	lfetch [tgtf], 64
   14.95 -	br.ctop.sptk.few 1b
   14.96 -	;;
   14.97 -	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
   14.98 -	mov ar.pfs=saved_pfs
   14.99 -	mov ar.lc=saved_lc
  14.100 -	br.ret.sptk.many rp
  14.101 -END(copy_page)
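
The rotated loop above is easier to follow in scalar form. Below is a minimal C sketch, assuming a 16KB page for illustration and approximating lfetch with __builtin_prefetch; the assembly additionally stops prefetching once tgtf passes tgt_last, which is omitted here.

#include <string.h>

#define SKETCH_PAGE_SIZE 16384UL	/* example value, not ia64's only one */

/* Scalar sketch of copy_page: two interleaved 8-byte streams, 64 bytes
 * per iteration, prefetching 512 bytes ahead as srcf/tgtf do. */
void copy_page_sketch(void *to, void *from)
{
	unsigned long *dst = to, *src = from;
	unsigned long i;

	for (i = 0; i < SKETCH_PAGE_SIZE / 64; i++) {
		__builtin_prefetch(src + 64, 0);	/* srcf: 512 bytes ahead */
		__builtin_prefetch(dst + 64, 1);	/* tgtf: 512 bytes ahead */
		memcpy(dst, src, 64);			/* the 8 ld8/st8 pairs */
		dst += 8;
		src += 8;
	}
}
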
    15.1 --- a/xen/arch/ia64/linux/lib/copy_user.S	Tue Aug 30 17:51:51 2005 -0600
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,610 +0,0 @@
    15.4 -/*
    15.5 - *
    15.6 - * Optimized version of the copy_user() routine.
    15.7 - * It is used to copy data across the kernel/user boundary.
    15.8 - *
    15.9 - * The source and destination are always on opposite sides of
   15.10 - * the boundary. When reading from user space we must catch
   15.11 - * faults on loads. When writing to user space we must catch
   15.12 - * errors on stores. Note that because of the nature of the copy
   15.13 - * we don't need to worry about overlapping regions.
   15.14 - *
   15.15 - *
   15.16 - * Inputs:
   15.17 - *	in0	address of source buffer
   15.18 - *	in1	address of destination buffer
   15.19 - *	in2	number of bytes to copy
   15.20 - *
   15.21 - * Outputs:
   15.22 - *	ret0	0 in case of success. The number of bytes NOT copied in
   15.23 - *		case of error.
   15.24 - *
   15.25 - * Copyright (C) 2000-2001 Hewlett-Packard Co
   15.26 - *	Stephane Eranian <eranian@hpl.hp.com>
   15.27 - *
   15.28 - * Fixme:
   15.29 - *	- handle the case where we have more than 16 bytes and the alignments
   15.30 - *	  are different.
   15.31 - *	- more benchmarking
   15.32 - *	- fix extraneous stop bit introduced by the EX() macro.
   15.33 - */
   15.34 -
   15.35 -#include <asm/asmmacro.h>
   15.36 -
   15.37 -//
   15.38 -// Tuneable parameters
   15.39 -//
   15.40 -#define COPY_BREAK	16	// we do byte copy below (must be >=16)
   15.41 -#define PIPE_DEPTH	21	// pipe depth
   15.42 -
   15.43 -#define EPI		p[PIPE_DEPTH-1]
   15.44 -
   15.45 -//
   15.46 -// arguments
   15.47 -//
   15.48 -#define dst		in0
   15.49 -#define src		in1
   15.50 -#define len		in2
   15.51 -
   15.52 -//
   15.53 -// local registers
   15.54 -//
   15.55 -#define t1		r2	// rshift in bytes
   15.56 -#define t2		r3	// lshift in bytes
   15.57 -#define rshift		r14	// right shift in bits
   15.58 -#define lshift		r15	// left shift in bits
   15.59 -#define word1		r16
   15.60 -#define word2		r17
   15.61 -#define cnt		r18
   15.62 -#define len2		r19
   15.63 -#define saved_lc	r20
   15.64 -#define saved_pr	r21
   15.65 -#define tmp		r22
   15.66 -#define val		r23
   15.67 -#define src1		r24
   15.68 -#define dst1		r25
   15.69 -#define src2		r26
   15.70 -#define dst2		r27
   15.71 -#define len1		r28
   15.72 -#define enddst		r29
   15.73 -#define endsrc		r30
   15.74 -#define saved_pfs	r31
   15.75 -
   15.76 -GLOBAL_ENTRY(__copy_user)
   15.77 -	.prologue
   15.78 -	.save ar.pfs, saved_pfs
   15.79 -	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
   15.80 -
   15.81 -	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
   15.82 -	.rotp p[PIPE_DEPTH]
   15.83 -
   15.84 -	adds len2=-1,len	// br.ctop is repeat/until
   15.85 -	mov ret0=r0
   15.86 -
   15.87 -	;;			// RAW of cfm when len=0
   15.88 -	cmp.eq p8,p0=r0,len	// check for zero length
   15.89 -	.save ar.lc, saved_lc
   15.90 -	mov saved_lc=ar.lc	// preserve ar.lc (slow)
   15.91 -(p8)	br.ret.spnt.many rp	// empty memcpy()
   15.92 -	;;
   15.93 -	add enddst=dst,len	// first byte after end of destination
   15.94 -	add endsrc=src,len	// first byte after end of source
   15.95 -	.save pr, saved_pr
   15.96 -	mov saved_pr=pr		// preserve predicates
   15.97 -
   15.98 -	.body
   15.99 -
  15.100 -	mov dst1=dst		// copy because of rotation
  15.101 -	mov ar.ec=PIPE_DEPTH
  15.102 -	mov pr.rot=1<<16	// p16=true all others are false
  15.103 -
  15.104 -	mov src1=src		// copy because of rotation
  15.105 -	mov ar.lc=len2		// initialize lc for small count
  15.106 -	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
  15.107 -
  15.108 -	xor tmp=src,dst		// same alignment test prepare
  15.109 -(p10)	br.cond.dptk .long_copy_user
  15.110 -	;;			// RAW pr.rot/p16 ?
  15.111 -	//
  15.112 -	// Now we do the byte by byte loop with software pipeline
  15.113 -	//
  15.114 -	// p7 is necessarily false by now
  15.115 -1:
  15.116 -	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  15.117 -	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  15.118 -	br.ctop.dptk.few 1b
  15.119 -	;;
  15.120 -	mov ar.lc=saved_lc
  15.121 -	mov pr=saved_pr,0xffffffffffff0000
  15.122 -	mov ar.pfs=saved_pfs		// restore ar.ec
  15.123 -	br.ret.sptk.many rp		// end of short memcpy
  15.124 -
  15.125 -	//
  15.126 -	// Not 8-byte aligned
  15.127 -	//
  15.128 -.diff_align_copy_user:
  15.129 -	// At this point we know we have more than 16 bytes to copy
  15.130 -	// and also that src and dest do _not_ have the same alignment.
  15.131 -	and src2=0x7,src1				// src offset
  15.132 -	and dst2=0x7,dst1				// dst offset
  15.133 -	;;
  15.134 -	// The basic idea is that we copy byte-by-byte at the head so
  15.135 -	// that we can reach 8-byte alignment for both src1 and dst1.
  15.136 -	// Then copy the body using software pipelined 8-byte copy,
  15.137 -	// shifting the two back-to-back words right and left, then copy
  15.138 -	// the tail by copying byte-by-byte.
  15.139 -	//
  15.140 -	// Fault handling. If the byte-by-byte at the head fails on the
  15.141 -	// load, then restart and finish the pipeline by copying zeros
  15.142 -	// to the dst1. Then copy zeros for the rest of dst1.
  15.143 -	// If 8-byte software pipeline fails on the load, do the same as
  15.144 -	// failure_in3 does. If the byte-by-byte at the tail fails, it is
  15.145 -	// handled simply by failure_in_pipe1.
  15.146 -	//
  15.147 -	// In the p14 case the source has more bytes in the first word
  15.148 -	// (by the shifted part), whereas in the p15 case we need to
  15.149 -	// copy some bytes from the 2nd word of the source into the
  15.150 -	// tail of the 1st word of the destination.
  15.151 -	//
  15.152 -
  15.153 -	//
  15.154 -	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
  15.155 -	// to copy the head to dst1 to start the 8-byte copy software pipeline.
  15.156 -	// We know src1 is not 8-byte aligned in this case.
  15.157 -	//
  15.158 -	cmp.eq p14,p15=r0,dst2
  15.159 -(p15)	br.cond.spnt 1f
  15.160 -	;;
  15.161 -	sub t1=8,src2
  15.162 -	mov t2=src2
  15.163 -	;;
  15.164 -	shl rshift=t2,3
  15.165 -	sub len1=len,t1					// set len1
  15.166 -	;;
  15.167 -	sub lshift=64,rshift
  15.168 -	;;
  15.169 -	br.cond.spnt .word_copy_user
  15.170 -	;;
  15.171 -1:
  15.172 -	cmp.leu	p14,p15=src2,dst2
  15.173 -	sub t1=dst2,src2
  15.174 -	;;
  15.175 -	.pred.rel "mutex", p14, p15
  15.176 -(p14)	sub word1=8,src2				// (8 - src offset)
  15.177 -(p15)	sub t1=r0,t1					// absolute value
  15.178 -(p15)	sub word1=8,dst2				// (8 - dst offset)
  15.179 -	;;
  15.180 -	// For the case p14, we don't need to copy the shifted part to
  15.181 -	// the 1st word of destination.
  15.182 -	sub t2=8,t1
  15.183 -(p14)	sub word1=word1,t1
  15.184 -	;;
  15.185 -	sub len1=len,word1				// resulting len
  15.186 -(p15)	shl rshift=t1,3					// in bits
  15.187 -(p14)	shl rshift=t2,3
  15.188 -	;;
  15.189 -(p14)	sub len1=len1,t1
  15.190 -	adds cnt=-1,word1
  15.191 -	;;
  15.192 -	sub lshift=64,rshift
  15.193 -	mov ar.ec=PIPE_DEPTH
  15.194 -	mov pr.rot=1<<16	// p16=true all others are false
  15.195 -	mov ar.lc=cnt
  15.196 -	;;
  15.197 -2:
  15.198 -	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
  15.199 -	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  15.200 -	br.ctop.dptk.few 2b
  15.201 -	;;
  15.202 -	clrrrb
  15.203 -	;;
  15.204 -.word_copy_user:
  15.205 -	cmp.gtu p9,p0=16,len1
  15.206 -(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
  15.207 -	;;
  15.208 -	shr.u cnt=len1,3		// number of 64-bit words
  15.209 -	;;
  15.210 -	adds cnt=-1,cnt
  15.211 -	;;
  15.212 -	.pred.rel "mutex", p14, p15
  15.213 -(p14)	sub src1=src1,t2
  15.214 -(p15)	sub src1=src1,t1
  15.215 -	//
  15.216 -	// Now both src1 and dst1 point to an 8-byte aligned address. And
  15.217 -	// we have more than 8 bytes to copy.
  15.218 -	//
  15.219 -	mov ar.lc=cnt
  15.220 -	mov ar.ec=PIPE_DEPTH
  15.221 -	mov pr.rot=1<<16	// p16=true all others are false
  15.222 -	;;
  15.223 -3:
  15.224 -	//
  15.225 -	// The pipleline consists of 3 stages:
  15.226 -	// 1 (p16):	Load a word from src1
  15.227 -	// 2 (EPI_1):	Shift right pair, saving to tmp
  15.228 -	// 3 (EPI):	Store tmp to dst1
  15.229 -	//
  15.230 -	// To make it simple, use at least 2 (p16) loops to set up val1[n]
  15.231 -	// because we need 2 back-to-back val1[] to get tmp.
  15.232 -	// Note that this implies EPI_2 must be p18 or greater.
  15.233 -	//
  15.234 -
  15.235 -#define EPI_1		p[PIPE_DEPTH-2]
  15.236 -#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
  15.237 -#define CASE(pred, shift)	\
  15.238 -	(pred)	br.cond.spnt .copy_user_bit##shift
  15.239 -#define BODY(rshift)						\
  15.240 -.copy_user_bit##rshift:						\
  15.241 -1:								\
  15.242 -	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
  15.243 -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
  15.244 -	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
  15.245 -(p16)	mov val1[0]=r0;						\
  15.246 -	br.ctop.dptk 1b;					\
  15.247 -	;;							\
  15.248 -	br.cond.sptk.many .diff_align_do_tail;			\
  15.249 -2:								\
  15.250 -(EPI)	st8 [dst1]=tmp,8;					\
  15.251 -(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
  15.252 -3:								\
  15.253 -(p16)	mov val1[1]=r0;						\
  15.254 -(p16)	mov val1[0]=r0;						\
  15.255 -	br.ctop.dptk 2b;					\
  15.256 -	;;							\
  15.257 -	br.cond.sptk.many .failure_in2
  15.258 -
  15.259 -	//
  15.260 -	// Since the 'shrp' instruction requires its shift count to be
  15.261 -	// a fixed immediate, we need to provide the 7 cases
  15.262 -	// below.
  15.263 -	//
  15.264 -	SWITCH(p6, 8)
  15.265 -	SWITCH(p7, 16)
  15.266 -	SWITCH(p8, 24)
  15.267 -	SWITCH(p9, 32)
  15.268 -	SWITCH(p10, 40)
  15.269 -	SWITCH(p11, 48)
  15.270 -	SWITCH(p12, 56)
  15.271 -	;;
  15.272 -	CASE(p6, 8)
  15.273 -	CASE(p7, 16)
  15.274 -	CASE(p8, 24)
  15.275 -	CASE(p9, 32)
  15.276 -	CASE(p10, 40)
  15.277 -	CASE(p11, 48)
  15.278 -	CASE(p12, 56)
  15.279 -	;;
  15.280 -	BODY(8)
  15.281 -	BODY(16)
  15.282 -	BODY(24)
  15.283 -	BODY(32)
  15.284 -	BODY(40)
  15.285 -	BODY(48)
  15.286 -	BODY(56)
  15.287 -	;;
  15.288 -.diff_align_do_tail:
  15.289 -	.pred.rel "mutex", p14, p15
  15.290 -(p14)	sub src1=src1,t1
  15.291 -(p14)	adds dst1=-8,dst1
  15.292 -(p15)	sub dst1=dst1,t1
  15.293 -	;;
  15.294 -4:
  15.295 -	// Tail correction.
  15.296 -	//
  15.297 -	// The problem with this pipelined loop is that the last word is not
  15.298 -	// loaded and thus part of the last word written is not correct.
  15.299 -	// To fix that, we simply copy the tail byte by byte.
  15.300 -
  15.301 -	sub len1=endsrc,src1,1
  15.302 -	clrrrb
  15.303 -	;;
  15.304 -	mov ar.ec=PIPE_DEPTH
  15.305 -	mov pr.rot=1<<16	// p16=true all others are false
  15.306 -	mov ar.lc=len1
  15.307 -	;;
  15.308 -5:
  15.309 -	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  15.310 -	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  15.311 -	br.ctop.dptk.few 5b
  15.312 -	;;
  15.313 -	mov ar.lc=saved_lc
  15.314 -	mov pr=saved_pr,0xffffffffffff0000
  15.315 -	mov ar.pfs=saved_pfs
  15.316 -	br.ret.sptk.many rp
  15.317 -
  15.318 -	//
  15.319 -	// Beginning of long memcpy (i.e. > 16 bytes)
  15.320 -	//
  15.321 -.long_copy_user:
  15.322 -	tbit.nz p6,p7=src1,0	// odd alignment
  15.323 -	and tmp=7,tmp
  15.324 -	;;
  15.325 -	cmp.eq p10,p8=r0,tmp
  15.326 -	mov len1=len		// copy because of rotation
  15.327 -(p8)	br.cond.dpnt .diff_align_copy_user
  15.328 -	;;
  15.329 -	// At this point we know we have more than 16 bytes to copy
  15.330 -	// and also that both src and dest have the same alignment
  15.331 -	// which may not be the one we want. So for now we must move
  15.332 -	// forward slowly until we reach 16-byte alignment: no need to
  15.333 -	// worry about reaching the end of the buffer.
  15.334 -	//
  15.335 -	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
  15.336 -(p6)	adds len1=-1,len1;;
  15.337 -	tbit.nz p7,p0=src1,1
  15.338 -	;;
  15.339 -	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
  15.340 -(p7)	adds len1=-2,len1;;
  15.341 -	tbit.nz p8,p0=src1,2
  15.342 -	;;
  15.343 -	//
  15.344 -	// Stop bit not required after ld4 because if we fail on ld4
  15.345 -	// we have never executed the ld1, therefore st1 is not executed.
  15.346 -	//
  15.347 -	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
  15.348 -	;;
  15.349 -	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
  15.350 -	tbit.nz p9,p0=src1,3
  15.351 -	;;
  15.352 -	//
  15.353 -	// Stop bit not required after ld8 because if we fail on ld8
  15.354 -	// we have never executed the ld2, therefore st2 is not executed.
  15.355 -	//
  15.356 -	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
  15.357 -	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
  15.358 -(p8)	adds len1=-4,len1
  15.359 -	;;
  15.360 -	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
  15.361 -(p9)	adds len1=-8,len1;;
  15.362 -	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
  15.363 -	;;
  15.364 -	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
  15.365 -	tbit.nz p6,p0=len1,3
  15.366 -	cmp.eq p7,p0=r0,cnt
  15.367 -	adds tmp=-1,cnt			// br.ctop is repeat/until
  15.368 -(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
  15.369 -	;;
  15.370 -	adds src2=8,src1
  15.371 -	adds dst2=8,dst1
  15.372 -	mov ar.lc=tmp
  15.373 -	;;
  15.374 -	//
  15.375 -	// 16bytes/iteration
  15.376 -	//
  15.377 -2:
  15.378 -	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
  15.379 -(p16)	ld8 val2[0]=[src2],16
  15.380 -
  15.381 -	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
  15.382 -(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
  15.383 -	br.ctop.dptk 2b
  15.384 -	;;			// RAW on src1 when fall through from loop
  15.385 -	//
  15.386 -	// Tail correction based on len only
  15.387 -	//
  15.388 -	// No matter where we come from (loop or test) the src1 pointer
  15.389 -	// is 16 byte aligned AND we have less than 16 bytes to copy.
  15.390 -	//
  15.391 -.dotail:
  15.392 -	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
  15.393 -	tbit.nz p7,p0=len1,2
  15.394 -	;;
  15.395 -	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
  15.396 -	tbit.nz p8,p0=len1,1
  15.397 -	;;
  15.398 -	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
  15.399 -	tbit.nz p9,p0=len1,0
  15.400 -	;;
  15.401 -	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
  15.402 -	;;
  15.403 -	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
  15.404 -	mov ar.lc=saved_lc
  15.405 -	;;
  15.406 -	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
  15.407 -	mov pr=saved_pr,0xffffffffffff0000
  15.408 -	;;
  15.409 -	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
  15.410 -	mov ar.pfs=saved_pfs
  15.411 -	;;
  15.412 -	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
  15.413 -	br.ret.sptk.many rp
  15.414 -
  15.415 -
  15.416 -	//
  15.417 -	// Here we handle the case where the byte by byte copy fails
  15.418 -	// on the load.
  15.419 -	// Several factors make the zeroing of the rest of the buffer kind of
  15.420 -	// tricky:
  15.421 -	//	- the pipeline: loads/stores are not in sync (pipeline)
  15.422 -	//
  15.423 -	//	  In the same loop iteration, the dst1 pointer does not directly
  15.424 -	//	  reflect where the faulty load was.
  15.425 -	//
  15.426 -	//	- pipeline effect
  15.427 -	//	  When you get a fault on a load, you may have valid data from
  15.428 -	//	  previous loads still in transit, not yet stored. Such data
  15.429 -	//	  must be stored normally before moving on to zeroing the rest.
  15.430 -	//
  15.431 -	//	- single/multi dispersal independence.
  15.432 -	//
  15.433 -	// solution:
  15.434 -	//	- we don't disrupt the pipeline, i.e. data in transit in
  15.435 -	//	  the software pipeline will eventually be moved to memory.
  15.436 -	//	  We simply replace the load with a simple mov and keep the
  15.437 -	//	  pipeline going. We can't really do this inline because
  15.438 -	//	  p16 is always reset to 1 when lc > 0.
  15.439 -	//
  15.440 -.failure_in_pipe1:
  15.441 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  15.442 -1:
  15.443 -(p16)	mov val1[0]=r0
  15.444 -(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
  15.445 -	br.ctop.dptk 1b
  15.446 -	;;
  15.447 -	mov pr=saved_pr,0xffffffffffff0000
  15.448 -	mov ar.lc=saved_lc
  15.449 -	mov ar.pfs=saved_pfs
  15.450 -	br.ret.sptk.many rp
  15.451 -
  15.452 -	//
  15.453 -	// This is the case where the byte by byte copy fails on the load
  15.454 -	// when we copy the head. We need to finish the pipeline and copy
  15.455 -	// zeros for the rest of the destination. Since this happens
  15.456 -	// at the top we still need to fill the body and tail.
  15.457 -.failure_in_pipe2:
  15.458 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  15.459 -2:
  15.460 -(p16)	mov val1[0]=r0
  15.461 -(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
  15.462 -	br.ctop.dptk 2b
  15.463 -	;;
  15.464 -	sub len=enddst,dst1,1		// precompute len
  15.465 -	br.cond.dptk.many .failure_in1bis
  15.466 -	;;
  15.467 -
  15.468 -	//
  15.469 -	// Here we handle the head & tail part when we check for alignment.
  15.470 -	// The following code handles only the load failures. The
  15.471 -	// main difficulty comes from the fact that loads/stores are
  15.472 -	// scheduled. So when you fail on a load, the stores corresponding
  15.473 -	// to previous successful loads must be executed.
  15.474 -	//
  15.475 -	// However some simplifications are possible given the way
  15.476 -	// things work.
  15.477 -	//
  15.478 -	// 1) HEAD
  15.479 -	// Theory of operation:
  15.480 -	//
  15.481 -	//  Page A   | Page B
  15.482 -	//  ---------|-----
  15.483 -	//          1|8 x
  15.484 -	//	  1 2|8 x
  15.485 -	//	    4|8 x
  15.486 -	//	  1 4|8 x
  15.487 -	//        2 4|8 x
  15.488 -	//      1 2 4|8 x
  15.489 -	//	     |1
  15.490 -	//	     |2 x
  15.491 -	//	     |4 x
  15.492 -	//
  15.493 -	// page_size >= 4k (2^12).  (x means 4, 2, 1)
  15.494 -	// Here we suppose Page A exists and Page B does not.
  15.495 -	//
  15.496 -	// As we move towards eight byte alignment we may encounter faults.
  15.497 -	// The numbers on each page show the size of the load (current alignment).
  15.498 -	//
  15.499 -	// Key point:
  15.500 -	//	- if you fail on 1, 2, 4 then you have never executed any smaller
  15.501 -	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
  15.502 -	//	  before.
  15.503 -	//
  15.504 -	// This allows us to simplify the cleanup code, because basically you
  15.505 -	// only have to worry about "pending" stores in the case of a failing
  15.506 -	// ld8(). Given the way the code is written today, this means only
  15.507 -	// worry about st2, st4. There we can use the information encapsulated
  15.508 -	// into the predicates.
  15.509 -	//
  15.510 -	// Other key point:
  15.511 -	//	- if you fail on the ld8 in the head, it means you went straight
  15.512 -	//	  to it, i.e. 8-byte alignment within a nonexistent page.
  15.513 -	// Again this comes from the fact that if you crossed just for the ld8 then
  15.514 -	// you are 8-byte aligned but also 16-byte aligned, therefore you would
  15.515 -	// either go for the 16-byte copy loop OR the ld8 in the tail part.
  15.516 -	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
  15.517 -	// because it would mean you had 15 bytes to copy, in which case you
  15.518 -	// would have defaulted to the byte-by-byte copy.
  15.519 -	//
  15.520 -	//
  15.521 -	// 2) TAIL
  15.522 -	// Here we know we have less than 16 bytes AND we are either 8- or 16-byte
  15.523 -	// aligned.
  15.524 -	//
  15.525 -	// Key point:
  15.526 -	// This means that we either:
  15.527 -	//		- are right on a page boundary
  15.528 -	//	OR
  15.529 -	//		- are at more than 16 bytes from a page boundary with
  15.530 -	//		  at most 15 bytes to copy: no chance of crossing.
  15.531 -	//
  15.532 -	// This allows us to assume that if we fail on a load we haven't possibly
  15.533 -	// executed any of the previous (tail) ones, so we don't need to do
  15.534 -	// any stores. For instance, if we fail on ld2, this means we had
  15.535 -	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
  15.536 -	//
  15.537 -	// This means that we are in a situation similar to a fault in the
  15.538 -	// head part. That's nice!
  15.539 -	//
  15.540 -.failure_in1:
  15.541 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  15.542 -	sub len=endsrc,src1,1
  15.543 -	//
  15.544 -	// we know that ret0 can never be zero at this point
  15.545 -	// because we failed while trying to do a load, i.e. there is still
  15.546 -	// some work to do.
  15.547 -	// The failure_in1bis and length problem is taken care of at the
  15.548 -	// calling side.
  15.549 -	//
  15.550 -	;;
  15.551 -.failure_in1bis:		// from (.failure_in3)
  15.552 -	mov ar.lc=len		// Continue with a stupid byte store.
  15.553 -	;;
  15.554 -5:
  15.555 -	st1 [dst1]=r0,1
  15.556 -	br.cloop.dptk 5b
  15.557 -	;;
  15.558 -	mov pr=saved_pr,0xffffffffffff0000
  15.559 -	mov ar.lc=saved_lc
  15.560 -	mov ar.pfs=saved_pfs
  15.561 -	br.ret.sptk.many rp
  15.562 -
  15.563 -	//
  15.564 -	// Here we simply restart the loop but instead
  15.565 -	// of doing loads we fill the pipeline with zeroes
  15.566 -	// We can't simply store r0 because we may have valid
  15.567 -	// data in transit in the pipeline.
  15.568 -	// ar.lc and ar.ec are setup correctly at this point
  15.569 -	//
  15.570 -	// we MUST use src1/endsrc here and not dst1/enddst because
  15.571 -	// of the pipeline effect.
  15.572 -	//
  15.573 -.failure_in3:
  15.574 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  15.575 -	;;
  15.576 -2:
  15.577 -(p16)	mov val1[0]=r0
  15.578 -(p16)	mov val2[0]=r0
  15.579 -(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
  15.580 -(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
  15.581 -	br.ctop.dptk 2b
  15.582 -	;;
  15.583 -	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
  15.584 -	sub len=enddst,dst1,1		// precompute len
  15.585 -(p6)	br.cond.dptk .failure_in1bis
  15.586 -	;;
  15.587 -	mov pr=saved_pr,0xffffffffffff0000
  15.588 -	mov ar.lc=saved_lc
  15.589 -	mov ar.pfs=saved_pfs
  15.590 -	br.ret.sptk.many rp
  15.591 -
  15.592 -.failure_in2:
  15.593 -	sub ret0=endsrc,src1
  15.594 -	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
  15.595 -	sub len=enddst,dst1,1		// precompute len
  15.596 -(p6)	br.cond.dptk .failure_in1bis
  15.597 -	;;
  15.598 -	mov pr=saved_pr,0xffffffffffff0000
  15.599 -	mov ar.lc=saved_lc
  15.600 -	mov ar.pfs=saved_pfs
  15.601 -	br.ret.sptk.many rp
  15.602 -
  15.603 -	//
  15.604 -	// handling of failures on stores: that's the easy part
  15.605 -	//
  15.606 -.failure_out:
  15.607 -	sub ret0=enddst,dst1
  15.608 -	mov pr=saved_pr,0xffffffffffff0000
  15.609 -	mov ar.lc=saved_lc
  15.610 -
  15.611 -	mov ar.pfs=saved_pfs
  15.612 -	br.ret.sptk.many rp
  15.613 -END(__copy_user)
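
All of the .failure_* paths above serve the single contract stated in the header comment. Restated as a C sketch (not the pipelined implementation; the fault points are only marked by comments):

#include <stddef.h>

/* Contract sketch for __copy_user: copy len bytes and return 0 on
 * success, or the number of bytes NOT copied on a fault.  When a load
 * faults, the .failure_in* paths additionally zero-fill the not-yet-
 * written tail of dst, as the comments above describe. */
size_t copy_user_sketch(unsigned char *dst, const unsigned char *src,
			size_t len)
{
	size_t done;

	for (done = 0; done < len; done++)
		dst[done] = src[done];	/* EX(...) would catch faults here */
	/* on a load fault: zero dst[done..len-1] before returning */
	return len - done;		/* 0 once the loop completes */
}
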
    16.1 --- a/xen/arch/ia64/linux/lib/csum_partial_copy.c	Tue Aug 30 17:51:51 2005 -0600
    16.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.3 @@ -1,151 +0,0 @@
    16.4 -/*
    16.5 - * Network Checksum & Copy routine
    16.6 - *
    16.7 - * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
    16.8 - *	Stephane Eranian <eranian@hpl.hp.com>
    16.9 - *
   16.10 - * Most of the code has been imported from Linux/Alpha
   16.11 - */
   16.12 -
   16.13 -#include <linux/module.h>
   16.14 -#include <linux/types.h>
   16.15 -#include <linux/string.h>
   16.16 -
   16.17 -#include <asm/uaccess.h>
   16.18 -
   16.19 -/*
   16.20 - * XXX Fixme: those 2 inlines are meant for debugging and will go away
   16.21 - */
   16.22 -static inline unsigned
   16.23 -short from64to16(unsigned long x)
   16.24 -{
   16.25 -	/* add up 32-bit words for 33 bits */
   16.26 -	x = (x & 0xffffffff) + (x >> 32);
   16.27 -	/* add up 16-bit and 17-bit words for 17+c bits */
   16.28 -	x = (x & 0xffff) + (x >> 16);
   16.29 -	/* add up 16-bit and 2-bit for 16+c bit */
   16.30 -	x = (x & 0xffff) + (x >> 16);
   16.31 -	/* add up carry.. */
   16.32 -	x = (x & 0xffff) + (x >> 16);
   16.33 -	return x;
   16.34 -}
   16.35 -
   16.36 -static inline
   16.37 -unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
   16.38 -{
   16.39 -	int odd, count;
   16.40 -	unsigned long result = (unsigned long)psum;
   16.41 -
   16.42 -	if (len <= 0)
   16.43 -		goto out;
   16.44 -	odd = 1 & (unsigned long) buff;
   16.45 -	if (odd) {
   16.46 -		result = *buff << 8;
   16.47 -		len--;
   16.48 -		buff++;
   16.49 -	}
   16.50 -	count = len >> 1;		/* nr of 16-bit words.. */
   16.51 -	if (count) {
   16.52 -		if (2 & (unsigned long) buff) {
   16.53 -			result += *(unsigned short *) buff;
   16.54 -			count--;
   16.55 -			len -= 2;
   16.56 -			buff += 2;
   16.57 -		}
   16.58 -		count >>= 1;		/* nr of 32-bit words.. */
   16.59 -		if (count) {
   16.60 -			if (4 & (unsigned long) buff) {
   16.61 -				result += *(unsigned int *) buff;
   16.62 -				count--;
   16.63 -				len -= 4;
   16.64 -				buff += 4;
   16.65 -			}
   16.66 -			count >>= 1;	/* nr of 64-bit words.. */
   16.67 -			if (count) {
   16.68 -				unsigned long carry = 0;
   16.69 -				do {
   16.70 -					unsigned long w = *(unsigned long *) buff;
   16.71 -					count--;
   16.72 -					buff += 8;
   16.73 -					result += carry;
   16.74 -					result += w;
   16.75 -					carry = (w > result);
   16.76 -				} while (count);
   16.77 -				result += carry;
   16.78 -				result = (result & 0xffffffff) + (result >> 32);
   16.79 -			}
   16.80 -			if (len & 4) {
   16.81 -				result += *(unsigned int *) buff;
   16.82 -				buff += 4;
   16.83 -			}
   16.84 -		}
   16.85 -		if (len & 2) {
   16.86 -			result += *(unsigned short *) buff;
   16.87 -			buff += 2;
   16.88 -		}
   16.89 -	}
   16.90 -	if (len & 1)
   16.91 -		result += *buff;
   16.92 -
   16.93 -	result = from64to16(result);
   16.94 -
   16.95 -	if (odd)
   16.96 -		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
   16.97 -
   16.98 -out:
   16.99 -	return result;
  16.100 -}
  16.101 -
  16.102 -/*
  16.103 - * XXX Fixme
  16.104 - *
  16.105 - * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
  16.106 - * But it's very tricky to get right even in C.
  16.107 - */
  16.108 -extern unsigned long do_csum(const unsigned char *, long);
  16.109 -
  16.110 -static unsigned int
  16.111 -do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
  16.112 -				int len, unsigned int psum, int *errp)
  16.113 -{
  16.114 -	unsigned long result;
  16.115 -
  16.116 -	/* XXX Fixme
  16.117 -	 * for now we separate the copy from checksum for obvious
  16.118 -	 * alignment difficulties. Look at the Alpha code and you'll be
  16.119 -	 * scared.
  16.120 -	 */
  16.121 -
  16.122 -	if (__copy_from_user(dst, src, len) != 0 && errp)
  16.123 -		*errp = -EFAULT;
  16.124 -
  16.125 -	result = do_csum(dst, len);
  16.126 -
  16.127 -	/* add in old sum, and carry.. */
  16.128 -	result += psum;
  16.129 -	/* 32+c bits -> 32 bits */
  16.130 -	result = (result & 0xffffffff) + (result >> 32);
  16.131 -	return result;
  16.132 -}
  16.133 -
  16.134 -unsigned int
  16.135 -csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
  16.136 -			     int len, unsigned int sum, int *errp)
  16.137 -{
  16.138 -	if (!access_ok(VERIFY_READ, src, len)) {
  16.139 -		*errp = -EFAULT;
  16.140 -		memset(dst, 0, len);
  16.141 -		return sum;
  16.142 -	}
  16.143 -
  16.144 -	return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
  16.145 -}
  16.146 -
  16.147 -unsigned int
  16.148 -csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
  16.149 -			  int len, unsigned int sum)
  16.150 -{
  16.151 -	return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
  16.152 -}
  16.153 -
  16.154 -EXPORT_SYMBOL(csum_partial_copy_nocheck);
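
One point worth restating about the routines above: errors are reported through errp, not through the return value, which always remains a checksum. A hypothetical caller (the function and variable names here are illustrative only):

/* Illustrative caller of csum_partial_copy_from_user(): check err after
 * the call; the returned sum alone does not signal a fault. */
unsigned int copy_and_csum(const unsigned char __user *usrc,
			   unsigned char *kdst, int len)
{
	int err = 0;
	unsigned int sum;

	sum = csum_partial_copy_from_user(usrc, kdst, len, 0, &err);
	if (err)
		return 0;	/* example policy: treat the data as bad */
	return sum;
}
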
    17.1 --- a/xen/arch/ia64/linux/lib/dec_and_lock.c	Tue Aug 30 17:51:51 2005 -0600
    17.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.3 @@ -1,42 +0,0 @@
    17.4 -/*
    17.5 - * Copyright (C) 2003 Jerome Marchand, Bull S.A.
    17.6 - *	Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com>
    17.7 - *
    17.8 - * This file is released under the GPLv2, or at your option any later version.
    17.9 - *
   17.10 - * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction.  This
   17.11 - * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
   17.12 - */
   17.13 -
   17.14 -#include <linux/compiler.h>
   17.15 -#include <linux/module.h>
   17.16 -#include <linux/spinlock.h>
   17.17 -#include <asm/atomic.h>
   17.18 -
   17.19 -/*
   17.20 - * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock.  Both of these
   17.21 - * operations have to be done atomically, so that the count doesn't drop to zero without
   17.22 - * acquiring the spinlock first.
   17.23 - */
   17.24 -int
   17.25 -_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
   17.26 -{
   17.27 -	int old, new;
   17.28 -
   17.29 -	do {
   17.30 -		old = atomic_read(refcount);
   17.31 -		new = old - 1;
   17.32 -
   17.33 -		if (unlikely (old == 1)) {
   17.34 -			/* oops, we may be decrementing to zero, do it the slow way... */
   17.35 -			spin_lock(lock);
   17.36 -			if (atomic_dec_and_test(refcount))
   17.37 -				return 1;
   17.38 -			spin_unlock(lock);
   17.39 -			return 0;
   17.40 -		}
   17.41 -	} while (cmpxchg(&refcount->counter, old, new) != old);
   17.42 -	return 0;
   17.43 -}
   17.44 -
   17.45 -EXPORT_SYMBOL(_atomic_dec_and_lock);
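
The typical calling pattern for _atomic_dec_and_lock(), for context; struct my_obj and put_my_obj are hypothetical example names:

#include <linux/spinlock.h>
#include <linux/slab.h>
#include <asm/atomic.h>

struct my_obj {			/* hypothetical example type */
	atomic_t refcount;
	spinlock_t lock;
};

/* Drop a reference; only the final put tears the object down, and it
 * does so while holding the lock that _atomic_dec_and_lock acquired. */
void put_my_obj(struct my_obj *obj)
{
	if (_atomic_dec_and_lock(&obj->refcount, &obj->lock)) {
		/* count reached zero and obj->lock is now held */
		spin_unlock(&obj->lock);
		kfree(obj);
	}
}
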
    18.1 --- a/xen/arch/ia64/linux/lib/do_csum.S	Tue Aug 30 17:51:51 2005 -0600
    18.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.3 @@ -1,323 +0,0 @@
    18.4 -/*
    18.5 - *
    18.6 - * Optimized version of the standard do_csum() function
    18.7 - *
    18.8 - * Return: a 64bit quantity containing the 16bit Internet checksum
    18.9 - *
   18.10 - * Inputs:
   18.11 - *	in0: address of buffer to checksum (char *)
   18.12 - *	in1: length of the buffer (int)
   18.13 - *
   18.14 - * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
   18.15 - *	Stephane Eranian <eranian@hpl.hp.com>
   18.16 - *
   18.17 - * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
   18.18 - *		Data locality study on the checksum buffer.
   18.19 - *		More optimization cleanup - remove excessive stop bits.
   18.20 - * 02/04/08	David Mosberger <davidm@hpl.hp.com>
   18.21 - *		More cleanup and tuning.
   18.22 - * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
   18.23 - *		Clean up and optimize and the software pipeline, loading two
   18.24 - *		Clean up and optimize the software pipeline, loading two
   18.25 - *		for the loop. Support the cases where load latency = 1 or 2.
   18.26 - *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
   18.27 - */
   18.28 -
   18.29 -#include <asm/asmmacro.h>
   18.30 -
   18.31 -//
   18.32 -// Theory of operations:
   18.33 -//	The goal is to go as quickly as possible to the point where
   18.34 -//	we can checksum 16 bytes/loop. Before reaching that point we must
   18.35 -//	take care of incorrect alignment of first byte.
   18.36 -//
   18.37 -//	The code hereafter also takes care of the "tail" part of the buffer
   18.38 -//	before entering the core loop, if any. The checksum is a sum so it
   18.39 -//	allows us to commute operations. So we do the "head" and "tail"
   18.40 -//	first to finish at full speed in the body. Once we get the head and
   18.41 -//	tail values, we feed them into the pipeline, very handy initialization.
   18.42 -//
   18.43 -//	Of course we deal with the special case where the whole buffer fits
   18.44 -//	into one 8-byte word. In this case we have only one entry in the pipeline.
   18.45 -//
   18.46 -//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
   18.47 -//	possible load latency and also to accommodate the head and tail.
   18.48 -//
   18.49 -//	The end of the function deals with folding the checksum from 64bits
   18.50 -//	down to 16bits taking care of the carry.
   18.51 -//
   18.52 -//	This version avoids synchronization in the core loop by also using a
   18.53 -//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
   18.54 -//
   18.55 -//	 wordx[] (x=1,2)
   18.56 -//	|---|
   18.57 -//      |   | 0			: new value loaded in pipeline
   18.58 -//	|---|
   18.59 -//      |   | -			: in transit data
   18.60 -//	|---|
   18.61 -//      |   | LOAD_LATENCY	: current value to add to checksum
   18.62 -//	|---|
   18.63 -//      |   | LOAD_LATENCY+1	: previous value added to checksum
   18.64 -//      |---|			(previous iteration)
   18.65 -//
   18.66 -//	resultx[] (x=1,2)
   18.67 -//	|---|
   18.68 -//      |   | 0			: initial value
   18.69 -//	|---|
   18.70 -//      |   | LOAD_LATENCY-1	: new checksum
   18.71 -//	|---|
   18.72 -//      |   | LOAD_LATENCY	: previous value of checksum
   18.73 -//	|---|
   18.74 -//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
   18.75 -//      |---|
   18.76 -//
   18.77 -//
   18.78 -//	See RFC1071 "Computing the Internet Checksum" for various techniques for
   18.79 -//	calculating the Internet checksum.
   18.80 -//
   18.81 -// NOT YET DONE:
   18.82 -//	- Maybe another algorithm which would take care of the folding at the
   18.83 -//	  end in a different manner
   18.84 -//	- Work with people more knowledgeable than me on the network stack
   18.85 -//	  to figure out if we could not split the function depending on the
   18.86 -//	  type of packet or alignment we get. Like the ip_fast_csum() routine
   18.87 -//	  where we know we have at least 20bytes worth of data to checksum.
   18.88 -//	- Do a better job of handling small packets.
   18.89 -//	- Note on prefetching: it was found that under various loads, i.e. ftp read/write,
   18.90 -//	  nfs read/write, the L1 cache hit rate is at 60% and the L2 cache hit rate is at 99.8%
   18.91 -//	  on the data that buffer points to (partly because the checksum is often preceded by
   18.92 -//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
   18.93 -//	  the data is already in the cache.
   18.94 -//
   18.95 -
   18.96 -#define saved_pfs	r11
   18.97 -#define hmask		r16
   18.98 -#define tmask		r17
   18.99 -#define first1		r18
  18.100 -#define firstval	r19
  18.101 -#define firstoff	r20
  18.102 -#define last		r21
  18.103 -#define lastval		r22
  18.104 -#define lastoff		r23
  18.105 -#define saved_lc	r24
  18.106 -#define saved_pr	r25
  18.107 -#define tmp1		r26
  18.108 -#define tmp2		r27
  18.109 -#define tmp3		r28
  18.110 -#define carry1		r29
  18.111 -#define carry2		r30
  18.112 -#define first2		r31
  18.113 -
  18.114 -#define buf		in0
  18.115 -#define len		in1
  18.116 -
  18.117 -#define LOAD_LATENCY	2	// XXX fix me
  18.118 -
  18.119 -#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
  18.120 -# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
  18.121 -#endif
  18.122 -
  18.123 -#define PIPE_DEPTH			(LOAD_LATENCY+2)
  18.124 -#define ELD	p[LOAD_LATENCY]		// end of load
  18.125 -#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
  18.126 -
  18.127 -// unsigned long do_csum(unsigned char *buf,long len)
  18.128 -
  18.129 -GLOBAL_ENTRY(do_csum)
  18.130 -	.prologue
  18.131 -	.save ar.pfs, saved_pfs
  18.132 -	alloc saved_pfs=ar.pfs,2,16,0,16
  18.133 -	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
  18.134 -	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
  18.135 -	mov ret0=r0		// in case we have zero length
  18.136 -	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
  18.137 -	;;
  18.138 -	add tmp1=buf,len	// last byte's address
  18.139 -	.save pr, saved_pr
  18.140 -	mov saved_pr=pr		// preserve predicates (rotation)
  18.141 -(p6)	br.ret.spnt.many rp	// return if zero or negative length
  18.142 -
  18.143 -	mov hmask=-1		// initialize head mask
  18.144 -	tbit.nz p15,p0=buf,0	// is buf an odd address?
  18.145 -	and first1=-8,buf	// 8-byte align down address of first1 element
  18.146 -
  18.147 -	and firstoff=7,buf	// how many bytes off for first1 element
  18.148 -	mov tmask=-1		// initialize tail mask
  18.149 -
  18.150 -	;;
  18.151 -	adds tmp2=-1,tmp1	// last-1
  18.152 -	and lastoff=7,tmp1	// how many bytes off for last element
  18.153 -	;;
  18.154 -	sub tmp1=8,lastoff	// complement to lastoff
  18.155 -	and last=-8,tmp2	// address of word containing last byte
  18.156 -	;;
  18.157 -	sub tmp3=last,first1	// tmp3=distance from first1 to last
  18.158 -	.save ar.lc, saved_lc
  18.159 -	mov saved_lc=ar.lc	// save lc
  18.160 -	cmp.eq p8,p9=last,first1	// everything fits in one word ?
  18.161 -
  18.162 -	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
  18.163 -	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
  18.164 -	shl tmp2=firstoff,3	// number of bits
  18.165 -	;;
  18.166 -(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
  18.167 -	shl tmp1=tmp1,3		// number of bits
  18.168 -(p9)	adds tmp3=-8,tmp3	// effectively loaded
  18.169 -	;;
  18.170 -(p8)	mov lastval=r0		// we don't need lastval if first1==last
  18.171 -	shl hmask=hmask,tmp2	// build head mask, mask off [0,firstoff[
  18.172 -	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]lastoff,8]
  18.173 -	;;
  18.174 -	.body
  18.175 -#define count tmp3
  18.176 -
  18.177 -(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
  18.178 -(p9)	and word2[0]=lastval,tmask	// mask lastval as appropriate
  18.179 -	shr.u count=count,3	// how many 8-byte?
  18.180 -	;;
  18.181 -	// If count is odd, finish this 8-byte word so that we can
  18.182 -	// load two back-to-back 8-byte words per loop thereafter.
  18.183 -	and word1[0]=firstval,hmask	// and mask it as appropriate
  18.184 -	tbit.nz p10,p11=count,0		// if (count is odd)
  18.185 -	;;
  18.186 -(p8)	mov result1[0]=word1[0]
  18.187 -(p9)	add result1[0]=word1[0],word2[0]
  18.188 -	;;
  18.189 -	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
  18.190 -	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
  18.191 -	;;
  18.192 -(p6)	adds result1[0]=1,result1[0]
  18.193 -(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
  18.194 -(p11)	br.cond.dptk .do_csum16		// if (count is even)
  18.195 -
  18.196 -	// Here count is odd.
  18.197 -	ld8 word1[1]=[first1],8		// load an 8-byte word
  18.198 -	cmp.eq p9,p10=1,count		// if (count == 1)
  18.199 -	adds count=-1,count		// loaded an 8-byte word
  18.200 -	;;
  18.201 -	add result1[0]=result1[0],word1[1]
  18.202 -	;;
  18.203 -	cmp.ltu p6,p0=result1[0],word1[1]
  18.204 -	;;
  18.205 -(p6)	adds result1[0]=1,result1[0]
  18.206 -(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
  18.207 -	// Fall through to calculate the checksum, feeding result1[0] as
  18.208 -	// the initial value in result1[0].
  18.209 -	//
  18.210 -	// Calculate the checksum loading two 8-byte words per loop.
  18.211 -	//
  18.212 -.do_csum16:
  18.213 -	add first2=8,first1
  18.214 -	shr.u count=count,1	// we do 16 bytes per loop
  18.215 -	;;
  18.216 -	adds count=-1,count
  18.217 -	mov carry1=r0
  18.218 -	mov carry2=r0
  18.219 -	brp.loop.imp 1f,2f
  18.220 -	;;
  18.221 -	mov ar.ec=PIPE_DEPTH
  18.222 -	mov ar.lc=count	// set lc
  18.223 -	mov pr.rot=1<<16
  18.224 -	// result1[0] must be initialized in advance.
  18.225 -	mov result2[0]=r0
  18.226 -	;;
  18.227 -	.align 32
  18.228 -1:
  18.229 -(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
  18.230 -(pC1[1])adds carry1=1,carry1
  18.231 -(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
  18.232 -(pC2[1])adds carry2=1,carry2
  18.233 -(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
  18.234 -(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
  18.235 -2:
  18.236 -(p[0])	ld8 word1[0]=[first1],16
  18.237 -(p[0])	ld8 word2[0]=[first2],16
  18.238 -	br.ctop.sptk 1b
  18.239 -	;;
  18.240 -	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
  18.241 -(pC1[1])adds carry1=1,carry1	// since we miss the last one
  18.242 -(pC2[1])adds carry2=1,carry2
  18.243 -	;;
  18.244 -	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
  18.245 -	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
  18.246 -	;;
  18.247 -	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
  18.248 -	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
  18.249 -	;;
  18.250 -(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
  18.251 -(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
  18.252 -	;;
  18.253 -	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
  18.254 -	;;
  18.255 -	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
  18.256 -	;;
  18.257 -(p6)	adds result1[0]=1,result1[0]
  18.258 -	;;
  18.259 -.do_csum_exit:
  18.260 -	//
  18.261 -	// now fold 64 into 16 bits taking care of carry
  18.262 -	// that's not very good because it has lots of sequentiality
  18.263 -	//
  18.264 -	mov tmp3=0xffff
  18.265 -	zxt4 tmp1=result1[0]
  18.266 -	shr.u tmp2=result1[0],32
  18.267 -	;;
  18.268 -	add result1[0]=tmp1,tmp2
  18.269 -	;;
  18.270 -	and tmp1=result1[0],tmp3
  18.271 -	shr.u tmp2=result1[0],16
  18.272 -	;;
  18.273 -	add result1[0]=tmp1,tmp2
  18.274 -	;;
  18.275 -	and tmp1=result1[0],tmp3
  18.276 -	shr.u tmp2=result1[0],16
  18.277 -	;;
  18.278 -	add result1[0]=tmp1,tmp2
  18.279 -	;;
  18.280 -	and tmp1=result1[0],tmp3
  18.281 -	shr.u tmp2=result1[0],16
  18.282 -	;;
  18.283 -	add ret0=tmp1,tmp2
  18.284 -	mov pr=saved_pr,0xffffffffffff0000
  18.285 -	;;
  18.286 -	// if buf was odd then swap bytes
  18.287 -	mov ar.pfs=saved_pfs		// restore ar.ec
  18.288 -(p15)	mux1 ret0=ret0,@rev		// reverse word
  18.289 -	;;
  18.290 -	mov ar.lc=saved_lc
  18.291 -(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
  18.292 -	br.ret.sptk.many rp
  18.293 -
  18.294 -//	I (Jun Nakajima) wrote equivalent code (see below), but it was
  18.295 -//	not much better than the original. So the original is kept so that
  18.296 -//	someone else can try to beat it.
  18.297 -//
  18.298 -//	shr.u word1[0]=result1[0],32
  18.299 -//	zxt4 result1[0]=result1[0]
  18.300 -//	;;
  18.301 -//	add result1[0]=result1[0],word1[0]
  18.302 -//	;;
  18.303 -//	zxt2 result2[0]=result1[0]
  18.304 -//	extr.u word1[0]=result1[0],16,16
  18.305 -//	shr.u carry1=result1[0],32
  18.306 -//	;;
  18.307 -//	add result2[0]=result2[0],word1[0]
  18.308 -//	;;
  18.309 -//	add result2[0]=result2[0],carry1
  18.310 -//	;;
  18.311 -//	extr.u ret0=result2[0],16,16
  18.312 -//	;;
  18.313 -//	add ret0=ret0,result2[0]
  18.314 -//	;;
  18.315 -//	zxt2 ret0=ret0
  18.316 -//	mov ar.pfs=saved_pfs		 // restore ar.ec
  18.317 -//	mov pr=saved_pr,0xffffffffffff0000
  18.318 -//	;;
  18.319 -//	// if buf was odd then swap bytes
  18.320 -//	mov ar.lc=saved_lc
  18.321 -//(p15)	mux1 ret0=ret0,@rev		// reverse word
  18.322 -//	;;
  18.323 -//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
  18.324 -//	br.ret.sptk.many rp
  18.325 -
  18.326 -END(do_csum)
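
The head-mask trick that lets every load in the routine be an aligned ld8 is compact when written out in C. This is a sketch assuming the little-endian byte order of Linux/ia64; the tail mask is built the same way from the other end.

#include <stdint.h>

/* Sketch of do_csum's head masking: round buf down to an 8-byte
 * boundary, load the whole word (safe, it stays within the same page),
 * and clear the firstoff low-order bytes that precede buf. */
static uint64_t masked_first_word(const unsigned char *buf)
{
	uint64_t firstoff = (uintptr_t)buf & 7;		/* bytes to skip */
	const uint64_t *first =
		(const uint64_t *)((uintptr_t)buf & ~(uintptr_t)7);
	uint64_t hmask = ~(uint64_t)0 << (firstoff * 8);/* shl hmask,tmp2 */

	return *first & hmask;
}
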
    19.1 --- a/xen/arch/ia64/linux/lib/flush.S	Tue Aug 30 17:51:51 2005 -0600
    19.2 +++ b/xen/arch/ia64/linux/lib/flush.S	Wed Aug 31 14:32:27 2005 -0600
    19.3 @@ -1,39 +1,61 @@
    19.4  /*
    19.5   * Cache flushing routines.
    19.6   *
    19.7 - * Copyright (C) 1999-2001 Hewlett-Packard Co
    19.8 - * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com>
    19.9 + * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
   19.10 + *	David Mosberger-Tang <davidm@hpl.hp.com>
   19.11 + *
   19.12 + * 05/28/05 Zoltan Menyhart	Dynamic stride size
   19.13   */
   19.14 +
   19.15  #include <asm/asmmacro.h>
   19.16 -#include <asm/page.h>
   19.17 +
   19.18  
   19.19  	/*
   19.20  	 * flush_icache_range(start,end)
   19.21 -	 *	Must flush range from start to end-1 but nothing else (need to
   19.22 +	 *
   19.23 +	 *	Make i-cache(s) coherent with d-caches.
   19.24 +	 *
   19.25 +	 *	Must deal with range from start to end-1 but nothing else (need to
   19.26  	 *	be careful not to touch addresses that may be unmapped).
   19.27 +	 *
   19.28 +	 *	Note: "in0" and "in1" are preserved for debugging purposes.
   19.29  	 */
   19.30  GLOBAL_ENTRY(flush_icache_range)
   19.31 +
   19.32  	.prologue
   19.33 -	alloc r2=ar.pfs,2,0,0,0
   19.34 -	sub r8=in1,in0,1
   19.35 +	alloc	r2=ar.pfs,2,0,0,0
   19.36 +	movl	r3=ia64_i_cache_stride_shift
   19.37 +	mov	r21=1
   19.38 +	;;
   19.39 +	ld8	r20=[r3]		// r20: stride shift
   19.40 +	sub	r22=in1,r0,1		// last byte address
   19.41  	;;
   19.42 -	shr.u r8=r8,5			// we flush 32 bytes per iteration
   19.43 -	.save ar.lc, r3
   19.44 -	mov r3=ar.lc			// save ar.lc
   19.45 +	shr.u	r23=in0,r20		// start / (stride size)
   19.46 +	shr.u	r22=r22,r20		// (last byte address) / (stride size)
   19.47 +	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
   19.48 +	;;
   19.49 +	sub	r8=r22,r23		// number of strides - 1
   19.50 +	shl	r24=r23,r20		// r24: addresses for "fc.i" =
   19.51 +					//	"start" rounded down to stride boundary
   19.52 +	.save	ar.lc,r3
   19.53 +	mov	r3=ar.lc		// save ar.lc
   19.54  	;;
   19.55  
   19.56  	.body
   19.57 -
   19.58 -	mov ar.lc=r8
   19.59 +	mov	ar.lc=r8
   19.60  	;;
   19.61 -.Loop:	fc in0				// issuable on M0 only
   19.62 -	add in0=32,in0
   19.63 +	/*
   19.64 +	 * 32 byte aligned loop, even number of (actually 2) bundles
   19.65 +	 */
   19.66 +.Loop:	fc.i	r24			// issuable on M0 only
   19.67 +	add	r24=r21,r24		// we flush "stride size" bytes per iteration
   19.68 +	nop.i	0
   19.69  	br.cloop.sptk.few .Loop
   19.70  	;;
   19.71  	sync.i
   19.72  	;;
   19.73  	srlz.i
   19.74  	;;
   19.75 -	mov ar.lc=r3			// restore ar.lc
   19.76 +	mov	ar.lc=r3		// restore ar.lc
   19.77  	br.ret.sptk.many rp
   19.78  END(flush_icache_range)
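
In C terms, the new dynamic-stride loop behaves as follows (a sketch: ia64_i_cache_stride_shift is the variable the code reads through r3, and fc.i has no C equivalent, so it appears only as a comment):

extern unsigned long ia64_i_cache_stride_shift;	/* set up during boot */

/* Sketch of the rewritten flush_icache_range: one fc.i per i-cache
 * stride, covering [start, end-1] rounded out to stride boundaries. */
void flush_icache_range_sketch(unsigned long start, unsigned long end)
{
	unsigned long shift = ia64_i_cache_stride_shift;
	unsigned long stride = 1UL << shift;
	unsigned long addr = (start >> shift) << shift;	/* round down */
	unsigned long last = ((end - 1) >> shift) << shift;

	for (; addr <= last; addr += stride)
		;	/* fc.i addr: flush line, issuable on M0 only */
	/* the assembly then issues sync.i ;; srlz.i to serialize */
}
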
    20.1 --- a/xen/arch/ia64/linux/lib/io.c	Tue Aug 30 17:51:51 2005 -0600
    20.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.3 @@ -1,165 +0,0 @@
    20.4 -#include <linux/config.h>
    20.5 -#include <linux/module.h>
    20.6 -#include <linux/types.h>
    20.7 -
    20.8 -#include <asm/io.h>
    20.9 -
   20.10 -/*
   20.11 - * Copy data from IO memory space to "real" memory space.
   20.12 - * This needs to be optimized.
   20.13 - */
   20.14 -void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
   20.15 -{
   20.16 -	char *dst = to;
   20.17 -
   20.18 -	while (count) {
   20.19 -		count--;
   20.20 -		*dst++ = readb(from++);
   20.21 -	}
   20.22 -}
   20.23 -EXPORT_SYMBOL(memcpy_fromio);
   20.24 -
   20.25 -/*
   20.26 - * Copy data from "real" memory space to IO memory space.
   20.27 - * This needs to be optimized.
   20.28 - */
   20.29 -void memcpy_toio(volatile void __iomem *to, const void *from, long count)
   20.30 -{
   20.31 -	const char *src = from;
   20.32 -
   20.33 -	while (count) {
   20.34 -		count--;
   20.35 -		writeb(*src++, to++);
   20.36 -	}
   20.37 -}
   20.38 -EXPORT_SYMBOL(memcpy_toio);
   20.39 -
   20.40 -/*
   20.41 - * "memset" on IO memory space.
   20.42 - * This needs to be optimized.
   20.43 - */
   20.44 -void memset_io(volatile void __iomem *dst, int c, long count)
   20.45 -{
   20.46 -	unsigned char ch = (char)(c & 0xff);
   20.47 -
   20.48 -	while (count) {
   20.49 -		count--;
   20.50 -		writeb(ch, dst);
   20.51 -		dst++;
   20.52 -	}
   20.53 -}
   20.54 -EXPORT_SYMBOL(memset_io);
   20.55 -
   20.56 -#ifdef CONFIG_IA64_GENERIC
   20.57 -
   20.58 -#undef __ia64_inb
   20.59 -#undef __ia64_inw
   20.60 -#undef __ia64_inl
   20.61 -#undef __ia64_outb
   20.62 -#undef __ia64_outw
   20.63 -#undef __ia64_outl
   20.64 -#undef __ia64_readb
   20.65 -#undef __ia64_readw
   20.66 -#undef __ia64_readl
   20.67 -#undef __ia64_readq
   20.68 -#undef __ia64_readb_relaxed
   20.69 -#undef __ia64_readw_relaxed
   20.70 -#undef __ia64_readl_relaxed
   20.71 -#undef __ia64_readq_relaxed
   20.72 -#undef __ia64_writeb
   20.73 -#undef __ia64_writew
   20.74 -#undef __ia64_writel
   20.75 -#undef __ia64_writeq
   20.76 -#undef __ia64_mmiowb
   20.77 -
   20.78 -unsigned int
   20.79 -__ia64_inb (unsigned long port)
   20.80 -{
   20.81 -	return ___ia64_inb(port);
   20.82 -}
   20.83 -
   20.84 -unsigned int
   20.85 -__ia64_inw (unsigned long port)
   20.86 -{
   20.87 -	return ___ia64_inw(port);
   20.88 -}
   20.89 -
   20.90 -unsigned int
   20.91 -__ia64_inl (unsigned long port)
   20.92 -{
   20.93 -	return ___ia64_inl(port);
   20.94 -}
   20.95 -
   20.96 -void
   20.97 -__ia64_outb (unsigned char val, unsigned long port)
   20.98 -{
   20.99 -	___ia64_outb(val, port);
  20.100 -}
  20.101 -
  20.102 -void
  20.103 -__ia64_outw (unsigned short val, unsigned long port)
  20.104 -{
  20.105 -	___ia64_outw(val, port);
  20.106 -}
  20.107 -
  20.108 -void
  20.109 -__ia64_outl (unsigned int val, unsigned long port)
  20.110 -{
  20.111 -	___ia64_outl(val, port);
  20.112 -}
  20.113 -
  20.114 -unsigned char
  20.115 -__ia64_readb (void __iomem *addr)
  20.116 -{
  20.117 -	return ___ia64_readb (addr);
  20.118 -}
  20.119 -
  20.120 -unsigned short
  20.121 -__ia64_readw (void __iomem *addr)
  20.122 -{
  20.123 -	return ___ia64_readw (addr);
  20.124 -}
  20.125 -
  20.126 -unsigned int
  20.127 -__ia64_readl (void __iomem *addr)
  20.128 -{
  20.129 -	return ___ia64_readl (addr);
  20.130 -}
  20.131 -
  20.132 -unsigned long
  20.133 -__ia64_readq (void __iomem *addr)
  20.134 -{
  20.135 -	return ___ia64_readq (addr);
  20.136 -}
  20.137 -
  20.138 -unsigned char
  20.139 -__ia64_readb_relaxed (void __iomem *addr)
  20.140 -{
  20.141 -	return ___ia64_readb (addr);
  20.142 -}
  20.143 -
  20.144 -unsigned short
  20.145 -__ia64_readw_relaxed (void __iomem *addr)
  20.146 -{
  20.147 -	return ___ia64_readw (addr);
  20.148 -}
  20.149 -
  20.150 -unsigned int
  20.151 -__ia64_readl_relaxed (void __iomem *addr)
  20.152 -{
  20.153 -	return ___ia64_readl (addr);
  20.154 -}
  20.155 -
  20.156 -unsigned long
  20.157 -__ia64_readq_relaxed (void __iomem *addr)
  20.158 -{
  20.159 -	return ___ia64_readq (addr);
  20.160 -}
  20.161 -
  20.162 -void
  20.163 -__ia64_mmiowb(void)
  20.164 -{
  20.165 -	___ia64_mmiowb();
  20.166 -}
  20.167 -
  20.168 -#endif /* CONFIG_IA64_GENERIC */
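
A short usage sketch for the byte-wise MMIO helpers above (the device pointer and the 64-byte size are illustrative only):

/* Copy a small MMIO region into ordinary memory so it can be parsed
 * with normal loads; memcpy_fromio() does the byte-wise readb() loop. */
void snapshot_device_buffer(const volatile void __iomem *mmio)
{
	unsigned char copy[64];		/* arbitrary example size */

	memcpy_fromio(copy, mmio, sizeof(copy));
	/* copy now holds a coherent snapshot of the device buffer */
}
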
    21.1 --- a/xen/arch/ia64/linux/lib/ip_fast_csum.S	Tue Aug 30 17:51:51 2005 -0600
    21.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.3 @@ -1,90 +0,0 @@
    21.4 -/*
    21.5 - * Optimized version of the ip_fast_csum() function
    21.6 - * Used for calculating IP header checksum
    21.7 - *
    21.8 - * Return: 16bit checksum, complemented
    21.9 - *
   21.10 - * Inputs:
   21.11 - *      in0: address of buffer to checksum (char *)
   21.12 - *      in1: length of the buffer (int)
   21.13 - *
   21.14 - * Copyright (C) 2002 Intel Corp.
   21.15 - * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
   21.16 - */
   21.17 -
   21.18 -#include <asm/asmmacro.h>
   21.19 -
   21.20 -/*
   21.21 - * Since we know that most likely this function is called with buf aligned
   21.22 - * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
   21.23 - * versus calling the generic version of do_csum, which has lots of overhead in
   21.24 - * handling various alignments and sizes.  However, due to the lack of constraints
   21.25 - * put on the function input arguments, cases with alignment not on 4 bytes or
   21.26 - * size not equal to 20 bytes will be handled by the generic do_csum function.
   21.27 - */
   21.28 -
   21.29 -#define in0	r32
   21.30 -#define in1	r33
   21.31 -#define ret0	r8
   21.32 -
   21.33 -GLOBAL_ENTRY(ip_fast_csum)
   21.34 -	.prologue
   21.35 -	.body
   21.36 -	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
   21.37 -	and	r14=3,in0	// is it aligned on 4-byte?
   21.38 -	add	r15=4,in0	// second source pointer
   21.39 -	;;
   21.40 -	cmp.ne.or.andcm p6,p7=r14,r0
   21.41 -	;;
   21.42 -(p7)	ld4	r20=[in0],8
   21.43 -(p7)	ld4	r21=[r15],8
   21.44 -(p6)	br.spnt	.generic
   21.45 -	;;
   21.46 -	ld4	r22=[in0],8
   21.47 -	ld4	r23=[r15],8
   21.48 -	;;
   21.49 -	ld4	r24=[in0]
   21.50 -	add	r20=r20,r21
   21.51 -	add	r22=r22,r23
   21.52 -	;;
   21.53 -	add	r20=r20,r22
   21.54 -	;;
   21.55 -	add	r20=r20,r24
   21.56 -	;;
   21.57 -	shr.u	ret0=r20,16	// now need to add the carry
   21.58 -	zxt2	r20=r20
   21.59 -	;;
   21.60 -	add	r20=ret0,r20
   21.61 -	;;
   21.62 -	shr.u	ret0=r20,16	// add carry again
   21.63 -	zxt2	r20=r20
   21.64 -	;;
   21.65 -	add	r20=ret0,r20
   21.66 -	;;
   21.67 -	shr.u	ret0=r20,16
   21.68 -	zxt2	r20=r20
   21.69 -	;;
   21.70 -	add	r20=ret0,r20
   21.71 -	;;
   21.72 -	andcm	ret0=-1,r20
   21.73 -	.restore sp		// reset frame state
   21.74 -	br.ret.sptk.many b0
   21.75 -	;;
   21.76 -
   21.77 -.generic:
   21.78 -	.prologue
   21.79 -	.save ar.pfs, r35
   21.80 -	alloc	r35=ar.pfs,2,2,2,0
   21.81 -	.save rp, r34
   21.82 -	mov	r34=b0
   21.83 -	.body
   21.84 -	dep.z	out1=in1,2,30
   21.85 -	mov	out0=in0
   21.86 -	;;
   21.87 -	br.call.sptk.many b0=do_csum
   21.88 -	;;
   21.89 -	andcm	ret0=-1,ret0
   21.90 -	mov	ar.pfs=r35
   21.91 -	mov	b0=r34
   21.92 -	br.ret.sptk.many b0
   21.93 -END(ip_fast_csum)
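
Restated in C, the fast path above does the following (a sketch for the common case of a 4-byte-aligned, 20-byte header; the folding loop mirrors the shr.u/zxt2 sequence and the final andcm):

#include <stdint.h>

/* C sketch of ip_fast_csum's fast path: add five 32-bit words of an
 * aligned 20-byte IP header into a 64-bit sum, fold the carries down
 * to 16 bits, and return the one's complement. */
static uint16_t ip_fast_csum_sketch(const void *iph)
{
	const uint32_t *w = iph;	/* assumed 4-byte aligned */
	uint64_t sum = (uint64_t)w[0] + w[1] + w[2] + w[3] + w[4];

	while (sum >> 16)		/* shr.u/zxt2 carry folding */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* andcm ret0=-1,r20 */
}
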
    22.1 --- a/xen/arch/ia64/linux/lib/memcpy.S	Tue Aug 30 17:51:51 2005 -0600
    22.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.3 @@ -1,301 +0,0 @@
    22.4 -/*
    22.5 - *
    22.6 - * Optimized version of the standard memcpy() function
    22.7 - *
    22.8 - * Inputs:
    22.9 - * 	in0:	destination address
   22.10 - *	in1:	source address
   22.11 - *	in2:	number of bytes to copy
   22.12 - * Output:
   22.13 - * 	no return value
   22.14 - *
   22.15 - * Copyright (C) 2000-2001 Hewlett-Packard Co
   22.16 - *	Stephane Eranian <eranian@hpl.hp.com>
   22.17 - *	David Mosberger-Tang <davidm@hpl.hp.com>
   22.18 - */
   22.19 -#include <asm/asmmacro.h>
   22.20 -
   22.21 -GLOBAL_ENTRY(memcpy)
   22.22 -
   22.23 -#	define MEM_LAT	21		/* latency to memory */
   22.24 -
   22.25 -#	define dst	r2
   22.26 -#	define src	r3
   22.27 -#	define retval	r8
   22.28 -#	define saved_pfs r9
   22.29 -#	define saved_lc	r10
   22.30 -#	define saved_pr	r11
   22.31 -#	define cnt	r16
   22.32 -#	define src2	r17
   22.33 -#	define t0	r18
   22.34 -#	define t1	r19
   22.35 -#	define t2	r20
   22.36 -#	define t3	r21
   22.37 -#	define t4	r22
   22.38 -#	define src_end	r23
   22.39 -
   22.40 -#	define N	(MEM_LAT + 4)
   22.41 -#	define Nrot	((N + 7) & ~7)
   22.42 -
   22.43 -	/*
   22.44 -	 * First, check if everything (src, dst, len) is a multiple of eight.  If
   22.45 -	 * so, we handle everything with no taken branches (other than the loop
   22.46 -	 * itself) and a small icache footprint.  Otherwise, we jump off to
   22.47 -	 * the more general copy routine handling arbitrary
   22.48 -	 * sizes/alignment etc.
   22.49 -	 */
   22.50 -	.prologue
   22.51 -	.save ar.pfs, saved_pfs
   22.52 -	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
   22.53 -	.save ar.lc, saved_lc
   22.54 -	mov saved_lc=ar.lc
   22.55 -	or t0=in0,in1
   22.56 -	;;
   22.57 -
   22.58 -	or t0=t0,in2
   22.59 -	.save pr, saved_pr
   22.60 -	mov saved_pr=pr
   22.61 -
   22.62 -	.body
   22.63 -
   22.64 -	cmp.eq p6,p0=in2,r0	// zero length?
   22.65 -	mov retval=in0		// return dst
   22.66 -(p6)	br.ret.spnt.many rp	// zero length, return immediately
   22.67 -	;;
   22.68 -
   22.69 -	mov dst=in0		// copy because of rotation
   22.70 -	shr.u cnt=in2,3		// number of 8-byte words to copy
   22.71 -	mov pr.rot=1<<16
   22.72 -	;;
   22.73 -
   22.74 -	adds cnt=-1,cnt		// br.ctop is repeat/until
   22.75 -	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
   22.76 -	mov ar.ec=N
   22.77 -	;;
   22.78 -
   22.79 -	and t0=0x7,t0
   22.80 -	mov ar.lc=cnt
   22.81 -	;;
   22.82 -	cmp.ne p6,p0=t0,r0
   22.83 -
   22.84 -	mov src=in1		// copy because of rotation
   22.85 -(p7)	br.cond.spnt.few .memcpy_short
   22.86 -(p6)	br.cond.spnt.few .memcpy_long
   22.87 -	;;
   22.88 -	nop.m	0
   22.89 -	;;
   22.90 -	nop.m	0
   22.91 -	nop.i	0
   22.92 -	;;
   22.93 -	nop.m	0
   22.94 -	;;
   22.95 -	.rotr val[N]
   22.96 -	.rotp p[N]
   22.97 -	.align 32
   22.98 -1: { .mib
   22.99 -(p[0])	ld8 val[0]=[src],8
  22.100 -	nop.i 0
  22.101 -	brp.loop.imp 1b, 2f
  22.102 -}
  22.103 -2: { .mfb
  22.104 -(p[N-1])st8 [dst]=val[N-1],8
  22.105 -	nop.f 0
  22.106 -	br.ctop.dptk.few 1b
  22.107 -}
  22.108 -	;;
  22.109 -	mov ar.lc=saved_lc
  22.110 -	mov pr=saved_pr,-1
  22.111 -	mov ar.pfs=saved_pfs
  22.112 -	br.ret.sptk.many rp
  22.113 -
  22.114 -	/*
   22.115 -	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
  22.116 -	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
  22.117 -	 * get used very often (gcc inlines small copies) and due to atomicity
  22.118 -	 * issues, we want to avoid read-modify-write of entire words.
  22.119 -	 */
  22.120 -	.align 32
  22.121 -.memcpy_short:
  22.122 -	adds cnt=-1,in2		// br.ctop is repeat/until
  22.123 -	mov ar.ec=MEM_LAT
  22.124 -	brp.loop.imp 1f, 2f
  22.125 -	;;
  22.126 -	mov ar.lc=cnt
  22.127 -	;;
  22.128 -	nop.m	0
  22.129 -	;;
  22.130 -	nop.m	0
  22.131 -	nop.i	0
  22.132 -	;;
  22.133 -	nop.m	0
  22.134 -	;;
  22.135 -	nop.m	0
  22.136 -	;;
  22.137 -	/*
  22.138 -	 * It is faster to put a stop bit in the loop here because it makes
  22.139 -	 * the pipeline shorter (and latency is what matters on short copies).
  22.140 -	 */
  22.141 -	.align 32
  22.142 -1: { .mib
  22.143 -(p[0])	ld1 val[0]=[src],1
  22.144 -	nop.i 0
  22.145 -	brp.loop.imp 1b, 2f
  22.146 -} ;;
  22.147 -2: { .mfb
  22.148 -(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
  22.149 -	nop.f 0
  22.150 -	br.ctop.dptk.few 1b
  22.151 -} ;;
  22.152 -	mov ar.lc=saved_lc
  22.153 -	mov pr=saved_pr,-1
  22.154 -	mov ar.pfs=saved_pfs
  22.155 -	br.ret.sptk.many rp
  22.156 -
  22.157 -	/*
  22.158 -	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
  22.159 -	 * an overriding concern here, but throughput is.  We first do
  22.160 -	 * sub-word copying until the destination is aligned, then we check
  22.161 -	 * if the source is also aligned.  If so, we do a simple load/store-loop
  22.162 -	 * until there are less than 8 bytes left over and then we do the tail,
  22.163 -	 * by storing the last few bytes using sub-word copying.  If the source
  22.164 -	 * is not aligned, we branch off to the non-congruent loop.
  22.165 -	 *
  22.166 -	 *   stage:   op:
  22.167 -	 *         0  ld
  22.168 -	 *	   :
  22.169 -	 * MEM_LAT+3  shrp
  22.170 -	 * MEM_LAT+4  st
  22.171 -	 *
   22.172 -	 * On Itanium, the pipeline itself runs without stalls.  However, br.ctop
   22.173 -	 * seems to introduce an unavoidable bubble in the pipeline, so the overall
   22.174 -	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
   22.175 -	 * of 4 bytes/cycle.  Still not bad.
  22.176 -	 */
  22.177 -#	undef N
  22.178 -#	undef Nrot
  22.179 -#	define N	(MEM_LAT + 5)		/* number of stages */
  22.180 -#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
  22.181 -
  22.182 -#define LOG_LOOP_SIZE	6
  22.183 -
  22.184 -.memcpy_long:
  22.185 -	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
  22.186 -	and t0=-8,src		// t0 = src & ~7
  22.187 -	and t2=7,src		// t2 = src & 7
  22.188 -	;;
  22.189 -	ld8 t0=[t0]		// t0 = 1st source word
  22.190 -	adds src2=7,src		// src2 = (src + 7)
  22.191 -	sub t4=r0,dst		// t4 = -dst
  22.192 -	;;
  22.193 -	and src2=-8,src2	// src2 = (src + 7) & ~7
  22.194 -	shl t2=t2,3		// t2 = 8*(src & 7)
  22.195 -	shl t4=t4,3		// t4 = 8*(dst & 7)
  22.196 -	;;
  22.197 -	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
  22.198 -	sub t3=64,t2		// t3 = 64-8*(src & 7)
  22.199 -	shr.u t0=t0,t2
  22.200 -	;;
  22.201 -	add src_end=src,in2
  22.202 -	shl t1=t1,t3
  22.203 -	mov pr=t4,0x38		// (p5,p4,p3)=(dst & 7)
  22.204 -	;;
  22.205 -	or t0=t0,t1
  22.206 -	mov cnt=r0
  22.207 -	adds src_end=-1,src_end
  22.208 -	;;
  22.209 -(p3)	st1 [dst]=t0,1
  22.210 -(p3)	shr.u t0=t0,8
  22.211 -(p3)	adds cnt=1,cnt
  22.212 -	;;
  22.213 -(p4)	st2 [dst]=t0,2
  22.214 -(p4)	shr.u t0=t0,16
  22.215 -(p4)	adds cnt=2,cnt
  22.216 -	;;
  22.217 -(p5)	st4 [dst]=t0,4
  22.218 -(p5)	adds cnt=4,cnt
  22.219 -	and src_end=-8,src_end	// src_end = last word of source buffer
  22.220 -	;;
  22.221 -
   22.222 -	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
  22.223 -
  22.224 -1:{	add src=cnt,src			// make src point to remainder of source buffer
  22.225 -	sub cnt=in2,cnt			// cnt = number of bytes left to copy
  22.226 -	mov t4=ip
  22.227 -  }	;;
  22.228 -	and src2=-8,src			// align source pointer
  22.229 -	adds t4=.memcpy_loops-1b,t4
  22.230 -	mov ar.ec=N
  22.231 -
  22.232 -	and t0=7,src			// t0 = src & 7
  22.233 -	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
  22.234 -	shl cnt=cnt,3			// move bits 0-2 to 3-5
  22.235 -	;;
  22.236 -
  22.237 -	.rotr val[N+1], w[2]
  22.238 -	.rotp p[N]
  22.239 -
  22.240 -	cmp.ne p6,p0=t0,r0		// is src aligned, too?
  22.241 -	shl t0=t0,LOG_LOOP_SIZE		// t0 = 8*(src & 7)
  22.242 -	adds t2=-1,t2			// br.ctop is repeat/until
  22.243 -	;;
  22.244 -	add t4=t0,t4
   22.245 -	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
  22.246 -	mov ar.lc=t2
  22.247 -	;;
  22.248 -	nop.m	0
  22.249 -	;;
  22.250 -	nop.m	0
  22.251 -	nop.i	0
  22.252 -	;;
  22.253 -	nop.m	0
  22.254 -	;;
  22.255 -(p6)	ld8 val[1]=[src2],8		// prime the pump...
  22.256 -	mov b6=t4
  22.257 -	br.sptk.few b6
  22.258 -	;;
  22.259 -
  22.260 -.memcpy_tail:
  22.261 -	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
  22.262 -	// less than 8) and t0 contains the last few bytes of the src buffer:
  22.263 -(p5)	st4 [dst]=t0,4
  22.264 -(p5)	shr.u t0=t0,32
  22.265 -	mov ar.lc=saved_lc
  22.266 -	;;
  22.267 -(p4)	st2 [dst]=t0,2
  22.268 -(p4)	shr.u t0=t0,16
  22.269 -	mov ar.pfs=saved_pfs
  22.270 -	;;
  22.271 -(p3)	st1 [dst]=t0
  22.272 -	mov pr=saved_pr,-1
  22.273 -	br.ret.sptk.many rp
  22.274 -
  22.275 -///////////////////////////////////////////////////////
  22.276 -	.align 64
  22.277 -
  22.278 -#define COPY(shift,index)									\
  22.279 - 1: { .mib											\
  22.280 -	(p[0])		ld8 val[0]=[src2],8;							\
  22.281 -	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
  22.282 -			brp.loop.imp 1b, 2f							\
  22.283 -    };												\
  22.284 - 2: { .mfb											\
  22.285 -	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
  22.286 -			nop.f 0;								\
  22.287 -			br.ctop.dptk.few 1b;							\
  22.288 -    };												\
  22.289 -			;;									\
  22.290 -			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
  22.291 -			;;									\
  22.292 -			shrp t0=val[N-1],val[N-index],shift;					\
  22.293 -			br .memcpy_tail
  22.294 -.memcpy_loops:
  22.295 -	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
  22.296 -	COPY(8, 0)
  22.297 -	COPY(16, 0)
  22.298 -	COPY(24, 0)
  22.299 -	COPY(32, 0)
  22.300 -	COPY(40, 0)
  22.301 -	COPY(48, 0)
  22.302 -	COPY(56, 0)
  22.303 -
  22.304 -END(memcpy)
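
Aside: the block comments above describe the strategy; a rough C analogue
follows (hypothetical, for illustration only -- the real code additionally
software-pipelines the loop and uses shrp when source and destination are
not congruent):

	#include <stddef.h>
	#include <stdint.h>

	static void *memcpy_sketch(void *dst, const void *src, size_t len)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		/* head: byte stores until dst is 8-byte aligned */
		while (len && ((uintptr_t)d & 7)) {
			*d++ = *s++;
			len--;
		}
		/* body: 8-byte words, only when src is aligned too */
		if (!((uintptr_t)s & 7)) {
			while (len >= 8) {
				*(uint64_t *)(void *)d = *(const uint64_t *)(const void *)s;
				d += 8; s += 8; len -= 8;
			}
		}
		/* tail (or non-congruent source): byte copies */
		while (len--)
			*d++ = *s++;
		return dst;
	}
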
    23.1 --- a/xen/arch/ia64/linux/lib/memcpy_mck.S	Tue Aug 30 17:51:51 2005 -0600
    23.2 +++ b/xen/arch/ia64/linux/lib/memcpy_mck.S	Wed Aug 31 14:32:27 2005 -0600
    23.3 @@ -75,6 +75,7 @@ GLOBAL_ENTRY(memcpy)
    23.4  	mov	f6=f0
    23.5  	br.cond.sptk .common_code
    23.6  	;;
    23.7 +END(memcpy)
    23.8  GLOBAL_ENTRY(__copy_user)
    23.9  	.prologue
   23.10  // check dest alignment
   23.11 @@ -300,7 +301,7 @@ EK(.ex_handler,	(p[D])	st8 [dst1] = t15,
   23.12  	add	src_pre_mem=0,src0	// prefetch src pointer
   23.13  	add	dst_pre_mem=0,dst0	// prefetch dest pointer
   23.14  	and	src0=-8,src0		// 1st src pointer
   23.15 -(p7)	mov	ar.lc = r21
   23.16 +(p7)	mov	ar.lc = cnt
   23.17  (p8)	mov	ar.lc = r0
   23.18  	;;
   23.19  	TEXT_ALIGN(32)
   23.20 @@ -524,7 +525,6 @@ EK(.ex_handler,  (p17)	st8	[dst1]=r39,8)
   23.21  #undef B
   23.22  #undef C
   23.23  #undef D
   23.24 -END(memcpy)
   23.25  
   23.26  /*
   23.27   * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
    24.1 --- a/xen/arch/ia64/linux/lib/memset.S	Tue Aug 30 17:51:51 2005 -0600
    24.2 +++ b/xen/arch/ia64/linux/lib/memset.S	Wed Aug 31 14:32:27 2005 -0600
    24.3 @@ -57,10 +57,10 @@ GLOBAL_ENTRY(memset)
    24.4  { .mmi
    24.5  	.prologue
    24.6  	alloc	tmp = ar.pfs, 3, 0, 0, 0
    24.7 -	.body
    24.8  	lfetch.nt1 [dest]			//
    24.9  	.save   ar.lc, save_lc
   24.10  	mov.i	save_lc = ar.lc
   24.11 +	.body
   24.12  } { .mmi
   24.13  	mov	ret0 = dest			// return value
   24.14  	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
    25.1 --- a/xen/arch/ia64/linux/lib/strlen_user.S	Tue Aug 30 17:51:51 2005 -0600
    25.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.3 @@ -1,198 +0,0 @@
    25.4 -/*
    25.5 - * Optimized version of the strlen_user() function
    25.6 - *
    25.7 - * Inputs:
    25.8 - *	in0	address of buffer
    25.9 - *
   25.10 - * Outputs:
   25.11 - *	ret0	0 in case of fault, strlen(buffer)+1 otherwise
   25.12 - *
   25.13 - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
   25.14 - *	David Mosberger-Tang <davidm@hpl.hp.com>
   25.15 - *	Stephane Eranian <eranian@hpl.hp.com>
   25.16 - *
   25.17 - * 01/19/99 S.Eranian heavily enhanced version (see details below)
   25.18 - * 09/24/99 S.Eranian added speculation recovery code
   25.19 - */
   25.20 -
   25.21 -#include <asm/asmmacro.h>
   25.22 -
   25.23 -//
   25.24 -// int strlen_user(char *)
   25.25 -// ------------------------
   25.26 -// Returns:
   25.27 -//	- length of string + 1
   25.28 -//	- 0 in case an exception is raised
   25.29 -//
    25.30 -// This is an enhanced version of the basic strlen_user. It includes a
   25.31 -// combination of compute zero index (czx), parallel comparisons, speculative
   25.32 -// loads and loop unroll using rotating registers.
   25.33 -//
   25.34 -// General Ideas about the algorithm:
    25.35 -//	  The goal is to look at the string in chunks of 8 bytes,
    25.36 -//	  so we need to do a few extra checks at the beginning because the
    25.37 -//	  string may not be 8-byte aligned. In this case we load the 8-byte
   25.38 -//	  quantity which includes the start of the string and mask the unused
   25.39 -//	  bytes with 0xff to avoid confusing czx.
   25.40 -//	  We use speculative loads and software pipelining to hide memory
   25.41 -//	  latency and do read ahead safely. This way we defer any exception.
   25.42 -//
    25.43 -//	  Because we don't want the kernel to rely on particular
   25.44 -//	  settings of the DCR register, we provide recovery code in case
   25.45 -//	  speculation fails. The recovery code is going to "redo" the work using
   25.46 -//	  only normal loads. If we still get a fault then we return an
   25.47 -//	  error (ret0=0). Otherwise we return the strlen+1 as usual.
    25.48 -//	  Speculation may fail, for instance, because the DCR.dm bit is
    25.49 -//	  set. In this case TLB misses are deferred, i.e.,
   25.50 -//	  a NaT bit will be set if the translation is not present. The normal
   25.51 -//	  load, on the other hand, will cause the translation to be inserted
   25.52 -//	  if the mapping exists.
   25.53 -//
   25.54 -//	  It should be noted that we execute recovery code only when we need
   25.55 -//	  to use the data that has been speculatively loaded: we don't execute
   25.56 -//	  recovery code on pure read ahead data.
   25.57 -//
   25.58 -// Remarks:
   25.59 -//	- the cmp r0,r0 is used as a fast way to initialize a predicate
   25.60 -//	  register to 1. This is required to make sure that we get the parallel
   25.61 -//	  compare correct.
   25.62 -//
   25.63 -//	- we don't use the epilogue counter to exit the loop but we need to set
   25.64 -//	  it to zero beforehand.
   25.65 -//
   25.66 -//	- after the loop we must test for Nat values because neither the
   25.67 -//	  czx nor cmp instruction raise a NaT consumption fault. We must be
   25.68 -//	  careful not to look too far for a Nat for which we don't care.
   25.69 -//	  For instance we don't need to look at a NaT in val2 if the zero byte
   25.70 -//	  was in val1.
   25.71 -//
   25.72 -//	- Clearly performance tuning is required.
   25.73 -//
   25.74 -
   25.75 -#define saved_pfs	r11
   25.76 -#define	tmp		r10
   25.77 -#define base		r16
   25.78 -#define orig		r17
   25.79 -#define saved_pr	r18
   25.80 -#define src		r19
   25.81 -#define mask		r20
   25.82 -#define val		r21
   25.83 -#define val1		r22
   25.84 -#define val2		r23
   25.85 -
   25.86 -GLOBAL_ENTRY(__strlen_user)
   25.87 -	.prologue
   25.88 -	.save ar.pfs, saved_pfs
   25.89 -	alloc saved_pfs=ar.pfs,11,0,0,8
   25.90 -
   25.91 -	.rotr v[2], w[2]	// declares our 4 aliases
   25.92 -
   25.93 -	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
    25.94 -	mov orig=in0		// keep track of initial byte address
   25.95 -	dep src=0,in0,0,3	// src=8byte-aligned in0 address
   25.96 -	.save pr, saved_pr
   25.97 -	mov saved_pr=pr		// preserve predicates (rotation)
   25.98 -	;;
   25.99 -
  25.100 -	.body
  25.101 -
  25.102 -	ld8.s v[1]=[src],8	// load the initial 8bytes (must speculate)
  25.103 -	shl tmp=tmp,3		// multiply by 8bits/byte
  25.104 -	mov mask=-1		// our mask
  25.105 -	;;
  25.106 -	ld8.s w[1]=[src],8	// load next 8 bytes in 2nd pipeline
  25.107 -	cmp.eq p6,p0=r0,r0	// sets p6 (required because of // cmp.and)
  25.108 -	sub tmp=64,tmp		// how many bits to shift our mask on the right
  25.109 -	;;
   25.110 -	shr.u	mask=mask,tmp	// zero enough bits to hold v[1]'s valuable part
  25.111 -	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
  25.112 -	;;
  25.113 -	add base=-16,src	// keep track of aligned base
  25.114 -	chk.s v[1], .recover	// if already NaT, then directly skip to recover
  25.115 -	or v[1]=v[1],mask	// now we have a safe initial byte pattern
  25.116 -	;;
  25.117 -1:
  25.118 -	ld8.s v[0]=[src],8	// speculatively load next
  25.119 -	czx1.r val1=v[1]	// search 0 byte from right
  25.120 -	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
  25.121 -	;;
  25.122 -	ld8.s w[0]=[src],8	// speculatively load next to next
  25.123 -	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
   25.124 -	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
  25.125 -(p6)	br.wtop.dptk.few 1b	// loop until p6 == 0
  25.126 -	;;
  25.127 -	//
   25.128 -	// We must try the recovery code iff
  25.129 -	// val1_is_nat || (val1==8 && val2_is_nat)
  25.130 -	//
  25.131 -	// XXX Fixme
  25.132 -	//	- there must be a better way of doing the test
  25.133 -	//
   25.134 -	cmp.eq  p8,p9=8,val1	// p8 = no zero byte in val1 (disambiguate)
  25.135 -	tnat.nz p6,p7=val1	// test NaT on val1
  25.136 -(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
  25.137 -	;;
  25.138 -	//
  25.139 -	// if we come here p7 is true, i.e., initialized for // cmp
  25.140 -	//
  25.141 -	cmp.eq.and  p7,p0=8,val1// val1==8?
   25.142 -	tnat.nz.and p7,p0=val2	// test NaT on val2
  25.143 -(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
  25.144 -	;;
  25.145 -(p8)	mov val1=val2		// val2 contains the value
  25.146 -(p8)	adds src=-16,src	// correct position when 3 ahead
  25.147 -(p9)	adds src=-24,src	// correct position when 4 ahead
  25.148 -	;;
  25.149 -	sub ret0=src,orig	// distance from origin
  25.150 -	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
  25.151 -	mov pr=saved_pr,0xffffffffffff0000
  25.152 -	;;
  25.153 -	sub ret0=ret0,tmp	// length=now - back -1
  25.154 -	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  25.155 -	br.ret.sptk.many rp	// end of normal execution
  25.156 -
  25.157 -	//
  25.158 -	// Outlined recovery code when speculation failed
  25.159 -	//
  25.160 -	// This time we don't use speculation and rely on the normal exception
   25.161 -	// mechanism. That's why the loop is not as good as the previous one
  25.162 -	// because read ahead is not possible
  25.163 -	//
  25.164 -	// XXX Fixme
  25.165 -	//	- today we restart from the beginning of the string instead
  25.166 -	//	  of trying to continue where we left off.
  25.167 -	//
  25.168 -.recover:
  25.169 -	EX(.Lexit1, ld8 val=[base],8)	// load the initial bytes
  25.170 -	;;
  25.171 -	or val=val,mask			// remask first bytes
  25.172 -	cmp.eq p0,p6=r0,r0		// nullify first ld8 in loop
  25.173 -	;;
  25.174 -	//
  25.175 -	// ar.ec is still zero here
  25.176 -	//
  25.177 -2:
  25.178 -	EX(.Lexit1, (p6) ld8 val=[base],8)
  25.179 -	;;
  25.180 -	czx1.r val1=val		// search 0 byte from right
  25.181 -	;;
  25.182 -	cmp.eq p6,p0=8,val1	// val1==8 ?
  25.183 -(p6)	br.wtop.dptk.few 2b	// loop until p6 == 0
  25.184 -	;;
  25.185 -	sub ret0=base,orig	// distance from base
  25.186 -	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
  25.187 -	mov pr=saved_pr,0xffffffffffff0000
  25.188 -	;;
  25.189 -	sub ret0=ret0,tmp	// length=now - back -1
  25.190 -	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  25.191 -	br.ret.sptk.many rp	// end of successful recovery code
  25.192 -
  25.193 -	//
  25.194 -	// We failed even on the normal load (called from exception handler)
  25.195 -	//
  25.196 -.Lexit1:
  25.197 -	mov ret0=0
  25.198 -	mov pr=saved_pr,0xffffffffffff0000
  25.199 -	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  25.200 -	br.ret.sptk.many rp
  25.201 -END(__strlen_user)
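
Aside: stripped of the speculation and fault recovery, the algorithm the
header comment describes reduces to the C sketch below (hypothetical; note
it returns strlen rather than the strlen+1 this routine returns, and a plain
load stands in for ld8.s):

	#include <stdint.h>

	static unsigned long strlen_sketch(const char *str)
	{
		const uint64_t *p = (const uint64_t *)((uintptr_t)str & ~(uintptr_t)7);
		unsigned int skip = (uintptr_t)str & 7;
		/* force the bytes before the start to 0xff (little-endian),
		 * so they cannot look like the terminator */
		uint64_t w = *p++ | (skip ? (((uint64_t)1 << (8 * skip)) - 1) : 0);

		for (;;) {
			unsigned int i;
			for (i = 0; i < 8; i++)		/* czx1.r analogue */
				if (((w >> (8 * i)) & 0xff) == 0)
					return (unsigned long)((const char *)(p - 1) - str + i);
			w = *p++;
		}
	}
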
    26.1 --- a/xen/arch/ia64/linux/lib/strncpy_from_user.S	Tue Aug 30 17:51:51 2005 -0600
    26.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.3 @@ -1,44 +0,0 @@
    26.4 -/*
    26.5 - * Just like strncpy() except that if a fault occurs during copying,
    26.6 - * -EFAULT is returned.
    26.7 - *
    26.8 - * Inputs:
    26.9 - *	in0:	address of destination buffer
   26.10 - *	in1:	address of string to be copied
   26.11 - *	in2:	length of buffer in bytes
   26.12 - * Outputs:
   26.13 - *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
   26.14 - *
   26.15 - * Copyright (C) 1998-2001 Hewlett-Packard Co
   26.16 - * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
   26.17 - *
   26.18 - * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
    26.19 - *			 Andreas Schwab <schwab@suse.de>).
   26.20 - */
   26.21 -
   26.22 -#include <asm/asmmacro.h>
   26.23 -
   26.24 -GLOBAL_ENTRY(__strncpy_from_user)
   26.25 -	alloc r2=ar.pfs,3,0,0,0
   26.26 -	mov r8=0
   26.27 -	mov r9=in1
   26.28 -	;;
   26.29 -	add r10=in1,in2
   26.30 -	cmp.eq p6,p0=r0,in2
   26.31 -(p6)	br.ret.spnt.many rp
   26.32 -
   26.33 -	// XXX braindead copy loop---this needs to be optimized
   26.34 -.Loop1:
   26.35 -	EX(.Lexit, ld1 r8=[in1],1)
   26.36 -	;;
   26.37 -	EX(.Lexit, st1 [in0]=r8,1)
   26.38 -	cmp.ne p6,p7=r8,r0
   26.39 -	;;
   26.40 -(p6)	cmp.ne.unc p8,p0=in1,r10
   26.41 -(p8)	br.cond.dpnt.few .Loop1
   26.42 -	;;
   26.43 -(p6)	mov r8=in2		// buffer filled up---return buffer length
   26.44 -(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
   26.45 -[.Lexit:]
   26.46 -	br.ret.sptk.many rp
   26.47 -END(__strncpy_from_user)
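
Aside: ignoring the fault handling the EX() annotations provide (on a fault
the real routine returns -EFAULT), the copy loop implements these semantics
(hypothetical C sketch):

	/* copy at most 'len' bytes including the NUL; return the string
	 * length if a NUL was copied, or 'len' if the buffer filled up */
	static long strncpy_from_user_sketch(char *dst, const char *src,
					     unsigned long len)
	{
		unsigned long i;

		for (i = 0; i < len; i++) {
			char c = src[i];	/* really a faulting user load */
			dst[i] = c;
			if (c == '\0')
				return i;	/* excludes the NUL, as above */
		}
		return len;			/* buffer filled up */
	}
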
    27.1 --- a/xen/arch/ia64/linux/lib/strnlen_user.S	Tue Aug 30 17:51:51 2005 -0600
    27.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.3 @@ -1,45 +0,0 @@
    27.4 -/*
    27.5 - * Returns 0 if exception before NUL or reaching the supplied limit (N),
    27.6 - * a value greater than N if the string is longer than the limit, else
     27.7 - * strlen(buffer)+1.
    27.8 - *
    27.9 - * Inputs:
   27.10 - *	in0:	address of buffer
   27.11 - *	in1:	string length limit N
   27.12 - * Outputs:
   27.13 - *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
   27.14 - *
   27.15 - * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
   27.16 - */
   27.17 -
   27.18 -#include <asm/asmmacro.h>
   27.19 -
   27.20 -GLOBAL_ENTRY(__strnlen_user)
   27.21 -	.prologue
   27.22 -	alloc r2=ar.pfs,2,0,0,0
   27.23 -	.save ar.lc, r16
   27.24 -	mov r16=ar.lc			// preserve ar.lc
   27.25 -
   27.26 -	.body
   27.27 -
   27.28 -	add r3=-1,in1
   27.29 -	;;
   27.30 -	mov ar.lc=r3
   27.31 -	mov r9=0
   27.32 -	;;
   27.33 -	// XXX braindead strlen loop---this needs to be optimized
   27.34 -.Loop1:
   27.35 -	EXCLR(.Lexit, ld1 r8=[in0],1)
   27.36 -	add r9=1,r9
   27.37 -	;;
   27.38 -	cmp.eq p6,p0=r8,r0
   27.39 -(p6)	br.cond.dpnt .Lexit
   27.40 -	br.cloop.dptk.few .Loop1
   27.41 -
   27.42 -	add r9=1,in1			// NUL not found---return N+1
   27.43 -	;;
   27.44 -.Lexit:
   27.45 -	mov r8=r9
   27.46 -	mov ar.lc=r16			// restore ar.lc
   27.47 -	br.ret.sptk.many rp
   27.48 -END(__strnlen_user)
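
Aside: the same loop in C, minus the fault exit (hypothetical sketch; the
real routine returns 0 if a load faults):

	static unsigned long strnlen_user_sketch(const char *s, unsigned long n)
	{
		unsigned long i;

		for (i = 0; i < n; i++)
			if (s[i] == '\0')	/* really a faulting user load */
				return i + 1;	/* strlen+1, counting the NUL */
		return n + 1;			/* NUL not found within limit */
	}
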
    28.1 --- a/xen/arch/ia64/linux/lib/xor.S	Tue Aug 30 17:51:51 2005 -0600
    28.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.3 @@ -1,184 +0,0 @@
    28.4 -/*
    28.5 - * arch/ia64/lib/xor.S
    28.6 - *
    28.7 - * Optimized RAID-5 checksumming functions for IA-64.
    28.8 - *
    28.9 - * This program is free software; you can redistribute it and/or modify
   28.10 - * it under the terms of the GNU General Public License as published by
   28.11 - * the Free Software Foundation; either version 2, or (at your option)
   28.12 - * any later version.
   28.13 - *
   28.14 - * You should have received a copy of the GNU General Public License
   28.15 - * (for example /usr/src/linux/COPYING); if not, write to the Free
   28.16 - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   28.17 - */
   28.18 -
   28.19 -#include <asm/asmmacro.h>
   28.20 -
   28.21 -GLOBAL_ENTRY(xor_ia64_2)
   28.22 -	.prologue
   28.23 -	.fframe 0
   28.24 -	.save ar.pfs, r31
   28.25 -	alloc r31 = ar.pfs, 3, 0, 13, 16
   28.26 -	.save ar.lc, r30
   28.27 -	mov r30 = ar.lc
   28.28 -	.save pr, r29
   28.29 -	mov r29 = pr
   28.30 -	;;
   28.31 -	.body
   28.32 -	mov r8 = in1
   28.33 -	mov ar.ec = 6 + 2
   28.34 -	shr in0 = in0, 3
   28.35 -	;;
   28.36 -	adds in0 = -1, in0
   28.37 -	mov r16 = in1
   28.38 -	mov r17 = in2
   28.39 -	;;
   28.40 -	mov ar.lc = in0
   28.41 -	mov pr.rot = 1 << 16
   28.42 -	;;
   28.43 -	.rotr s1[6+1], s2[6+1], d[2]
   28.44 -	.rotp p[6+2]
   28.45 -0:
   28.46 -(p[0])	ld8.nta s1[0] = [r16], 8
   28.47 -(p[0])	ld8.nta s2[0] = [r17], 8
   28.48 -(p[6])	xor d[0] = s1[6], s2[6]
   28.49 -(p[6+1])st8.nta [r8] = d[1], 8
   28.50 -	nop.f 0
   28.51 -	br.ctop.dptk.few 0b
   28.52 -	;;
   28.53 -	mov ar.lc = r30
   28.54 -	mov pr = r29, -1
   28.55 -	br.ret.sptk.few rp
   28.56 -END(xor_ia64_2)
   28.57 -
   28.58 -GLOBAL_ENTRY(xor_ia64_3)
   28.59 -	.prologue
   28.60 -	.fframe 0
   28.61 -	.save ar.pfs, r31
   28.62 -	alloc r31 = ar.pfs, 4, 0, 20, 24
   28.63 -	.save ar.lc, r30
   28.64 -	mov r30 = ar.lc
   28.65 -	.save pr, r29
   28.66 -	mov r29 = pr
   28.67 -	;;
   28.68 -	.body
   28.69 -	mov r8 = in1
   28.70 -	mov ar.ec = 6 + 2
   28.71 -	shr in0 = in0, 3
   28.72 -	;;
   28.73 -	adds in0 = -1, in0
   28.74 -	mov r16 = in1
   28.75 -	mov r17 = in2
   28.76 -	;;
   28.77 -	mov r18 = in3
   28.78 -	mov ar.lc = in0
   28.79 -	mov pr.rot = 1 << 16
   28.80 -	;;
   28.81 -	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
   28.82 -	.rotp p[6+2]
   28.83 -0:
   28.84 -(p[0])	ld8.nta s1[0] = [r16], 8
   28.85 -(p[0])	ld8.nta s2[0] = [r17], 8
   28.86 -(p[6])	xor d[0] = s1[6], s2[6]
   28.87 -	;;
   28.88 -(p[0])	ld8.nta s3[0] = [r18], 8
   28.89 -(p[6+1])st8.nta [r8] = d[1], 8
   28.90 -(p[6])	xor d[0] = d[0], s3[6]
   28.91 -	br.ctop.dptk.few 0b
   28.92 -	;;
   28.93 -	mov ar.lc = r30
   28.94 -	mov pr = r29, -1
   28.95 -	br.ret.sptk.few rp
   28.96 -END(xor_ia64_3)
   28.97 -
   28.98 -GLOBAL_ENTRY(xor_ia64_4)
   28.99 -	.prologue
  28.100 -	.fframe 0
  28.101 -	.save ar.pfs, r31
  28.102 -	alloc r31 = ar.pfs, 5, 0, 27, 32
  28.103 -	.save ar.lc, r30
  28.104 -	mov r30 = ar.lc
  28.105 -	.save pr, r29
  28.106 -	mov r29 = pr
  28.107 -	;;
  28.108 -	.body
  28.109 -	mov r8 = in1
  28.110 -	mov ar.ec = 6 + 2
  28.111 -	shr in0 = in0, 3
  28.112 -	;;
  28.113 -	adds in0 = -1, in0
  28.114 -	mov r16 = in1
  28.115 -	mov r17 = in2
  28.116 -	;;
  28.117 -	mov r18 = in3
  28.118 -	mov ar.lc = in0
  28.119 -	mov pr.rot = 1 << 16
  28.120 -	mov r19 = in4
  28.121 -	;;
  28.122 -	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
  28.123 -	.rotp p[6+2]
  28.124 -0:
  28.125 -(p[0])	ld8.nta s1[0] = [r16], 8
  28.126 -(p[0])	ld8.nta s2[0] = [r17], 8
  28.127 -(p[6])	xor d[0] = s1[6], s2[6]
  28.128 -(p[0])	ld8.nta s3[0] = [r18], 8
  28.129 -(p[0])	ld8.nta s4[0] = [r19], 8
  28.130 -(p[6])	xor r20 = s3[6], s4[6]
  28.131 -	;;
  28.132 -(p[6+1])st8.nta [r8] = d[1], 8
  28.133 -(p[6])	xor d[0] = d[0], r20
  28.134 -	br.ctop.dptk.few 0b
  28.135 -	;;
  28.136 -	mov ar.lc = r30
  28.137 -	mov pr = r29, -1
  28.138 -	br.ret.sptk.few rp
  28.139 -END(xor_ia64_4)
  28.140 -
  28.141 -GLOBAL_ENTRY(xor_ia64_5)
  28.142 -	.prologue
  28.143 -	.fframe 0
  28.144 -	.save ar.pfs, r31
  28.145 -	alloc r31 = ar.pfs, 6, 0, 34, 40
  28.146 -	.save ar.lc, r30
  28.147 -	mov r30 = ar.lc
  28.148 -	.save pr, r29
  28.149 -	mov r29 = pr
  28.150 -	;;
  28.151 -	.body
  28.152 -	mov r8 = in1
  28.153 -	mov ar.ec = 6 + 2
  28.154 -	shr in0 = in0, 3
  28.155 -	;;
  28.156 -	adds in0 = -1, in0
  28.157 -	mov r16 = in1
  28.158 -	mov r17 = in2
  28.159 -	;;
  28.160 -	mov r18 = in3
  28.161 -	mov ar.lc = in0
  28.162 -	mov pr.rot = 1 << 16
  28.163 -	mov r19 = in4
  28.164 -	mov r20 = in5
  28.165 -	;;
  28.166 -	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
  28.167 -	.rotp p[6+2]
  28.168 -0:
  28.169 -(p[0])	ld8.nta s1[0] = [r16], 8
  28.170 -(p[0])	ld8.nta s2[0] = [r17], 8
  28.171 -(p[6])	xor d[0] = s1[6], s2[6]
  28.172 -(p[0])	ld8.nta s3[0] = [r18], 8
  28.173 -(p[0])	ld8.nta s4[0] = [r19], 8
  28.174 -(p[6])	xor r21 = s3[6], s4[6]
  28.175 -	;;
  28.176 -(p[0])	ld8.nta s5[0] = [r20], 8
  28.177 -(p[6+1])st8.nta [r8] = d[1], 8
  28.178 -(p[6])	xor d[0] = d[0], r21
  28.179 -	;;
  28.180 -(p[6])	  xor d[0] = d[0], s5[6]
  28.181 -	nop.f 0
  28.182 -	br.ctop.dptk.few 0b
  28.183 -	;;
  28.184 -	mov ar.lc = r30
  28.185 -	mov pr = r29, -1
  28.186 -	br.ret.sptk.few rp
  28.187 -END(xor_ia64_5)
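
Aside: each xor_ia64_N above computes the same thing, with N-1 extra source
blocks folded into the destination; in plain C (hypothetical sketch -- the
assembly additionally software-pipelines the loads with rotating registers):

	#include <stddef.h>
	#include <stdint.h>

	/* dst[i] ^= srcs[0][i] ^ ... ^ srcs[nsrc-1][i]; callers pass a
	 * byte count that is a multiple of 8 */
	static void xor_blocks_sketch(size_t bytes, uint64_t *dst,
				      uint64_t **srcs, unsigned int nsrc)
	{
		size_t i, words = bytes / 8;
		unsigned int j;

		for (i = 0; i < words; i++) {
			uint64_t v = dst[i];
			for (j = 0; j < nsrc; j++)
				v ^= srcs[j][i];
			dst[i] = v;
		}
	}
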
    29.1 --- a/xen/arch/ia64/linux/minstate.h	Tue Aug 30 17:51:51 2005 -0600
    29.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.3 @@ -1,254 +0,0 @@
    29.4 -#include <linux/config.h>
    29.5 -
    29.6 -#include <asm/cache.h>
    29.7 -
    29.8 -#include "entry.h"
    29.9 -
   29.10 -/*
    29.11 - * For ivt.S we want to access the stack virtually, so we don't have to disable translation
   29.12 - * on interrupts.
   29.13 - *
   29.14 - *  On entry:
   29.15 - *	r1:	pointer to current task (ar.k6)
   29.16 - */
   29.17 -#define MINSTATE_START_SAVE_MIN_VIRT								\
   29.18 -(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
   29.19 -	;;											\
   29.20 -(pUStk)	mov.m r24=ar.rnat;									\
   29.21 -(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
   29.22 -(pKStk) mov r1=sp;					/* get sp  */				\
   29.23 -	;;											\
   29.24 -(pUStk) lfetch.fault.excl.nt1 [r22];								\
   29.25 -(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
   29.26 -(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
   29.27 -	;;											\
   29.28 -(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
   29.29 -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
   29.30 -	;;											\
   29.31 -(pUStk)	mov r18=ar.bsp;										\
   29.32 -(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
   29.33 -
   29.34 -#define MINSTATE_END_SAVE_MIN_VIRT								\
   29.35 -	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
   29.36 -	;;
   29.37 -
   29.38 -/*
   29.39 - * For mca_asm.S we want to access the stack physically since the state is saved before we
   29.40 - * go virtual and don't want to destroy the iip or ipsr.
   29.41 - */
   29.42 -#define MINSTATE_START_SAVE_MIN_PHYS								\
   29.43 -(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;								\
   29.44 -(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;							\
   29.45 -(pKStk) ld8 r3 = [r3];;										\
   29.46 -(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;						\
   29.47 -(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;						\
   29.48 -(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
   29.49 -(pUStk)	addl r22=IA64_RBS_OFFSET,r1;		/* compute base of register backing store */	\
   29.50 -	;;											\
   29.51 -(pUStk)	mov r24=ar.rnat;									\
   29.52 -(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
   29.53 -(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
   29.54 -(pUStk)	dep r22=-1,r22,61,3;			/* compute kernel virtual addr of RBS */	\
   29.55 -	;;											\
   29.56 -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
   29.57 -(pUStk)	mov ar.bspstore=r22;			/* switch to kernel RBS */			\
   29.58 -	;;											\
   29.59 -(pUStk)	mov r18=ar.bsp;										\
   29.60 -(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
   29.61 -
   29.62 -#define MINSTATE_END_SAVE_MIN_PHYS								\
   29.63 -	dep r12=-1,r12,61,3;		/* make sp a kernel virtual address */			\
   29.64 -	;;
   29.65 -
   29.66 -#ifdef MINSTATE_VIRT
   29.67 -# define MINSTATE_GET_CURRENT(reg)	\
   29.68 -		movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
   29.69 -		ld8 reg=[reg]
   29.70 -# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_VIRT
   29.71 -# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_VIRT
   29.72 -#endif
   29.73 -
   29.74 -#ifdef MINSTATE_PHYS
   29.75 -# define MINSTATE_GET_CURRENT(reg)	mov reg=IA64_KR(CURRENT);; tpa reg=reg
   29.76 -# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_PHYS
   29.77 -# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_PHYS
   29.78 -#endif
   29.79 -
   29.80 -/*
   29.81 - * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
   29.82 - * the minimum state necessary that allows us to turn psr.ic back
   29.83 - * on.
   29.84 - *
   29.85 - * Assumed state upon entry:
   29.86 - *	psr.ic: off
   29.87 - *	r31:	contains saved predicates (pr)
   29.88 - *
   29.89 - * Upon exit, the state is as follows:
   29.90 - *	psr.ic: off
   29.91 - *	 r2 = points to &pt_regs.r16
   29.92 - *	 r8 = contents of ar.ccv
   29.93 - *	 r9 = contents of ar.csd
   29.94 - *	r10 = contents of ar.ssd
   29.95 - *	r11 = FPSR_DEFAULT
   29.96 - *	r12 = kernel sp (kernel virtual address)
   29.97 - *	r13 = points to current task_struct (kernel virtual address)
   29.98 - *	p15 = TRUE if psr.i is set in cr.ipsr
   29.99 - *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
  29.100 - *		preserved
  29.101 - *
  29.102 - * Note that psr.ic is NOT turned on by this macro.  This is so that
  29.103 - * we can pass interruption state as arguments to a handler.
  29.104 - */
  29.105 -#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
  29.106 -	MINSTATE_GET_CURRENT(r16);	/* M (or M;;I) */					\
  29.107 -	mov r27=ar.rsc;			/* M */							\
  29.108 -	mov r20=r1;			/* A */							\
  29.109 -	mov r25=ar.unat;		/* M */							\
  29.110 -	mov r29=cr.ipsr;		/* M */							\
  29.111 -	mov r26=ar.pfs;			/* I */							\
  29.112 -	mov r28=cr.iip;			/* M */							\
  29.113 -	mov r21=ar.fpsr;		/* M */							\
  29.114 -	COVER;				/* B;; (or nothing) */					\
  29.115 -	;;											\
  29.116 -	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
  29.117 -	;;											\
  29.118 -	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
  29.119 -	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
  29.120 -	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
  29.121 -	/* switch from user to kernel RBS: */							\
  29.122 -	;;											\
  29.123 -	invala;				/* M */							\
  29.124 -	SAVE_IFS;										\
  29.125 -	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
  29.126 -	;;											\
  29.127 -	MINSTATE_START_SAVE_MIN									\
  29.128 -	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
  29.129 -	adds r16=PT(CR_IPSR),r1;								\
  29.130 -	;;											\
  29.131 -	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
  29.132 -	st8 [r16]=r29;		/* save cr.ipsr */						\
  29.133 -	;;											\
  29.134 -	lfetch.fault.excl.nt1 [r17];								\
  29.135 -	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
  29.136 -	mov r29=b0										\
  29.137 -	;;											\
  29.138 -	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
  29.139 -	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
  29.140 -(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
  29.141 -	;;											\
  29.142 -.mem.offset 0,0; st8.spill [r16]=r8,16;								\
  29.143 -.mem.offset 8,0; st8.spill [r17]=r9,16;								\
  29.144 -        ;;											\
  29.145 -.mem.offset 0,0; st8.spill [r16]=r10,24;							\
  29.146 -.mem.offset 8,0; st8.spill [r17]=r11,24;							\
  29.147 -        ;;											\
  29.148 -	st8 [r16]=r28,16;	/* save cr.iip */						\
  29.149 -	st8 [r17]=r30,16;	/* save cr.ifs */						\
  29.150 -(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
  29.151 -	mov r8=ar.ccv;										\
  29.152 -	mov r9=ar.csd;										\
  29.153 -	mov r10=ar.ssd;										\
  29.154 -	movl r11=FPSR_DEFAULT;   /* L-unit */							\
  29.155 -	;;											\
  29.156 -	st8 [r16]=r25,16;	/* save ar.unat */						\
  29.157 -	st8 [r17]=r26,16;	/* save ar.pfs */						\
  29.158 -	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
  29.159 -	;;											\
  29.160 -	st8 [r16]=r27,16;	/* save ar.rsc */						\
  29.161 -(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
  29.162 -(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
  29.163 -	;;			/* avoid RAW on r16 & r17 */					\
  29.164 -(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
  29.165 -	st8 [r17]=r31,16;	/* save predicates */						\
  29.166 -(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
  29.167 -	;;											\
  29.168 -	st8 [r16]=r29,16;	/* save b0 */							\
  29.169 -	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
  29.170 -	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
  29.171 -	;;											\
  29.172 -.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
  29.173 -.mem.offset 8,0; st8.spill [r17]=r12,16;							\
  29.174 -	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
  29.175 -	;;											\
  29.176 -.mem.offset 0,0; st8.spill [r16]=r13,16;							\
  29.177 -.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
  29.178 -	movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;					\
  29.179 -	ld8 r13=[r13];			/* establish 'current' */				\
  29.180 -	;;											\
  29.181 -.mem.offset 0,0; st8.spill [r16]=r15,16;							\
  29.182 -.mem.offset 8,0; st8.spill [r17]=r14,16;							\
  29.183 -	;;											\
  29.184 -.mem.offset 0,0; st8.spill [r16]=r2,16;								\
  29.185 -.mem.offset 8,0; st8.spill [r17]=r3,16;								\
  29.186 -	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
  29.187 -	;;											\
  29.188 -	EXTRA;											\
  29.189 -	movl r1=__gp;		/* establish kernel global pointer */				\
  29.190 -	;;											\
  29.191 -	MINSTATE_END_SAVE_MIN
  29.192 -
  29.193 -/*
  29.194 - * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
  29.195 - *
  29.196 - * Assumed state upon entry:
  29.197 - *	psr.ic: on
  29.198 - *	r2:	points to &pt_regs.r16
  29.199 - *	r3:	points to &pt_regs.r17
  29.200 - *	r8:	contents of ar.ccv
  29.201 - *	r9:	contents of ar.csd
  29.202 - *	r10:	contents of ar.ssd
  29.203 - *	r11:	FPSR_DEFAULT
  29.204 - *
  29.205 - * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
  29.206 - */
  29.207 -#define SAVE_REST				\
  29.208 -.mem.offset 0,0; st8.spill [r2]=r16,16;		\
  29.209 -.mem.offset 8,0; st8.spill [r3]=r17,16;		\
  29.210 -	;;					\
  29.211 -.mem.offset 0,0; st8.spill [r2]=r18,16;		\
  29.212 -.mem.offset 8,0; st8.spill [r3]=r19,16;		\
  29.213 -	;;					\
  29.214 -.mem.offset 0,0; st8.spill [r2]=r20,16;		\
  29.215 -.mem.offset 8,0; st8.spill [r3]=r21,16;		\
  29.216 -	mov r18=b6;				\
  29.217 -	;;					\
  29.218 -.mem.offset 0,0; st8.spill [r2]=r22,16;		\
  29.219 -.mem.offset 8,0; st8.spill [r3]=r23,16;		\
  29.220 -	mov r19=b7;				\
  29.221 -	;;					\
  29.222 -.mem.offset 0,0; st8.spill [r2]=r24,16;		\
  29.223 -.mem.offset 8,0; st8.spill [r3]=r25,16;		\
  29.224 -	;;					\
  29.225 -.mem.offset 0,0; st8.spill [r2]=r26,16;		\
  29.226 -.mem.offset 8,0; st8.spill [r3]=r27,16;		\
  29.227 -	;;					\
  29.228 -.mem.offset 0,0; st8.spill [r2]=r28,16;		\
  29.229 -.mem.offset 8,0; st8.spill [r3]=r29,16;		\
  29.230 -	;;					\
  29.231 -.mem.offset 0,0; st8.spill [r2]=r30,16;		\
  29.232 -.mem.offset 8,0; st8.spill [r3]=r31,32;		\
  29.233 -	;;					\
  29.234 -	mov ar.fpsr=r11;	/* M-unit */	\
  29.235 -	st8 [r2]=r8,8;		/* ar.ccv */	\
  29.236 -	adds r24=PT(B6)-PT(F7),r3;		\
  29.237 -	;;					\
  29.238 -	stf.spill [r2]=f6,32;			\
  29.239 -	stf.spill [r3]=f7,32;			\
  29.240 -	;;					\
  29.241 -	stf.spill [r2]=f8,32;			\
  29.242 -	stf.spill [r3]=f9,32;			\
  29.243 -	;;					\
  29.244 -	stf.spill [r2]=f10;			\
  29.245 -	stf.spill [r3]=f11;			\
  29.246 -	adds r25=PT(B7)-PT(F11),r3;		\
  29.247 -	;;					\
  29.248 -	st8 [r24]=r18,16;       /* b6 */	\
  29.249 -	st8 [r25]=r19,16;       /* b7 */	\
  29.250 -	;;					\
  29.251 -	st8 [r24]=r9;        	/* ar.csd */	\
  29.252 -	st8 [r25]=r10;      	/* ar.ssd */	\
  29.253 -	;;
  29.254 -
  29.255 -#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(cover, mov r30=cr.ifs,)
  29.256 -#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
  29.257 -#define SAVE_MIN		DO_SAVE_MIN(     , mov r30=r0, )
    30.1 --- a/xen/arch/ia64/linux/pcdp.h	Tue Aug 30 17:51:51 2005 -0600
    30.2 +++ b/xen/arch/ia64/linux/pcdp.h	Wed Aug 31 14:32:27 2005 -0600
    30.3 @@ -2,7 +2,7 @@
    30.4   * Definitions for PCDP-defined console devices
    30.5   *
    30.6   * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
    30.7 - * v2.0:  http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf
    30.8 + * v2.0:  http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
    30.9   *
   30.10   * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
   30.11   *	Khalid Aziz <khalid.aziz@hp.com>
   30.12 @@ -52,11 +52,36 @@ struct pcdp_uart {
   30.13  	u32				clock_rate;
   30.14  	u8				pci_prog_intfc;
   30.15  	u8				flags;
   30.16 -};
   30.17 +	u16				conout_index;
   30.18 +	u32				reserved;
   30.19 +} __attribute__((packed));
   30.20 +
   30.21 +#define PCDP_IF_PCI	1
   30.22 +
   30.23 +/* pcdp_if_pci.trans */
   30.24 +#define PCDP_PCI_TRANS_IOPORT	0x02
   30.25 +#define PCDP_PCI_TRANS_MMIO	0x01
   30.26 +
   30.27 +struct pcdp_if_pci {
   30.28 +	u8			interconnect;
   30.29 +	u8			reserved;
   30.30 +	u16			length;
   30.31 +	u8			segment;
   30.32 +	u8			bus;
   30.33 +	u8			dev;
   30.34 +	u8			fun;
   30.35 +	u16			dev_id;
   30.36 +	u16			vendor_id;
   30.37 +	u32			acpi_interrupt;
   30.38 +	u64			mmio_tra;
   30.39 +	u64			ioport_tra;
   30.40 +	u8			flags;
   30.41 +	u8			trans;
   30.42 +} __attribute__((packed));
   30.43  
   30.44  struct pcdp_vga {
   30.45  	u8			count;		/* address space descriptors */
   30.46 -};
   30.47 +} __attribute__((packed));
   30.48  
   30.49  /* pcdp_device.flags */
   30.50  #define PCDP_PRIMARY_CONSOLE	1
   30.51 @@ -66,7 +91,9 @@ struct pcdp_device {
   30.52  	u8			flags;
   30.53  	u16			length;
   30.54  	u16			efi_index;
   30.55 -};
   30.56 +	/* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
   30.57 +	/* next data is device specific type (currently only pcdp_vga) */
   30.58 +} __attribute__((packed));
   30.59  
   30.60  struct pcdp {
   30.61  	u8			signature[4];
   30.62 @@ -81,4 +108,4 @@ struct pcdp {
   30.63  	u32			num_uarts;
   30.64  	struct pcdp_uart	uart[0];	/* actual size is num_uarts */
   30.65  	/* remainder of table is pcdp_device structures */
   30.66 -};
   30.67 +} __attribute__((packed));
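
Aside: the packed attributes added above matter because the PCDP is a
firmware-supplied byte layout, so the compiler must not insert padding or
every later field would shift.  A self-contained illustration (hypothetical
struct names, not from pcdp.h):

	#include <stdio.h>
	#include <stdint.h>

	struct pad_demo        { uint16_t idx; uint32_t reserved; };
	struct pad_demo_packed { uint16_t idx; uint32_t reserved; }
		__attribute__((packed));

	int main(void)
	{
		/* typically prints "8 vs 6": without packed, the u32 is
		 * aligned to a 4-byte boundary after the u16 */
		printf("%zu vs %zu\n", sizeof(struct pad_demo),
		       sizeof(struct pad_demo_packed));
		return 0;
	}
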
    31.1 --- a/xen/arch/ia64/pdb-stub.c	Tue Aug 30 17:51:51 2005 -0600
    31.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.3 @@ -1,59 +0,0 @@
    31.4 -
    31.5 -/*
    31.6 - * pervasive debugger
    31.7 - * www.cl.cam.ac.uk/netos/pdb
    31.8 - *
    31.9 - * alex ho
   31.10 - * 2004
   31.11 - * university of cambridge computer laboratory
   31.12 - *
   31.13 - * code adapted originally from kgdb, nemesis, & gdbserver
   31.14 - */
   31.15 -
   31.16 -#include <xen/lib.h>
   31.17 -#include <xen/sched.h>
   31.18 -#include <asm/ptrace.h>
   31.19 -#include <xen/keyhandler.h> 
   31.20 -#include <asm/processor.h>
   31.21 -#include <asm/pdb.h>
   31.22 -#include <xen/list.h>
   31.23 -#include <xen/serial.h>
   31.24 -
   31.25 -#define __PDB_GET_VAL 1
   31.26 -#define __PDB_SET_VAL 2
   31.27 -
   31.28 -/*
   31.29 - * Read or write memory in an address space
   31.30 - */
   31.31 -int pdb_change_values(u_char *buffer, int length,
   31.32 -		      unsigned long cr3, unsigned long addr, int rw)
   31.33 -{
   31.34 -	dummy();
   31.35 -	return 0;
   31.36 -}
   31.37 -
   31.38 -/*
   31.39 - * Set memory in a domain's address space
   31.40 - * Set "length" bytes at "address" from "domain" to the values in "buffer".
   31.41 - * Return the number of bytes set, 0 if there was a problem.
   31.42 - */
   31.43 -
   31.44 -int pdb_set_values(u_char *buffer, int length,
   31.45 -		   unsigned long cr3, unsigned long addr)
   31.46 -{
   31.47 -    int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL);
   31.48 -    return count;
   31.49 -}
   31.50 -
   31.51 -/*
   31.52 - * Read memory from a domain's address space.
   31.53 - * Fetch "length" bytes at "address" from "domain" into "buffer".
   31.54 - * Return the number of bytes read, 0 if there was a problem.
   31.55 - */
   31.56 -
   31.57 -int pdb_get_values(u_char *buffer, int length,
   31.58 -		   unsigned long cr3, unsigned long addr)
   31.59 -{
   31.60 -  return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL);
   31.61 -}
   31.62 -
    32.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.2 +++ b/xen/include/asm-ia64/linux/sort.h	Wed Aug 31 14:32:27 2005 -0600
    32.3 @@ -0,0 +1,10 @@
    32.4 +#ifndef _LINUX_SORT_H
    32.5 +#define _LINUX_SORT_H
    32.6 +
    32.7 +#include <linux/types.h>
    32.8 +
    32.9 +void sort(void *base, size_t num, size_t size,
   32.10 +	  int (*cmp)(const void *, const void *),
   32.11 +	  void (*swap)(void *, void *, int));
   32.12 +
   32.13 +#endif
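
Aside: a minimal caller of the interface declared above (hypothetical
sketch; the lib/sort.c implementation this header fronts, built here as
sort.o, accepts a NULL swap and falls back to its built-in byte-wise swap):

	static int cmp_int(const void *a, const void *b)
	{
		int x = *(const int *)a, y = *(const int *)b;

		return (x > y) - (x < y);	/* avoids subtraction overflow */
	}

	/* int vals[] = { 3, 1, 2 };
	 * sort(vals, 3, sizeof(int), cmp_int, NULL); */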