ia64/xen-unstable

changeset 6454:b7276814008c

Begin updating to 2.6.13 base
author djm@kirby.fc.hp.com
date Wed Aug 31 14:32:27 2005 -0600 (2005-08-31)
parents 44316ce83277
children 4e4f1db8ea94
files	xen/arch/ia64/Makefile
	xen/arch/ia64/linux-xen/minstate.h
	xen/arch/ia64/linux-xen/setup.c
	xen/arch/ia64/linux-xen/sort.c
	xen/arch/ia64/linux/README.origin
	xen/arch/ia64/linux/extable.c
	xen/arch/ia64/linux/ia64_ksyms.c
	xen/arch/ia64/linux/irq_lsapic.c
	xen/arch/ia64/linux/lib/flush.S
	xen/arch/ia64/linux/lib/memcpy_mck.S
	xen/arch/ia64/linux/lib/memset.S
	xen/arch/ia64/linux/pcdp.h
	xen/include/asm-ia64/linux/sort.h
line diff
     1.1 --- a/xen/arch/ia64/Makefile	Tue Aug 30 17:51:51 2005 -0600
     1.2 +++ b/xen/arch/ia64/Makefile	Wed Aug 31 14:32:27 2005 -0600
     1.3 @@ -1,19 +1,22 @@
     1.4  include $(BASEDIR)/Rules.mk
     1.5  
     1.6 -VPATH = linux linux-xen
     1.7 +VPATH = linux linux-xen linux/lib
     1.8 +#VPATH = linux-xen linux/lib
     1.9  
    1.10  # libs-y	+= arch/ia64/lib/lib.a
    1.11  
    1.12  OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \
    1.13 -	xenmisc.o pdb-stub.o acpi.o hypercall.o \
    1.14 +	xenmisc.o acpi.o hypercall.o \
    1.15  	machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \
    1.16  	idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \
    1.17  	xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \
    1.18 -	extable.o linuxextable.o xenirq.o xentime.o \
    1.19 +	extable.o linuxextable.o sort.o xenirq.o xentime.o \
    1.20  	regionreg.o entry.o unaligned.o privop.o vcpu.o \
    1.21  	irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \
    1.22  	grant_table.o sn_console.o
    1.23  
    1.24 +#OBJS += idiv64.o idiv32.o			\
    1.25 +
    1.26  # TMP holder to contain *.0 moved out of CONFIG_VTI
    1.27  OBJS += vmx_init.o
    1.28  
    1.29 @@ -22,6 +25,13 @@ OBJS += vmx_virt.o vmx_vcpu.o vmx_proces
    1.30  	vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \
    1.31  	vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o
    1.32  endif
    1.33 +
    1.34 +# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
    1.35 +OBJS +=	bitop.o clear_page.o flush.o copy_page_mck.o			\
    1.36 +	memset.o strlen.o memcpy_mck.o 					\
    1.37 +	__divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
    1.38 +	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
    1.39 +
    1.40  # perfmon.o
    1.41  # unwind.o needed for kernel unwinding (rare)
    1.42  
    1.43 @@ -30,8 +40,8 @@ OBJS := $(subst $(TARGET_ARCH)/asm-offse
    1.44  # remove following line if not privifying in memory
    1.45  # OBJS += privify.o
    1.46  
    1.47 -default: $(OBJS) head.o ia64lib.o xen.lds.s
    1.48 -	$(LD) -r -o arch.o $(OBJS) ia64lib.o
    1.49 +default: $(OBJS) head.o xen.lds.s
    1.50 +	$(LD) -r -o arch.o $(OBJS)
    1.51  	$(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \
    1.52  		-Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms
    1.53  	$(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET)
    1.54 @@ -79,12 +89,29 @@ xen.lds.s: xen.lds.S
    1.55  	$(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \
    1.56  		-o xen.lds.s xen.lds.S
    1.57  
    1.58 -ia64lib.o:
    1.59 -	$(MAKE) -C linux/lib && cp linux/lib/ia64lib.o .
    1.60 +# variants of divide/modulo
    1.61 +# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
    1.62 +__divdi3.o: idiv64.S
    1.63 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    1.64 +__udivdi3.o: idiv64.S
    1.65 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    1.66 +__moddi3.o: idiv64.S
    1.67 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    1.68 +__umoddi3.o: idiv64.S
    1.69 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    1.70 +__divsi3.o: idiv32.S
    1.71 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    1.72 +__udivsi3.o: idiv32.S
    1.73 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    1.74 +__modsi3.o: idiv32.S
    1.75 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    1.76 +__umodsi3.o: idiv32.S
    1.77 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    1.78 +
    1.79  
    1.80  clean:
    1.81  	rm -f *.o *~ core  xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s
    1.82  	rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h
    1.83 -	rm -f lib/*.o
    1.84 +	rm -f linux/lib/*.o
    1.85  
    1.86  .PHONY: default clean
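
The rule block above replaces the old ia64lib.o sub-make: the eight libgcc-style
divide/modulo helpers are now built in place, each from one of just two assembly
sources (idiv64.S and idiv32.S) compiled repeatedly with different -D flags.
(The repeated -c in those recipes is redundant but harmless.)  As a hedged
illustration of the selection pattern only -- the real idiv*.S computes
quotients with floating-point reciprocal approximation, not the C operators
used here:

    /* Illustrative C analogue of the -DUNSIGNED/-DMODULO selection in
     * idiv64.S.  The entry-point names mirror the libgcc helpers; the
     * bodies are placeholders, not the actual algorithm. */
    #ifdef UNSIGNED
    typedef unsigned long long op64_t;
    # ifdef MODULO
    #  define OP_NAME sketch__umoddi3
    # else
    #  define OP_NAME sketch__udivdi3
    # endif
    #else
    typedef long long op64_t;
    # ifdef MODULO
    #  define OP_NAME sketch__moddi3
    # else
    #  define OP_NAME sketch__divdi3
    # endif
    #endif

    op64_t OP_NAME(op64_t a, op64_t b)
    {
    #ifdef MODULO
            return a % b;           /* -DMODULO variants */
    #else
            return a / b;           /* divide variants */
    #endif
    }

Compiling this four times with the same flag combinations as the rules above
(none, -DUNSIGNED, -DMODULO, -DMODULO -DUNSIGNED) yields the four 64-bit entry
points; idiv32.S repeats the trick for the 32-bit set.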
     2.1 --- a/xen/arch/ia64/lib/Makefile	Tue Aug 30 17:51:51 2005 -0600
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,44 +0,0 @@
     2.4 -#
     2.5 -# Makefile for ia64-specific library routines..
     2.6 -#
     2.7 -
     2.8 -include $(BASEDIR)/Rules.mk
     2.9 -
    2.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
    2.11 -	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
    2.12 -	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
    2.13 -	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
    2.14 -	flush.o ip_fast_csum.o do_csum.o copy_user.o			\
    2.15 -	memset.o strlen.o memcpy.o 
    2.16 -
    2.17 -default: $(OBJS)
    2.18 -	$(LD) -r -o ia64lib.o $(OBJS)
    2.19 -
    2.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
    2.21 -
    2.22 -__divdi3.o: idiv64.S
    2.23 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    2.24 -
    2.25 -__udivdi3.o: idiv64.S
    2.26 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    2.27 -
    2.28 -__moddi3.o: idiv64.S
    2.29 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    2.30 -
    2.31 -__umoddi3.o: idiv64.S
    2.32 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    2.33 -
    2.34 -__divsi3.o: idiv32.S
    2.35 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    2.36 -
    2.37 -__udivsi3.o: idiv32.S
    2.38 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    2.39 -
    2.40 -__modsi3.o: idiv32.S
    2.41 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    2.42 -
    2.43 -__umodsi3.o: idiv32.S
    2.44 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    2.45 -
    2.46 -clean:
    2.47 -	rm -f *.o *~
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/ia64/linux-xen/minstate.h	Wed Aug 31 14:32:27 2005 -0600
     3.3 @@ -0,0 +1,254 @@
     3.4 +#include <linux/config.h>
     3.5 +
     3.6 +#include <asm/cache.h>
     3.7 +
     3.8 +#include "entry.h"
     3.9 +
    3.10 +/*
    3.11 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
    3.12 + * on interrupts.
    3.13 + *
    3.14 + *  On entry:
    3.15 + *	r1:	pointer to current task (ar.k6)
    3.16 + */
    3.17 +#define MINSTATE_START_SAVE_MIN_VIRT								\
    3.18 +(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
    3.19 +	;;											\
    3.20 +(pUStk)	mov.m r24=ar.rnat;									\
    3.21 +(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
    3.22 +(pKStk) mov r1=sp;					/* get sp  */				\
    3.23 +	;;											\
    3.24 +(pUStk) lfetch.fault.excl.nt1 [r22];								\
    3.25 +(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
    3.26 +(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
    3.27 +	;;											\
    3.28 +(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
    3.29 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
    3.30 +	;;											\
    3.31 +(pUStk)	mov r18=ar.bsp;										\
    3.32 +(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
    3.33 +
    3.34 +#define MINSTATE_END_SAVE_MIN_VIRT								\
    3.35 +	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
    3.36 +	;;
    3.37 +
    3.38 +/*
    3.39 + * For mca_asm.S we want to access the stack physically since the state is saved before we
    3.40 + * go virtual and don't want to destroy the iip or ipsr.
    3.41 + */
    3.42 +#define MINSTATE_START_SAVE_MIN_PHYS								\
    3.43 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;								\
    3.44 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;							\
    3.45 +(pKStk) ld8 r3 = [r3];;										\
    3.46 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;						\
    3.47 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;						\
    3.48 +(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
    3.49 +(pUStk)	addl r22=IA64_RBS_OFFSET,r1;		/* compute base of register backing store */	\
    3.50 +	;;											\
    3.51 +(pUStk)	mov r24=ar.rnat;									\
    3.52 +(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
    3.53 +(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
    3.54 +(pUStk)	dep r22=-1,r22,61,3;			/* compute kernel virtual addr of RBS */	\
    3.55 +	;;											\
    3.56 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
    3.57 +(pUStk)	mov ar.bspstore=r22;			/* switch to kernel RBS */			\
    3.58 +	;;											\
    3.59 +(pUStk)	mov r18=ar.bsp;										\
    3.60 +(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
    3.61 +
    3.62 +#define MINSTATE_END_SAVE_MIN_PHYS								\
    3.63 +	dep r12=-1,r12,61,3;		/* make sp a kernel virtual address */			\
    3.64 +	;;
    3.65 +
    3.66 +#ifdef MINSTATE_VIRT
    3.67 +# define MINSTATE_GET_CURRENT(reg)	\
    3.68 +		movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
    3.69 +		ld8 reg=[reg]
    3.70 +# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_VIRT
    3.71 +# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_VIRT
    3.72 +#endif
    3.73 +
    3.74 +#ifdef MINSTATE_PHYS
    3.75 +# define MINSTATE_GET_CURRENT(reg)	mov reg=IA64_KR(CURRENT);; tpa reg=reg
    3.76 +# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_PHYS
    3.77 +# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_PHYS
    3.78 +#endif
    3.79 +
    3.80 +/*
    3.81 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
    3.82 + * the minimum state necessary that allows us to turn psr.ic back
    3.83 + * on.
    3.84 + *
    3.85 + * Assumed state upon entry:
    3.86 + *	psr.ic: off
    3.87 + *	r31:	contains saved predicates (pr)
    3.88 + *
    3.89 + * Upon exit, the state is as follows:
    3.90 + *	psr.ic: off
    3.91 + *	 r2 = points to &pt_regs.r16
    3.92 + *	 r8 = contents of ar.ccv
    3.93 + *	 r9 = contents of ar.csd
    3.94 + *	r10 = contents of ar.ssd
    3.95 + *	r11 = FPSR_DEFAULT
    3.96 + *	r12 = kernel sp (kernel virtual address)
    3.97 + *	r13 = points to current task_struct (kernel virtual address)
    3.98 + *	p15 = TRUE if psr.i is set in cr.ipsr
    3.99 + *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
   3.100 + *		preserved
   3.101 + *
   3.102 + * Note that psr.ic is NOT turned on by this macro.  This is so that
   3.103 + * we can pass interruption state as arguments to a handler.
   3.104 + */
   3.105 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
   3.106 +	MINSTATE_GET_CURRENT(r16);	/* M (or M;;I) */					\
   3.107 +	mov r27=ar.rsc;			/* M */							\
   3.108 +	mov r20=r1;			/* A */							\
   3.109 +	mov r25=ar.unat;		/* M */							\
   3.110 +	mov r29=cr.ipsr;		/* M */							\
   3.111 +	mov r26=ar.pfs;			/* I */							\
   3.112 +	mov r28=cr.iip;			/* M */							\
   3.113 +	mov r21=ar.fpsr;		/* M */							\
   3.114 +	COVER;				/* B;; (or nothing) */					\
   3.115 +	;;											\
   3.116 +	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
   3.117 +	;;											\
   3.118 +	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
   3.119 +	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
   3.120 +	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
   3.121 +	/* switch from user to kernel RBS: */							\
   3.122 +	;;											\
   3.123 +	invala;				/* M */							\
   3.124 +	SAVE_IFS;										\
   3.125 +	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
   3.126 +	;;											\
   3.127 +	MINSTATE_START_SAVE_MIN									\
   3.128 +	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
   3.129 +	adds r16=PT(CR_IPSR),r1;								\
   3.130 +	;;											\
   3.131 +	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
   3.132 +	st8 [r16]=r29;		/* save cr.ipsr */						\
   3.133 +	;;											\
   3.134 +	lfetch.fault.excl.nt1 [r17];								\
   3.135 +	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
   3.136 +	mov r29=b0										\
   3.137 +	;;											\
   3.138 +	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
   3.139 +	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
   3.140 +(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
   3.141 +	;;											\
   3.142 +.mem.offset 0,0; st8.spill [r16]=r8,16;								\
   3.143 +.mem.offset 8,0; st8.spill [r17]=r9,16;								\
   3.144 +        ;;											\
   3.145 +.mem.offset 0,0; st8.spill [r16]=r10,24;							\
   3.146 +.mem.offset 8,0; st8.spill [r17]=r11,24;							\
   3.147 +        ;;											\
   3.148 +	st8 [r16]=r28,16;	/* save cr.iip */						\
   3.149 +	st8 [r17]=r30,16;	/* save cr.ifs */						\
   3.150 +(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
   3.151 +	mov r8=ar.ccv;										\
   3.152 +	mov r9=ar.csd;										\
   3.153 +	mov r10=ar.ssd;										\
   3.154 +	movl r11=FPSR_DEFAULT;   /* L-unit */							\
   3.155 +	;;											\
   3.156 +	st8 [r16]=r25,16;	/* save ar.unat */						\
   3.157 +	st8 [r17]=r26,16;	/* save ar.pfs */						\
   3.158 +	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
   3.159 +	;;											\
   3.160 +	st8 [r16]=r27,16;	/* save ar.rsc */						\
   3.161 +(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
   3.162 +(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
   3.163 +	;;			/* avoid RAW on r16 & r17 */					\
   3.164 +(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
   3.165 +	st8 [r17]=r31,16;	/* save predicates */						\
   3.166 +(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
   3.167 +	;;											\
   3.168 +	st8 [r16]=r29,16;	/* save b0 */							\
   3.169 +	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
   3.170 +	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
   3.171 +	;;											\
   3.172 +.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
   3.173 +.mem.offset 8,0; st8.spill [r17]=r12,16;							\
   3.174 +	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
   3.175 +	;;											\
   3.176 +.mem.offset 0,0; st8.spill [r16]=r13,16;							\
   3.177 +.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
   3.178 +	movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;					\
   3.179 +	ld8 r13=[r13];			/* establish 'current' */				\
   3.180 +	;;											\
   3.181 +.mem.offset 0,0; st8.spill [r16]=r15,16;							\
   3.182 +.mem.offset 8,0; st8.spill [r17]=r14,16;							\
   3.183 +	;;											\
   3.184 +.mem.offset 0,0; st8.spill [r16]=r2,16;								\
   3.185 +.mem.offset 8,0; st8.spill [r17]=r3,16;								\
   3.186 +	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
   3.187 +	;;											\
   3.188 +	EXTRA;											\
   3.189 +	movl r1=__gp;		/* establish kernel global pointer */				\
   3.190 +	;;											\
   3.191 +	MINSTATE_END_SAVE_MIN
   3.192 +
   3.193 +/*
   3.194 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
   3.195 + *
   3.196 + * Assumed state upon entry:
   3.197 + *	psr.ic: on
   3.198 + *	r2:	points to &pt_regs.r16
   3.199 + *	r3:	points to &pt_regs.r17
   3.200 + *	r8:	contents of ar.ccv
   3.201 + *	r9:	contents of ar.csd
   3.202 + *	r10:	contents of ar.ssd
   3.203 + *	r11:	FPSR_DEFAULT
   3.204 + *
   3.205 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
   3.206 + */
   3.207 +#define SAVE_REST				\
   3.208 +.mem.offset 0,0; st8.spill [r2]=r16,16;		\
   3.209 +.mem.offset 8,0; st8.spill [r3]=r17,16;		\
   3.210 +	;;					\
   3.211 +.mem.offset 0,0; st8.spill [r2]=r18,16;		\
   3.212 +.mem.offset 8,0; st8.spill [r3]=r19,16;		\
   3.213 +	;;					\
   3.214 +.mem.offset 0,0; st8.spill [r2]=r20,16;		\
   3.215 +.mem.offset 8,0; st8.spill [r3]=r21,16;		\
   3.216 +	mov r18=b6;				\
   3.217 +	;;					\
   3.218 +.mem.offset 0,0; st8.spill [r2]=r22,16;		\
   3.219 +.mem.offset 8,0; st8.spill [r3]=r23,16;		\
   3.220 +	mov r19=b7;				\
   3.221 +	;;					\
   3.222 +.mem.offset 0,0; st8.spill [r2]=r24,16;		\
   3.223 +.mem.offset 8,0; st8.spill [r3]=r25,16;		\
   3.224 +	;;					\
   3.225 +.mem.offset 0,0; st8.spill [r2]=r26,16;		\
   3.226 +.mem.offset 8,0; st8.spill [r3]=r27,16;		\
   3.227 +	;;					\
   3.228 +.mem.offset 0,0; st8.spill [r2]=r28,16;		\
   3.229 +.mem.offset 8,0; st8.spill [r3]=r29,16;		\
   3.230 +	;;					\
   3.231 +.mem.offset 0,0; st8.spill [r2]=r30,16;		\
   3.232 +.mem.offset 8,0; st8.spill [r3]=r31,32;		\
   3.233 +	;;					\
   3.234 +	mov ar.fpsr=r11;	/* M-unit */	\
   3.235 +	st8 [r2]=r8,8;		/* ar.ccv */	\
   3.236 +	adds r24=PT(B6)-PT(F7),r3;		\
   3.237 +	;;					\
   3.238 +	stf.spill [r2]=f6,32;			\
   3.239 +	stf.spill [r3]=f7,32;			\
   3.240 +	;;					\
   3.241 +	stf.spill [r2]=f8,32;			\
   3.242 +	stf.spill [r3]=f9,32;			\
   3.243 +	;;					\
   3.244 +	stf.spill [r2]=f10;			\
   3.245 +	stf.spill [r3]=f11;			\
   3.246 +	adds r25=PT(B7)-PT(F11),r3;		\
   3.247 +	;;					\
   3.248 +	st8 [r24]=r18,16;       /* b6 */	\
   3.249 +	st8 [r25]=r19,16;       /* b7 */	\
   3.250 +	;;					\
   3.251 +	st8 [r24]=r9;        	/* ar.csd */	\
   3.252 +	st8 [r25]=r10;      	/* ar.ssd */	\
   3.253 +	;;
   3.254 +
   3.255 +#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(cover, mov r30=cr.ifs,)
   3.256 +#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
   3.257 +#define SAVE_MIN		DO_SAVE_MIN(     , mov r30=r0, )
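
The first decision DO_SAVE_MIN makes is whether a stack switch is needed: r17
is loaded from current->thread.on_ustack, and cmp.eq pKStk,pUStk=r0,r17 sets
exactly one of the two predicates that gate the rest of the macro.  A hedged C
paraphrase of that split (the function and its parameters are illustrative,
not part of the changeset):

    /* Hedged C paraphrase of the pKStk/pUStk split in DO_SAVE_MIN. */
    static unsigned long pt_regs_base(unsigned long sp, unsigned long task,
                                      unsigned long stk_offset,
                                      unsigned long pt_regs_size,
                                      int on_ustack)
    {
            if (!on_ustack)                 /* pKStk: already on kernel stack */
                    return sp - pt_regs_size;       /* use sp (r12) directly */
            /* pUStk: switch ar.bspstore to the kernel RBS and carve pt_regs
             * off the top of the task's kernel memory stack (r1 = task) */
            return task + stk_offset - pt_regs_size;
    }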
     4.1 --- a/xen/arch/ia64/linux-xen/setup.c	Tue Aug 30 17:51:51 2005 -0600
     4.2 +++ b/xen/arch/ia64/linux-xen/setup.c	Wed Aug 31 14:32:27 2005 -0600
     4.3 @@ -4,10 +4,15 @@
     4.4   * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
     4.5   *	David Mosberger-Tang <davidm@hpl.hp.com>
     4.6   *	Stephane Eranian <eranian@hpl.hp.com>
     4.7 - * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
     4.8 + * Copyright (C) 2000, 2004 Intel Corp
     4.9 + * 	Rohit Seth <rohit.seth@intel.com>
    4.10 + * 	Suresh Siddha <suresh.b.siddha@intel.com>
    4.11 + * 	Gordon Jin <gordon.jin@intel.com>
    4.12   * Copyright (C) 1999 VA Linux Systems
    4.13   * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
    4.14   *
    4.15 + * 12/26/04 S.Siddha, G.Jin, R.Seth
    4.16 + *			Add multi-threading and multi-core detection
    4.17   * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
    4.18   * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
    4.19   * 03/31/00 R.Seth	cpu_initialized and current->processor fixes
    4.20 @@ -15,6 +20,7 @@
    4.21   * 02/01/00 R.Seth	fixed get_cpuinfo for SMP
    4.22   * 01/07/99 S.Eranian	added the support for command line argument
    4.23   * 06/24/99 W.Drummond	added boot_cpu_data.
    4.24 + * 05/28/05 Z. Menyhart	Dynamic stride size for "flush_icache_range()"
    4.25   */
    4.26  #include <linux/config.h>
    4.27  #include <linux/module.h>
    4.28 @@ -35,6 +41,10 @@
    4.29  #include <linux/serial_core.h>
    4.30  #include <linux/efi.h>
    4.31  #include <linux/initrd.h>
    4.32 +#ifndef XEN
    4.33 +#include <linux/platform.h>
    4.34 +#include <linux/pm.h>
    4.35 +#endif
    4.36  
    4.37  #include <asm/ia32.h>
    4.38  #include <asm/machvec.h>
    4.39 @@ -51,8 +61,10 @@
    4.40  #include <asm/smp.h>
    4.41  #include <asm/system.h>
    4.42  #include <asm/unistd.h>
    4.43 +#ifdef XEN
    4.44  #include <asm/vmx.h>
    4.45  #include <asm/io.h>
    4.46 +#endif
    4.47  
    4.48  #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
    4.49  # error "struct cpuinfo_ia64 too big!"
    4.50 @@ -64,12 +76,16 @@ EXPORT_SYMBOL(__per_cpu_offset);
    4.51  #endif
    4.52  
    4.53  DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
    4.54 +#ifdef XEN
    4.55  DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr);
    4.56 +#endif
    4.57  DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
    4.58  DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
    4.59  unsigned long ia64_cycles_per_usec;
    4.60  struct ia64_boot_param *ia64_boot_param;
    4.61  struct screen_info screen_info;
    4.62 +unsigned long vga_console_iobase;
    4.63 +unsigned long vga_console_membase;
    4.64  
    4.65  unsigned long ia64_max_cacheline_size;
    4.66  unsigned long ia64_iobase;	/* virtual address for I/O accesses */
    4.67 @@ -78,7 +94,12 @@ struct io_space io_space[MAX_IO_SPACES];
    4.68  EXPORT_SYMBOL(io_space);
    4.69  unsigned int num_io_spaces;
    4.70  
    4.71 -unsigned char aux_device_present = 0xaa;        /* XXX remove this when legacy I/O is gone */
    4.72 +/*
    4.73 + * "flush_icache_range()" needs to know what processor dependent stride size to use
    4.74 + * when it makes i-cache(s) coherent with d-caches.
    4.75 + */
    4.76 +#define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
    4.77 +unsigned long ia64_i_cache_stride_shift = ~0;
    4.78  
    4.79  /*
    4.80   * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
    4.81 @@ -287,23 +308,25 @@ io_port_init (void)
    4.82  static inline int __init
    4.83  early_console_setup (char *cmdline)
    4.84  {
    4.85 +	int earlycons = 0;
    4.86 +
    4.87  #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
    4.88  	{
    4.89  		extern int sn_serial_console_early_setup(void);
    4.90  		if (!sn_serial_console_early_setup())
    4.91 -			return 0;
    4.92 +			earlycons++;
    4.93  	}
    4.94  #endif
    4.95  #ifdef CONFIG_EFI_PCDP
    4.96  	if (!efi_setup_pcdp_console(cmdline))
    4.97 -		return 0;
    4.98 +		earlycons++;
    4.99  #endif
   4.100  #ifdef CONFIG_SERIAL_8250_CONSOLE
   4.101  	if (!early_serial_console_init(cmdline))
   4.102 -		return 0;
   4.103 +		earlycons++;
   4.104  #endif
   4.105  
   4.106 -	return -1;
   4.107 +	return (earlycons) ? 0 : -1;
   4.108  }
   4.109  
   4.110  static inline void
   4.111 @@ -315,7 +338,34 @@ mark_bsp_online (void)
   4.112  #endif
   4.113  }
   4.114  
   4.115 -void __init
   4.116 +#ifdef CONFIG_SMP
   4.117 +static void
   4.118 +check_for_logical_procs (void)
   4.119 +{
   4.120 +	pal_logical_to_physical_t info;
   4.121 +	s64 status;
   4.122 +
   4.123 +	status = ia64_pal_logical_to_phys(0, &info);
   4.124 +	if (status == -1) {
   4.125 +		printk(KERN_INFO "No logical to physical processor mapping "
   4.126 +		       "available\n");
   4.127 +		return;
   4.128 +	}
   4.129 +	if (status) {
   4.130 +		printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
   4.131 +		       status);
   4.132 +		return;
   4.133 +	}
   4.134 +	/*
   4.135 +	 * Total number of siblings that BSP has.  Though not all of them 
   4.136 +	 * may have booted successfully. The correct number of siblings 
   4.137 +	 * booted is in info.overview_num_log.
   4.138 +	 */
   4.139 +	smp_num_siblings = info.overview_tpc;
   4.140 +	smp_num_cpucores = info.overview_cpp;
   4.141 +}
   4.142 +#endif
   4.143 +
   4.144  #ifdef XEN
   4.145  early_setup_arch (char **cmdline_p)
   4.146  #else
   4.147 @@ -398,6 +448,19 @@ late_setup_arch (char **cmdline_p)
   4.148  
   4.149  #ifdef CONFIG_SMP
   4.150  	cpu_physical_id(0) = hard_smp_processor_id();
   4.151 +
   4.152 +	cpu_set(0, cpu_sibling_map[0]);
   4.153 +	cpu_set(0, cpu_core_map[0]);
   4.154 +
   4.155 +	check_for_logical_procs();
   4.156 +	if (smp_num_cpucores > 1)
   4.157 +		printk(KERN_INFO
   4.158 +		       "cpu package is Multi-Core capable: number of cores=%d\n",
   4.159 +		       smp_num_cpucores);
   4.160 +	if (smp_num_siblings > 1)
   4.161 +		printk(KERN_INFO
   4.162 +		       "cpu package is Multi-Threading capable: number of siblings=%d\n",
   4.163 +		       smp_num_siblings);
   4.164  #endif
   4.165  
   4.166  #ifdef XEN
   4.167 @@ -505,12 +568,23 @@ show_cpuinfo (struct seq_file *m, void *
   4.168  		   "cpu regs   : %u\n"
   4.169  		   "cpu MHz    : %lu.%06lu\n"
   4.170  		   "itc MHz    : %lu.%06lu\n"
   4.171 -		   "BogoMIPS   : %lu.%02lu\n\n",
   4.172 +		   "BogoMIPS   : %lu.%02lu\n",
   4.173  		   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
   4.174  		   features, c->ppn, c->number,
   4.175  		   c->proc_freq / 1000000, c->proc_freq % 1000000,
   4.176  		   c->itc_freq / 1000000, c->itc_freq % 1000000,
   4.177  		   lpj*HZ/500000, (lpj*HZ/5000) % 100);
   4.178 +#ifdef CONFIG_SMP
   4.179 +	seq_printf(m, "siblings   : %u\n", c->num_log);
   4.180 +	if (c->threads_per_core > 1 || c->cores_per_socket > 1)
   4.181 +		seq_printf(m,
   4.182 +		   	   "physical id: %u\n"
   4.183 +		   	   "core id    : %u\n"
   4.184 +		   	   "thread id  : %u\n",
   4.185 +		   	   c->socket_id, c->core_id, c->thread_id);
   4.186 +#endif
   4.187 +	seq_printf(m,"\n");
   4.188 +
   4.189  	return 0;
   4.190  }
   4.191  
   4.192 @@ -581,6 +655,14 @@ identify_cpu (struct cpuinfo_ia64 *c)
   4.193  	memcpy(c->vendor, cpuid.field.vendor, 16);
   4.194  #ifdef CONFIG_SMP
   4.195  	c->cpu = smp_processor_id();
   4.196 +
   4.197 +	/* below default values will be overwritten  by identify_siblings() 
   4.198 +	 * for Multi-Threading/Multi-Core capable cpu's
   4.199 +	 */
   4.200 +	c->threads_per_core = c->cores_per_socket = c->num_log = 1;
   4.201 +	c->socket_id = -1;
   4.202 +
   4.203 +	identify_siblings(c);
   4.204  #endif
   4.205  	c->ppn = cpuid.field.ppn;
   4.206  	c->number = cpuid.field.number;
   4.207 @@ -611,6 +693,12 @@ setup_per_cpu_areas (void)
   4.208  	/* start_kernel() requires this... */
   4.209  }
   4.210  
   4.211 +/*
   4.212 + * Calculate the max. cache line size.
   4.213 + *
   4.214 + * In addition, the minimum of the i-cache stride sizes is calculated for
   4.215 + * "flush_icache_range()".
   4.216 + */
   4.217  static void
   4.218  get_max_cacheline_size (void)
   4.219  {
   4.220 @@ -624,6 +712,8 @@ get_max_cacheline_size (void)
   4.221                  printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
   4.222                         __FUNCTION__, status);
   4.223                  max = SMP_CACHE_BYTES;
   4.224 +		/* Safest setup for "flush_icache_range()" */
   4.225 +		ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
   4.226  		goto out;
   4.227          }
   4.228  
   4.229 @@ -632,14 +722,31 @@ get_max_cacheline_size (void)
   4.230  						    &cci);
   4.231  		if (status != 0) {
   4.232  			printk(KERN_ERR
   4.233 -			       "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n",
   4.234 +			       "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
   4.235  			       __FUNCTION__, l, status);
   4.236  			max = SMP_CACHE_BYTES;
   4.237 +			/* The safest setup for "flush_icache_range()" */
   4.238 +			cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
   4.239 +			cci.pcci_unified = 1;
   4.240  		}
   4.241  		line_size = 1 << cci.pcci_line_size;
   4.242  		if (line_size > max)
   4.243  			max = line_size;
   4.244 -        }
   4.245 +		if (!cci.pcci_unified) {
   4.246 +			status = ia64_pal_cache_config_info(l,
   4.247 +						    /* cache_type (instruction)= */ 1,
   4.248 +						    &cci);
   4.249 +			if (status != 0) {
   4.250 +				printk(KERN_ERR
   4.251 +				"%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
   4.252 +					__FUNCTION__, l, status);
   4.253 +				/* The safest setup for "flush_icache_range()" */
   4.254 +				cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
   4.255 +			}
   4.256 +		}
   4.257 +		if (cci.pcci_stride < ia64_i_cache_stride_shift)
   4.258 +			ia64_i_cache_stride_shift = cci.pcci_stride;
   4.259 +	}
   4.260    out:
   4.261  	if (max > ia64_max_cacheline_size)
   4.262  		ia64_max_cacheline_size = max;
   4.263 @@ -700,7 +807,17 @@ cpu_init (void)
   4.264  	ia64_set_kr(IA64_KR_FPU_OWNER, 0);
   4.265  
   4.266  	/*
   4.267 -	 * Initialize default control register to defer all speculative faults.  The
   4.268 +	 * Initialize the page-table base register to a global
   4.269 +	 * directory with all zeroes.  This ensure that we can handle
   4.270 +	 * TLB-misses to user address-space even before we created the
   4.271 +	 * first user address-space.  This may happen, e.g., due to
   4.272 +	 * aggressive use of lfetch.fault.
   4.273 +	 */
   4.274 +	ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
   4.275 +
   4.276 +	/*
   4.277 +	 * Initialize default control register to defer speculative faults except
   4.278 +	 * for those arising from TLB misses, which are not deferred.  The
   4.279  	 * kernel MUST NOT depend on a particular setting of these bits (in other words,
   4.280  	 * the kernel must have recovery code for all speculative accesses).  Turn on
   4.281  	 * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
   4.282 @@ -762,6 +879,9 @@ cpu_init (void)
   4.283  	/* size of physical stacked register partition plus 8 bytes: */
   4.284  	__get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
   4.285  	platform_cpu_init();
   4.286 +#ifndef XEN
   4.287 +	pm_idle = default_idle;
   4.288 +#endif
   4.289  }
   4.290  
   4.291  void
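
Two of the setup.c changes deserve a note.  First, early_console_setup() now
tries every available early console and counts successes instead of returning
at the first hit, so platforms with several candidates get them all set up.
Second, get_max_cacheline_size() now also tracks the minimum i-cache stride
shift across all cache levels (falling back to I_CACHE_STRIDE_SHIFT, i.e. 32
bytes, whenever a PAL call fails) so that "flush_icache_range()" can advance
by exactly one stride per fc instruction.  A hedged sketch of the consumer
side; the real routine is the assembly in linux/lib/flush.S, and ia64_fc()
stands in for the fc instruction:

    /* Hedged C sketch of how flush_icache_range() would use the stride
     * computed in get_max_cacheline_size() above. */
    extern unsigned long ia64_i_cache_stride_shift;

    static void flush_icache_range_sketch(unsigned long start, unsigned long end)
    {
            unsigned long stride = 1UL << ia64_i_cache_stride_shift;
            unsigned long addr = start & ~(stride - 1);

            for (; addr < end; addr += stride)
                    ia64_fc((void *) addr);         /* flush one i-cache line */
            /* the assembly version follows up with sync.i and srlz.i */
    }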
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/xen/arch/ia64/linux-xen/sort.c	Wed Aug 31 14:32:27 2005 -0600
     5.3 @@ -0,0 +1,122 @@
     5.4 +/*
     5.5 + * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
     5.6 + *
     5.7 + * Jan 23 2005  Matt Mackall <mpm@selenic.com>
     5.8 + */
     5.9 +
    5.10 +#include <linux/kernel.h>
    5.11 +#include <linux/module.h>
    5.12 +#ifdef XEN
    5.13 +#include <linux/types.h>
    5.14 +#endif
    5.15 +
    5.16 +void u32_swap(void *a, void *b, int size)
    5.17 +{
    5.18 +	u32 t = *(u32 *)a;
    5.19 +	*(u32 *)a = *(u32 *)b;
    5.20 +	*(u32 *)b = t;
    5.21 +}
    5.22 +
    5.23 +void generic_swap(void *a, void *b, int size)
    5.24 +{
    5.25 +	char t;
    5.26 +
    5.27 +	do {
    5.28 +		t = *(char *)a;
    5.29 +		*(char *)a++ = *(char *)b;
    5.30 +		*(char *)b++ = t;
    5.31 +	} while (--size > 0);
    5.32 +}
    5.33 +
    5.34 +/*
    5.35 + * sort - sort an array of elements
    5.36 + * @base: pointer to data to sort
    5.37 + * @num: number of elements
    5.38 + * @size: size of each element
    5.39 + * @cmp: pointer to comparison function
    5.40 + * @swap: pointer to swap function or NULL
    5.41 + *
    5.42 + * This function does a heapsort on the given array. You may provide a
    5.43 + * swap function optimized to your element type.
    5.44 + *
    5.45 + * Sorting time is O(n log n) both on average and worst-case. While
    5.46 + * qsort is about 20% faster on average, it suffers from exploitable
    5.47 + * O(n*n) worst-case behavior and extra memory requirements that make
    5.48 + * it less suitable for kernel use.
    5.49 + */
    5.50 +
    5.51 +void sort(void *base, size_t num, size_t size,
    5.52 +	  int (*cmp)(const void *, const void *),
    5.53 +	  void (*swap)(void *, void *, int size))
    5.54 +{
    5.55 +	/* pre-scale counters for performance */
    5.56 +	int i = (num/2) * size, n = num * size, c, r;
    5.57 +
    5.58 +	if (!swap)
    5.59 +		swap = (size == 4 ? u32_swap : generic_swap);
    5.60 +
    5.61 +	/* heapify */
    5.62 +	for ( ; i >= 0; i -= size) {
    5.63 +		for (r = i; r * 2 < n; r  = c) {
    5.64 +			c = r * 2;
    5.65 +			if (c < n - size && cmp(base + c, base + c + size) < 0)
    5.66 +				c += size;
    5.67 +			if (cmp(base + r, base + c) >= 0)
    5.68 +				break;
    5.69 +			swap(base + r, base + c, size);
    5.70 +		}
    5.71 +	}
    5.72 +
    5.73 +	/* sort */
    5.74 +	for (i = n - size; i >= 0; i -= size) {
    5.75 +		swap(base, base + i, size);
    5.76 +		for (r = 0; r * 2 < i; r = c) {
    5.77 +			c = r * 2;
    5.78 +			if (c < i - size && cmp(base + c, base + c + size) < 0)
    5.79 +				c += size;
    5.80 +			if (cmp(base + r, base + c) >= 0)
    5.81 +				break;
    5.82 +			swap(base + r, base + c, size);
    5.83 +		}
    5.84 +	}
    5.85 +}
    5.86 +
    5.87 +EXPORT_SYMBOL(sort);
    5.88 +
    5.89 +#if 0
    5.90 +/* a simple boot-time regression test */
    5.91 +
    5.92 +int cmpint(const void *a, const void *b)
    5.93 +{
    5.94 +	return *(int *)a - *(int *)b;
    5.95 +}
    5.96 +
    5.97 +static int sort_test(void)
    5.98 +{
    5.99 +	int *a, i, r = 1;
   5.100 +
   5.101 +	a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
   5.102 +	BUG_ON(!a);
   5.103 +
   5.104 +	printk("testing sort()\n");
   5.105 +
   5.106 +	for (i = 0; i < 1000; i++) {
   5.107 +		r = (r * 725861) % 6599;
   5.108 +		a[i] = r;
   5.109 +	}
   5.110 +
   5.111 +	sort(a, 1000, sizeof(int), cmpint, NULL);
   5.112 +
   5.113 +	for (i = 0; i < 999; i++)
   5.114 +		if (a[i] > a[i+1]) {
   5.115 +			printk("sort() failed!\n");
   5.116 +			break;
   5.117 +		}
   5.118 +
   5.119 +	kfree(a);
   5.120 +
   5.121 +	return 0;
   5.122 +}
   5.123 +
   5.124 +module_init(sort_test);
   5.125 +#endif
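
Note how sort() picks its swap routine: callers may pass NULL, and the
word-wise u32_swap is substituted only when size == 4, with the byte-wise
generic_swap as the fallback.  A hedged usage sketch with a record type (the
struct and comparator are illustrative, not code from this changeset):

    /* Illustrative caller: size != 4 here, so passing NULL for swap
     * selects generic_swap. */
    struct range {
            unsigned long start, end;
    };

    static int cmp_range(const void *a, const void *b)
    {
            const struct range *l = a, *r = b;

            if (l->start < r->start)
                    return -1;
            return l->start > r->start;
    }

    /* sort(ranges, nr, sizeof(struct range), cmp_range, NULL); */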
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/xen/arch/ia64/linux/README.origin	Wed Aug 31 14:32:27 2005 -0600
     6.3 @@ -0,0 +1,20 @@
     6.4 +Source files in this directory are identical copies of linux-2.6.13 files:
     6.5 +
     6.6 +cmdline.c		-> linux/lib/cmdline.c
     6.7 +efi_stub.S		-> linux/arch/ia64/efi_stub.S
     6.8 +extable.c		-> linux/arch/ia64/mm/extable.c
     6.9 +hpsim.S			-> linux/arch/ia64/hp/sim/hpsim.S
    6.10 +ia64_ksyms.c		-> linux/arch/ia64/kernel/ia64_ksyms.c
    6.11 +linuxextable.c		-> linux/kernel/extable.c
    6.12 +machvec.c		-> linux/arch/ia64/kernel/machvec.c
    6.13 +patch.c			-> linux/arch/ia64/kernel/patch.c
    6.14 +pcdp.h			-> drivers/firmware/pcdp.h
    6.15 +lib/bitop.c		-> linux/arch/ia64/lib/bitop.c
    6.16 +lib/clear_page.S	-> linux/arch/ia64/lib/clear_page.S
    6.17 +lib/copy_page_mck.S	-> linux/arch/ia64/lib/copy_page_mck.S
    6.18 +lib/flush.S		-> linux/arch/ia64/lib/flush.S
    6.19 +lib/idiv32.S		-> linux/arch/ia64/lib/idiv32.S
    6.20 +lib/idiv64.S		-> linux/arch/ia64/lib/idiv64.S
    6.21 +lib/memcpy_mck.S	-> linux/arch/ia64/lib/memcpy_mck.S
    6.22 +lib/memset.S		-> linux/arch/ia64/lib/memset.S
    6.23 +lib/strlen.S		-> linux/arch/ia64/lib/strlen.S
     7.1 --- a/xen/arch/ia64/linux/extable.c	Tue Aug 30 17:51:51 2005 -0600
     7.2 +++ b/xen/arch/ia64/linux/extable.c	Wed Aug 31 14:32:27 2005 -0600
     7.3 @@ -6,29 +6,29 @@
     7.4   */
     7.5  
     7.6  #include <linux/config.h>
     7.7 +#include <linux/sort.h>
     7.8  
     7.9  #include <asm/uaccess.h>
    7.10  #include <asm/module.h>
    7.11  
    7.12 -static inline int
    7.13 -compare_entries (struct exception_table_entry *l, struct exception_table_entry *r)
    7.14 +static int cmp_ex(const void *a, const void *b)
    7.15  {
    7.16 +	const struct exception_table_entry *l = a, *r = b;
    7.17  	u64 lip = (u64) &l->addr + l->addr;
    7.18  	u64 rip = (u64) &r->addr + r->addr;
    7.19  
    7.20 +	/* avoid overflow */
    7.21 +	if (lip > rip)
    7.22 +		return 1;
    7.23  	if (lip < rip)
    7.24  		return -1;
    7.25 -	if (lip == rip)
    7.26 -		return 0;
    7.27 -	else
    7.28 -		return 1;
    7.29 +	return 0;
    7.30  }
    7.31  
    7.32 -static inline void
    7.33 -swap_entries (struct exception_table_entry *l, struct exception_table_entry *r)
    7.34 +static void swap_ex(void *a, void *b, int size)
    7.35  {
    7.36 +	struct exception_table_entry *l = a, *r = b, tmp;
    7.37  	u64 delta = (u64) r - (u64) l;
    7.38 -	struct exception_table_entry tmp;
    7.39  
    7.40  	tmp = *l;
    7.41  	l->addr = r->addr + delta;
    7.42 @@ -38,23 +38,20 @@ swap_entries (struct exception_table_ent
    7.43  }
    7.44  
    7.45  /*
    7.46 - * Sort the exception table.  It's usually already sorted, but there may be unordered
    7.47 - * entries due to multiple text sections (such as the .init text section).  Note that the
    7.48 - * exception-table-entries contain location-relative addresses, which requires a bit of
    7.49 - * care during sorting to avoid overflows in the offset members (e.g., it would not be
    7.50 - * safe to make a temporary copy of an exception-table entry on the stack, because the
    7.51 - * stack may be more than 2GB away from the exception-table).
    7.52 + * Sort the exception table. It's usually already sorted, but there
    7.53 + * may be unordered entries due to multiple text sections (such as the
    7.54 + * .init text section). Note that the exception-table-entries contain
    7.55 + * location-relative addresses, which requires a bit of care during
    7.56 + * sorting to avoid overflows in the offset members (e.g., it would
    7.57 + * not be safe to make a temporary copy of an exception-table entry on
    7.58 + * the stack, because the stack may be more than 2GB away from the
    7.59 + * exception-table).
    7.60   */
    7.61 -void
    7.62 -sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish)
    7.63 +void sort_extable (struct exception_table_entry *start,
    7.64 +		   struct exception_table_entry *finish)
    7.65  {
    7.66 -	struct exception_table_entry *p, *q;
    7.67 -
    7.68 - 	/* insertion sort */
    7.69 -	for (p = start + 1; p < finish; ++p)
    7.70 -		/* start .. p-1 is sorted; push p down to it's proper place */
    7.71 -		for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q)
    7.72 -			swap_entries(&q[0], &q[-1]);
    7.73 +	sort(start, finish - start, sizeof(struct exception_table_entry),
    7.74 +	     cmp_ex, swap_ex);
    7.75  }
    7.76  
    7.77  const struct exception_table_entry *
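
The subtlety in this conversion is that ia64 exception-table entries hold
location-relative offsets.  cmp_ex() therefore compares with explicit tests
rather than subtraction (the recovered addresses may differ by more than an
int can hold), and swap_ex() adds the distance between the two slots to every
offset it moves, so the encoded absolute addresses are unchanged.  A hedged
restatement of the invariant (the helper is illustrative; the field names
follow the ia64 struct exception_table_entry):

    /* Invariant preserved by swap_ex(): the absolute fix-up address is
     * the entry's own location plus the stored offset, so an entry moved
     * by delta bytes needs delta added to its offsets.  Illustrative: */
    struct ex_entry_sketch {
            int addr;       /* location-relative IP of the faulting insn */
            int cont;       /* location-relative continuation address */
    };

    static unsigned long ex_ip(const struct ex_entry_sketch *e)
    {
            return (unsigned long) &e->addr + e->addr;
    }

    /* After l->addr = r->addr + ((char *) r - (char *) l), the value of
     * ex_ip(l) equals the old ex_ip(r): the target did not move. */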
     8.1 --- a/xen/arch/ia64/linux/ia64_ksyms.c	Tue Aug 30 17:51:51 2005 -0600
     8.2 +++ b/xen/arch/ia64/linux/ia64_ksyms.c	Wed Aug 31 14:32:27 2005 -0600
     8.3 @@ -58,9 +58,6 @@ EXPORT_SYMBOL(__strlen_user);
     8.4  EXPORT_SYMBOL(__strncpy_from_user);
     8.5  EXPORT_SYMBOL(__strnlen_user);
     8.6  
     8.7 -#include <asm/unistd.h>
     8.8 -EXPORT_SYMBOL(__ia64_syscall);
     8.9 -
    8.10  /* from arch/ia64/lib */
    8.11  extern void __divsi3(void);
    8.12  extern void __udivsi3(void);
     9.1 --- a/xen/arch/ia64/linux/lib/Makefile	Tue Aug 30 17:51:51 2005 -0600
     9.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.3 @@ -1,44 +0,0 @@
     9.4 -#
     9.5 -# Makefile for ia64-specific library routines..
     9.6 -#
     9.7 -
     9.8 -include $(BASEDIR)/Rules.mk
     9.9 -
    9.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
    9.11 -	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
    9.12 -	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
    9.13 -	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
    9.14 -	flush.o ip_fast_csum.o do_csum.o copy_user.o			\
    9.15 -	memset.o strlen.o memcpy.o 
    9.16 -
    9.17 -default: $(OBJS)
    9.18 -	$(LD) -r -o ia64lib.o $(OBJS)
    9.19 -
    9.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
    9.21 -
    9.22 -__divdi3.o: idiv64.S
    9.23 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    9.24 -
    9.25 -__udivdi3.o: idiv64.S
    9.26 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    9.27 -
    9.28 -__moddi3.o: idiv64.S
    9.29 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    9.30 -
    9.31 -__umoddi3.o: idiv64.S
    9.32 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    9.33 -
    9.34 -__divsi3.o: idiv32.S
    9.35 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    9.36 -
    9.37 -__udivsi3.o: idiv32.S
    9.38 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    9.39 -
    9.40 -__modsi3.o: idiv32.S
    9.41 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    9.42 -
    9.43 -__umodsi3.o: idiv32.S
    9.44 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    9.45 -
    9.46 -clean:
    9.47 -	rm -f *.o *~
    10.1 --- a/xen/arch/ia64/linux/lib/carta_random.S	Tue Aug 30 17:51:51 2005 -0600
    10.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.3 @@ -1,54 +0,0 @@
    10.4 -/*
    10.5 - * Fast, simple, yet decent quality random number generator based on
    10.6 - * a paper by David G. Carta ("Two Fast Implementations of the
    10.7 - * `Minimal Standard' Random Number Generator," Communications of the
    10.8 - * ACM, January, 1990).
    10.9 - *
   10.10 - * Copyright (C) 2002 Hewlett-Packard Co
   10.11 - *	David Mosberger-Tang <davidm@hpl.hp.com>
   10.12 - */
   10.13 -
   10.14 -#include <asm/asmmacro.h>
   10.15 -
   10.16 -#define a	r2
   10.17 -#define m	r3
   10.18 -#define lo	r8
   10.19 -#define hi	r9
   10.20 -#define t0	r16
   10.21 -#define t1	r17
   10.22 -#define	seed	r32
   10.23 -
   10.24 -GLOBAL_ENTRY(carta_random32)
   10.25 -	movl	a = (16807 << 16) | 16807
   10.26 -	;;
   10.27 -	pmpyshr2.u t0 = a, seed, 0
   10.28 -	pmpyshr2.u t1 = a, seed, 16
   10.29 -	;;
   10.30 -	unpack2.l t0 = t1, t0
   10.31 -	dep	m = -1, r0, 0, 31
   10.32 -	;;
   10.33 -	zxt4	lo = t0
   10.34 -	shr.u	hi = t0, 32
   10.35 -	;;
   10.36 -	dep	t0 = 0, hi, 15, 49	// t0 = (hi & 0x7fff)
   10.37 -	;;
   10.38 -	shl	t0 = t0, 16		// t0 = (hi & 0x7fff) << 16
   10.39 -	shr	t1 = hi, 15		// t1 = (hi >> 15)
   10.40 -	;;
   10.41 -	add	lo = lo, t0
   10.42 -	;;
   10.43 -	cmp.gtu	p6, p0 = lo, m
   10.44 -	;;
   10.45 -(p6)	and	lo = lo, m
   10.46 -	;;
   10.47 -(p6)	add	lo = 1, lo
   10.48 -	;;
   10.49 -	add	lo = lo, t1
   10.50 -	;;
   10.51 -	cmp.gtu p6, p0 = lo, m
   10.52 -	;;
   10.53 -(p6)	and	lo = lo, m
   10.54 -	;;
   10.55 -(p6)	add	lo = 1, lo
   10.56 -	br.ret.sptk.many rp
   10.57 -END(carta_random32)
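
For the record, the deleted carta_random32 implemented the Park-Miller
"minimal standard" generator, seed' = 16807 * seed mod (2^31 - 1), with
Carta's carry-folding in place of a division: because 2^31 is congruent to 1
modulo 2^31 - 1, the product's high bits can simply be added back into the
low bits.  A hedged C restatement (assumes 0 < seed < 2^31 - 1; the assembly
forms the product with pmpyshr2 on 16-bit halves, transliterated here):

    /* C restatement of the removed carta_random32 (illustrative). */
    static unsigned long carta_random32_sketch(unsigned long seed)
    {
            const unsigned long m = 0x7fffffffUL;           /* 2^31 - 1 */
            unsigned long lo = 16807UL * (seed & 0xffff);
            unsigned long hi = 16807UL * (seed >> 16);

            lo += (hi & 0x7fff) << 16;      /* low 15 bits of hi, shifted home */
            if (lo > m)
                    lo = (lo & m) + 1;      /* fold: 2^31 == 1 (mod m) */
            lo += hi >> 15;                 /* bits of hi at and above 2^31 */
            if (lo > m)
                    lo = (lo & m) + 1;
            return lo;
    }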
    11.1 --- a/xen/arch/ia64/linux/lib/checksum.c	Tue Aug 30 17:51:51 2005 -0600
    11.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.3 @@ -1,102 +0,0 @@
    11.4 -/*
    11.5 - * Network checksum routines
    11.6 - *
    11.7 - * Copyright (C) 1999, 2003 Hewlett-Packard Co
    11.8 - *	Stephane Eranian <eranian@hpl.hp.com>
    11.9 - *
   11.10 - * Most of the code coming from arch/alpha/lib/checksum.c
   11.11 - *
   11.12 - * This file contains network checksum routines that are better done
   11.13 - * in an architecture-specific manner due to speed..
   11.14 - */
   11.15 -
   11.16 -#include <linux/module.h>
   11.17 -#include <linux/string.h>
   11.18 -
   11.19 -#include <asm/byteorder.h>
   11.20 -
   11.21 -static inline unsigned short
   11.22 -from64to16 (unsigned long x)
   11.23 -{
   11.24 -	/* add up 32-bit words for 33 bits */
   11.25 -	x = (x & 0xffffffff) + (x >> 32);
   11.26 -	/* add up 16-bit and 17-bit words for 17+c bits */
   11.27 -	x = (x & 0xffff) + (x >> 16);
   11.28 -	/* add up 16-bit and 2-bit for 16+c bit */
   11.29 -	x = (x & 0xffff) + (x >> 16);
   11.30 -	/* add up carry.. */
   11.31 -	x = (x & 0xffff) + (x >> 16);
   11.32 -	return x;
   11.33 -}
   11.34 -
   11.35 -/*
   11.36 - * computes the checksum of the TCP/UDP pseudo-header
   11.37 - * returns a 16-bit checksum, already complemented.
   11.38 - */
   11.39 -unsigned short int
   11.40 -csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
   11.41 -		   unsigned short proto, unsigned int sum)
   11.42 -{
   11.43 -	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
   11.44 -			   ((unsigned long) proto << 8));
   11.45 -}
   11.46 -
   11.47 -EXPORT_SYMBOL(csum_tcpudp_magic);
   11.48 -
   11.49 -unsigned int
   11.50 -csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
   11.51 -		    unsigned short proto, unsigned int sum)
   11.52 -{
   11.53 -	unsigned long result;
   11.54 -
   11.55 -	result = (saddr + daddr + sum +
   11.56 -		  ((unsigned long) ntohs(len) << 16) +
   11.57 -		  ((unsigned long) proto << 8));
   11.58 -
   11.59 -	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
   11.60 -	/* 64 to 33 */
   11.61 -	result = (result & 0xffffffff) + (result >> 32);
   11.62 -	/* 33 to 32 */
   11.63 -	result = (result & 0xffffffff) + (result >> 32);
   11.64 -	return result;
   11.65 -}
   11.66 -
   11.67 -extern unsigned long do_csum (const unsigned char *, long);
   11.68 -
   11.69 -/*
   11.70 - * computes the checksum of a memory block at buff, length len,
   11.71 - * and adds in "sum" (32-bit)
   11.72 - *
   11.73 - * returns a 32-bit number suitable for feeding into itself
   11.74 - * or csum_tcpudp_magic
   11.75 - *
   11.76 - * this function must be called with even lengths, except
   11.77 - * for the last fragment, which may be odd
   11.78 - *
   11.79 - * it's best to have buff aligned on a 32-bit boundary
   11.80 - */
   11.81 -unsigned int
   11.82 -csum_partial (const unsigned char * buff, int len, unsigned int sum)
   11.83 -{
   11.84 -	unsigned long result = do_csum(buff, len);
   11.85 -
   11.86 -	/* add in old sum, and carry.. */
   11.87 -	result += sum;
   11.88 -	/* 32+c bits -> 32 bits */
   11.89 -	result = (result & 0xffffffff) + (result >> 32);
   11.90 -	return result;
   11.91 -}
   11.92 -
   11.93 -EXPORT_SYMBOL(csum_partial);
   11.94 -
   11.95 -/*
   11.96 - * this routine is used for miscellaneous IP-like checksums, mainly
   11.97 - * in icmp.c
   11.98 - */
   11.99 -unsigned short
  11.100 -ip_compute_csum (unsigned char * buff, int len)
  11.101 -{
  11.102 -	return ~do_csum(buff,len);
  11.103 -}
  11.104 -
  11.105 -EXPORT_SYMBOL(ip_compute_csum);
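
The folds in the removed from64to16() rest on a standard property of the
Internet checksum: since 2^16 is congruent to 1 modulo 0xffff, the step
(x & 0xffff) + (x >> 16) shrinks the accumulator without changing its value
mod 0xffff.  A minimal statement of the property (illustrative, not from the
changeset):

    /* Folding preserves x mod 0xffff because 2^16 == 1 (mod 0xffff). */
    static unsigned short fold16(unsigned long long x)
    {
            while (x >> 16)
                    x = (x & 0xffff) + (x >> 16);
            return (unsigned short) x;      /* congruent to the original x */
    }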
    12.1 --- a/xen/arch/ia64/linux/lib/clear_user.S	Tue Aug 30 17:51:51 2005 -0600
    12.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.3 @@ -1,209 +0,0 @@
    12.4 -/*
    12.5 - * This routine clears to zero a linear memory buffer in user space.
    12.6 - *
    12.7 - * Inputs:
    12.8 - *	in0:	address of buffer
    12.9 - *	in1:	length of buffer in bytes
   12.10 - * Outputs:
   12.11 - *	r8:	number of bytes that didn't get cleared due to a fault
   12.12 - *
   12.13 - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
   12.14 - *	Stephane Eranian <eranian@hpl.hp.com>
   12.15 - */
   12.16 -
   12.17 -#include <asm/asmmacro.h>
   12.18 -
   12.19 -//
   12.20 -// arguments
   12.21 -//
   12.22 -#define buf		r32
   12.23 -#define len		r33
   12.24 -
   12.25 -//
   12.26 -// local registers
   12.27 -//
   12.28 -#define cnt		r16
   12.29 -#define buf2		r17
   12.30 -#define saved_lc	r18
   12.31 -#define saved_pfs	r19
   12.32 -#define tmp		r20
   12.33 -#define len2		r21
   12.34 -#define len3		r22
   12.35 -
   12.36 -//
   12.37 -// Theory of operations:
   12.38 -//	- we check whether or not the buffer is small, i.e., less than 17
   12.39 -//	  in which case we do the byte by byte loop.
   12.40 -//
   12.41 -//	- Otherwise we go progressively from 1 byte store to 8byte store in
   12.42 -//	  the head part, the body is a 16byte store loop and we finish we the
   12.43 -//	  tail for the last 15 bytes.
   12.44 -//	  The good point about this breakdown is that the long buffer handling
   12.45 -//	  contains only 2 branches.
   12.46 -//
   12.47 -//	The reason for not using shifting & masking for both the head and the
   12.48 -//	tail is to stay semantically correct. This routine is not supposed
   12.49 -//	to write bytes outside of the buffer. While most of the time this would
   12.50 -//	be ok, we can't tolerate a mistake. A classical example is the case
   12.51 -//	of multithreaded code were to the extra bytes touched is actually owned
   12.52 -//	by another thread which runs concurrently to ours. Another, less likely,
   12.53 -//	example is with device drivers where reading an I/O mapped location may
   12.54 -//	have side effects (same thing for writing).
   12.55 -//
   12.56 -
   12.57 -GLOBAL_ENTRY(__do_clear_user)
   12.58 -	.prologue
   12.59 -	.save ar.pfs, saved_pfs
   12.60 -	alloc	saved_pfs=ar.pfs,2,0,0,0
   12.61 -	cmp.eq p6,p0=r0,len		// check for zero length
   12.62 -	.save ar.lc, saved_lc
   12.63 -	mov saved_lc=ar.lc		// preserve ar.lc (slow)
   12.64 -	.body
   12.65 -	;;				// avoid WAW on CFM
   12.66 -	adds tmp=-1,len			// br.ctop is repeat/until
   12.67 -	mov ret0=len			// return value is length at this point
   12.68 -(p6)	br.ret.spnt.many rp
   12.69 -	;;
   12.70 -	cmp.lt p6,p0=16,len		// if len > 16 then long memset
   12.71 -	mov ar.lc=tmp			// initialize lc for small count
   12.72 -(p6)	br.cond.dptk .long_do_clear
   12.73 -	;;				// WAR on ar.lc
   12.74 -	//
   12.75 -	// worst case 16 iterations, avg 8 iterations
   12.76 -	//
   12.77 -	// We could have played with the predicates to use the extra
   12.78 -	// M slot for 2 stores/iteration but the cost the initialization
   12.79 -	// the various counters compared to how long the loop is supposed
   12.80 -	// to last on average does not make this solution viable.
   12.81 -	//
   12.82 -1:
   12.83 -	EX( .Lexit1, st1 [buf]=r0,1 )
   12.84 -	adds len=-1,len			// countdown length using len
   12.85 -	br.cloop.dptk 1b
   12.86 -	;;				// avoid RAW on ar.lc
   12.87 -	//
   12.88 -	// .Lexit4: comes from byte by byte loop
   12.89 -	//	    len contains bytes left
   12.90 -.Lexit1:
   12.91 -	mov ret0=len			// faster than using ar.lc
   12.92 -	mov ar.lc=saved_lc
   12.93 -	br.ret.sptk.many rp		// end of short clear_user
   12.94 -
   12.95 -
   12.96 -	//
   12.97 -	// At this point we know we have more than 16 bytes to copy
   12.98 -	// so we focus on alignment (no branches required)
   12.99 -	//
  12.100 -	// The use of len/len2 for countdown of the number of bytes left
  12.101 -	// instead of ret0 is due to the fact that the exception code
  12.102 -	// changes the values of r8.
  12.103 -	//
  12.104 -.long_do_clear:
  12.105 -	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
  12.106 -	;;
  12.107 -	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
  12.108 -(p6)	adds len=-1,len;;		// sync because buf is modified
  12.109 -	tbit.nz p6,p0=buf,1
  12.110 -	;;
  12.111 -	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
  12.112 -(p6)	adds len=-2,len;;
  12.113 -	tbit.nz p6,p0=buf,2
  12.114 -	;;
  12.115 -	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
  12.116 -(p6)	adds len=-4,len;;
  12.117 -	tbit.nz p6,p0=buf,3
  12.118 -	;;
  12.119 -	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
  12.120 -(p6)	adds len=-8,len;;
  12.121 -	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
  12.122 -	;;
  12.123 -	cmp.eq p6,p0=r0,cnt
  12.124 -	adds tmp=-1,cnt
  12.125 -(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
  12.126 -	;;
  12.127 -	adds buf2=8,buf			// setup second base pointer
  12.128 -	mov ar.lc=tmp
  12.129 -	;;
  12.130 -
  12.131 -	//
  12.132 -	// 16bytes/iteration core loop
  12.133 -	//
  12.134 -	// The second store can never generate a fault because
  12.135 -	// we come into the loop only when we are 16-byte aligned.
  12.136 -	// This means that if we cross a page then it will always be
  12.137 -	// in the first store and never in the second.
  12.138 -	//
  12.139 -	//
  12.140 -	// We need to keep track of the remaining length. A possible (optimistic)
  12.141 -	// way would be to use ar.lc and derive how many byte were left by
  12.142 -	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
  12.143 -	// every iteration.
  12.144 -	// However we need to keep the synchronization point. A template
  12.145 -	// M;;MB does not exist and thus we can keep the addition at no
  12.146 -	// extra cycle cost (use a nop slot anyway). It also simplifies the
  12.147 -	// (unlikely)  error recovery code
  12.148 -	//
  12.149 -
  12.150 -2:	EX(.Lexit3, st8 [buf]=r0,16 )
  12.151 -	;;				// needed to get len correct when error
  12.152 -	st8 [buf2]=r0,16
  12.153 -	adds len=-16,len
  12.154 -	br.cloop.dptk 2b
  12.155 -	;;
  12.156 -	mov ar.lc=saved_lc
  12.157 -	//
  12.158 -	// tail correction based on len only
  12.159 -	//
  12.160 -	// We alternate the use of len3,len2 to allow parallelism and correct
  12.161 -	// error handling. We also reuse p6/p7 to return correct value.
  12.162 -	// The addition of len2/len3 does not cost anything more compared to
  12.163 -	// the regular memset as we had empty slots.
  12.164 -	//
  12.165 -.dotail:
  12.166 -	mov len2=len			// for parallelization of error handling
  12.167 -	mov len3=len
  12.168 -	tbit.nz p6,p0=len,3
  12.169 -	;;
  12.170 -	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
  12.171 -(p6)	adds len3=-8,len2
  12.172 -	tbit.nz p7,p6=len,2
  12.173 -	;;
  12.174 -	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
  12.175 -(p7)	adds len2=-4,len3
  12.176 -	tbit.nz p6,p7=len,1
  12.177 -	;;
  12.178 -	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
  12.179 -(p6)	adds len3=-2,len2
  12.180 -	tbit.nz p7,p6=len,0
  12.181 -	;;
  12.182 -	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
  12.183 -	mov ret0=r0				// success
  12.184 -	br.ret.sptk.many rp			// end of most likely path
  12.185 -
  12.186 -	//
  12.187 -	// Outlined error handling code
  12.188 -	//
  12.189 -
  12.190 -	//
  12.191 -	// .Lexit3: comes from core loop, need restore pr/lc
  12.192 -	//	    len contains bytes left
  12.193 -	//
  12.194 -	//
  12.195 -	// .Lexit2:
  12.196 -	//	if p6 -> coming from st8 or st2 : len2 contains what's left
  12.197 -	//	if p7 -> coming from st4 or st1 : len3 contains what's left
  12.198 -	// We must restore lc/pr even though might not have been used.
  12.199 -.Lexit2:
  12.200 -	.pred.rel "mutex", p6, p7
  12.201 -(p6)	mov len=len2
  12.202 -(p7)	mov len=len3
  12.203 -	;;
  12.204 -	//
  12.205 -	// .Lexit4: comes from head, need not restore pr/lc
  12.206 -	//	    len contains bytes left
  12.207 -	//
  12.208 -.Lexit3:
  12.209 -	mov ret0=len
  12.210 -	mov ar.lc=saved_lc
  12.211 -	br.ret.sptk.many rp
  12.212 -END(__do_clear_user)
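
The "theory of operations" comment above describes a pattern worth spelling
out: issue at most one store of each size to reach 16-byte alignment, run a
two-pointer 16-bytes-per-iteration body, then finish with a sized tail, never
touching a byte outside [buf, buf+len).  A hedged C sketch of the head phase
only (the routine has already checked len > 16, so no length tests are
needed; each store is naturally aligned by the steps before it):

    /* Hedged C sketch of the progressive head alignment in the removed
     * __do_clear_user.  Kernel-style type-punned stores, illustrative. */
    static unsigned char *align_head_sketch(unsigned char *buf, unsigned long *len)
    {
            if ((unsigned long) buf & 1) { *buf = 0; buf += 1; *len -= 1; }
            if ((unsigned long) buf & 2) { *(unsigned short *) buf = 0; buf += 2; *len -= 2; }
            if ((unsigned long) buf & 4) { *(unsigned int *) buf = 0; buf += 4; *len -= 4; }
            if ((unsigned long) buf & 8) { *(unsigned long long *) buf = 0; buf += 8; *len -= 8; }
            return buf;     /* now 16-byte aligned; *len bytes remain */
    }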
    13.1 --- a/xen/arch/ia64/linux/lib/copy_page.S	Tue Aug 30 17:51:51 2005 -0600
    13.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.3 @@ -1,98 +0,0 @@
    13.4 -/*
    13.5 - *
    13.6 - * Optimized version of the standard copy_page() function
    13.7 - *
    13.8 - * Inputs:
    13.9 - *	in0:	address of target page
   13.10 - *	in1:	address of source page
   13.11 - * Output:
   13.12 - *	no return value
   13.13 - *
   13.14 - * Copyright (C) 1999, 2001 Hewlett-Packard Co
   13.15 - *	Stephane Eranian <eranian@hpl.hp.com>
   13.16 - *	David Mosberger <davidm@hpl.hp.com>
   13.17 - *
   13.18 - * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
   13.19 - */
   13.20 -#include <asm/asmmacro.h>
   13.21 -#include <asm/page.h>
   13.22 -
   13.23 -#define PIPE_DEPTH	3
   13.24 -#define EPI		p[PIPE_DEPTH-1]
   13.25 -
   13.26 -#define lcount		r16
   13.27 -#define saved_pr	r17
   13.28 -#define saved_lc	r18
   13.29 -#define saved_pfs	r19
   13.30 -#define src1		r20
   13.31 -#define src2		r21
   13.32 -#define tgt1		r22
   13.33 -#define tgt2		r23
   13.34 -#define srcf		r24
   13.35 -#define tgtf		r25
   13.36 -#define tgt_last	r26
   13.37 -
   13.38 -#define Nrot		((8*PIPE_DEPTH+7)&~7)
   13.39 -
   13.40 -GLOBAL_ENTRY(copy_page)
   13.41 -	.prologue
   13.42 -	.save ar.pfs, saved_pfs
   13.43 -	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
   13.44 -
   13.45 -	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
   13.46 -	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
   13.47 -	.rotp p[PIPE_DEPTH]
   13.48 -
   13.49 -	.save ar.lc, saved_lc
   13.50 -	mov saved_lc=ar.lc
   13.51 -	mov ar.ec=PIPE_DEPTH
   13.52 -
   13.53 -	mov lcount=PAGE_SIZE/64-1
   13.54 -	.save pr, saved_pr
   13.55 -	mov saved_pr=pr
   13.56 -	mov pr.rot=1<<16
   13.57 -
   13.58 -	.body
   13.59 -
   13.60 -	mov src1=in1
   13.61 -	adds src2=8,in1
   13.62 -	mov tgt_last = PAGE_SIZE
   13.63 -	;;
   13.64 -	adds tgt2=8,in0
   13.65 -	add srcf=512,in1
   13.66 -	mov ar.lc=lcount
   13.67 -	mov tgt1=in0
   13.68 -	add tgtf=512,in0
   13.69 -	add tgt_last = tgt_last, in0
   13.70 -	;;
   13.71 -1:
   13.72 -(p[0])	ld8 t1[0]=[src1],16
   13.73 -(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
   13.74 -(p[0])	ld8 t2[0]=[src2],16
   13.75 -(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
   13.76 -	cmp.ltu p6,p0 = tgtf, tgt_last
   13.77 -	;;
   13.78 -(p[0])	ld8 t3[0]=[src1],16
   13.79 -(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
   13.80 -(p[0])	ld8 t4[0]=[src2],16
   13.81 -(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
   13.82 -	;;
   13.83 -(p[0])	ld8 t5[0]=[src1],16
   13.84 -(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
   13.85 -(p[0])	ld8 t6[0]=[src2],16
   13.86 -(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
   13.87 -	;;
   13.88 -(p[0])	ld8 t7[0]=[src1],16
   13.89 -(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
   13.90 -(p[0])	ld8 t8[0]=[src2],16
   13.91 -(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
   13.92 -
   13.93 -(p6)	lfetch [srcf], 64
   13.94 -(p6)	lfetch [tgtf], 64
   13.95 -	br.ctop.sptk.few 1b
   13.96 -	;;
   13.97 -	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
   13.98 -	mov ar.pfs=saved_pfs
   13.99 -	mov ar.lc=saved_lc
  13.100 -	br.ret.sptk.many rp
  13.101 -END(copy_page)
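The copy_page() being deleted is a modulo-scheduled loop: two pointers walk the page 16 bytes apart, eight 8-byte transfers per iteration, with lfetch prefetching both streams 512 bytes ahead. The data movement, reduced to C (a sketch only; the rotating registers and lfetch hints have no C equivalent, and the 16KB PAGE_SIZE here is an assumption):

	#define PAGE_SIZE	16384	/* assumed; a common ia64 configuration */

	void copy_page_sketch(void *to, const void *from)
	{
		unsigned long *dst = to;
		const unsigned long *src = from;
		long i, j;

		for (i = 0; i < PAGE_SIZE / 64; i++)	/* 64 bytes per trip */
			for (j = 0; j < 8; j++)
				*dst++ = *src++;
	}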
    14.1 --- a/xen/arch/ia64/linux/lib/copy_user.S	Tue Aug 30 17:51:51 2005 -0600
    14.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.3 @@ -1,610 +0,0 @@
    14.4 -/*
    14.5 - *
    14.6 - * Optimized version of the copy_user() routine.
    14.7 - * It is used to copy data across the kernel/user boundary.
    14.8 - *
    14.9 - * The source and destination are always on opposite sides of
   14.10 - * the boundary. When reading from user space we must catch
   14.11 - * faults on loads. When writing to user space we must catch
   14.12 - * errors on stores. Note that because of the nature of the copy
   14.13 - * we don't need to worry about overlapping regions.
   14.14 - *
   14.15 - *
   14.16 - * Inputs:
   14.17 - *	in0	address of source buffer
   14.18 - *	in1	address of destination buffer
   14.19 - *	in2	number of bytes to copy
   14.20 - *
   14.21 - * Outputs:
   14.22 - *	ret0	0 in case of success. The number of bytes NOT copied in
   14.23 - *		case of error.
   14.24 - *
   14.25 - * Copyright (C) 2000-2001 Hewlett-Packard Co
   14.26 - *	Stephane Eranian <eranian@hpl.hp.com>
   14.27 - *
   14.28 - * Fixme:
   14.29 - *	- handle the case where we have more than 16 bytes and the alignments
   14.30 - *	  are different.
   14.31 - *	- more benchmarking
   14.32 - *	- fix extraneous stop bit introduced by the EX() macro.
   14.33 - */
   14.34 -
   14.35 -#include <asm/asmmacro.h>
   14.36 -
   14.37 -//
   14.38 -// Tuneable parameters
   14.39 -//
   14.40 -#define COPY_BREAK	16	// we do byte copy below (must be >=16)
   14.41 -#define PIPE_DEPTH	21	// pipe depth
   14.42 -
   14.43 -#define EPI		p[PIPE_DEPTH-1]
   14.44 -
   14.45 -//
   14.46 -// arguments
   14.47 -//
   14.48 -#define dst		in0
   14.49 -#define src		in1
   14.50 -#define len		in2
   14.51 -
   14.52 -//
   14.53 -// local registers
   14.54 -//
   14.55 -#define t1		r2	// rshift in bytes
   14.56 -#define t2		r3	// lshift in bytes
   14.57 -#define rshift		r14	// right shift in bits
   14.58 -#define lshift		r15	// left shift in bits
   14.59 -#define word1		r16
   14.60 -#define word2		r17
   14.61 -#define cnt		r18
   14.62 -#define len2		r19
   14.63 -#define saved_lc	r20
   14.64 -#define saved_pr	r21
   14.65 -#define tmp		r22
   14.66 -#define val		r23
   14.67 -#define src1		r24
   14.68 -#define dst1		r25
   14.69 -#define src2		r26
   14.70 -#define dst2		r27
   14.71 -#define len1		r28
   14.72 -#define enddst		r29
   14.73 -#define endsrc		r30
   14.74 -#define saved_pfs	r31
   14.75 -
   14.76 -GLOBAL_ENTRY(__copy_user)
   14.77 -	.prologue
   14.78 -	.save ar.pfs, saved_pfs
   14.79 -	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
   14.80 -
   14.81 -	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
   14.82 -	.rotp p[PIPE_DEPTH]
   14.83 -
   14.84 -	adds len2=-1,len	// br.ctop is repeat/until
   14.85 -	mov ret0=r0
   14.86 -
   14.87 -	;;			// RAW of cfm when len=0
   14.88 -	cmp.eq p8,p0=r0,len	// check for zero length
   14.89 -	.save ar.lc, saved_lc
   14.90 -	mov saved_lc=ar.lc	// preserve ar.lc (slow)
   14.91 -(p8)	br.ret.spnt.many rp	// empty memcpy()
   14.92 -	;;
   14.93 -	add enddst=dst,len	// first byte after end of destination
   14.94 -	add endsrc=src,len	// first byte after end of source
   14.95 -	.save pr, saved_pr
   14.96 -	mov saved_pr=pr		// preserve predicates
   14.97 -
   14.98 -	.body
   14.99 -
  14.100 -	mov dst1=dst		// copy because of rotation
  14.101 -	mov ar.ec=PIPE_DEPTH
  14.102 -	mov pr.rot=1<<16	// p16=true all others are false
  14.103 -
  14.104 -	mov src1=src		// copy because of rotation
  14.105 -	mov ar.lc=len2		// initialize lc for small count
  14.106 -	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
  14.107 -
  14.108 -	xor tmp=src,dst		// same alignment test prepare
  14.109 -(p10)	br.cond.dptk .long_copy_user
  14.110 -	;;			// RAW pr.rot/p16 ?
  14.111 -	//
  14.112 -	// Now we do the byte by byte loop with software pipeline
  14.113 -	//
  14.114 -	// p7 is necessarily false by now
  14.115 -1:
  14.116 -	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  14.117 -	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  14.118 -	br.ctop.dptk.few 1b
  14.119 -	;;
  14.120 -	mov ar.lc=saved_lc
  14.121 -	mov pr=saved_pr,0xffffffffffff0000
  14.122 -	mov ar.pfs=saved_pfs		// restore ar.ec
  14.123 -	br.ret.sptk.many rp		// end of short memcpy
  14.124 -
  14.125 -	//
  14.126 -	// Not 8-byte aligned
  14.127 -	//
  14.128 -.diff_align_copy_user:
  14.129 -	// At this point we know we have more than 16 bytes to copy
  14.130 -	// and also that src and dest do _not_ have the same alignment.
  14.131 -	and src2=0x7,src1				// src offset
  14.132 -	and dst2=0x7,dst1				// dst offset
  14.133 -	;;
  14.134 -	// The basic idea is that we copy byte-by-byte at the head so
  14.135 -	// that we can reach 8-byte alignment for both src1 and dst1.
  14.136 -	// Then copy the body using software pipelined 8-byte copy,
  14.137 -	// shifting the two back-to-back words right and left, then copy
  14.138 -	// the tail by copying byte-by-byte.
  14.139 -	//
  14.140 -	// Fault handling. If the byte-by-byte at the head fails on the
  14.141 -	// load, then restart and finish the pipeline by copying zeros
  14.142 -	// to the dst1. Then copy zeros for the rest of dst1.
  14.143 -	// If 8-byte software pipeline fails on the load, do the same as
  14.144 -	// failure_in3 does. If the byte-by-byte at the tail fails, it is
  14.145 -	// handled simply by failure_in_pipe1.
  14.146 -	//
  14.147 -	// The case p14 means the source has more bytes in its
  14.148 -	// first word (by the shifted part), whereas p15 needs to
  14.149 -	// copy some bytes from the 2nd word of the source that holds
  14.150 -	// the tail of the 1st word of the destination.
  14.151 -	//
  14.152 -
  14.153 -	//
  14.154 -	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
  14.155 -	// to copy the head to dst1 before starting the 8-byte copy software pipeline.
  14.156 -	// We know src1 is not 8-byte aligned in this case.
  14.157 -	//
  14.158 -	cmp.eq p14,p15=r0,dst2
  14.159 -(p15)	br.cond.spnt 1f
  14.160 -	;;
  14.161 -	sub t1=8,src2
  14.162 -	mov t2=src2
  14.163 -	;;
  14.164 -	shl rshift=t2,3
  14.165 -	sub len1=len,t1					// set len1
  14.166 -	;;
  14.167 -	sub lshift=64,rshift
  14.168 -	;;
  14.169 -	br.cond.spnt .word_copy_user
  14.170 -	;;
  14.171 -1:
  14.172 -	cmp.leu	p14,p15=src2,dst2
  14.173 -	sub t1=dst2,src2
  14.174 -	;;
  14.175 -	.pred.rel "mutex", p14, p15
  14.176 -(p14)	sub word1=8,src2				// (8 - src offset)
  14.177 -(p15)	sub t1=r0,t1					// absolute value
  14.178 -(p15)	sub word1=8,dst2				// (8 - dst offset)
  14.179 -	;;
  14.180 -	// For the case p14, we don't need to copy the shifted part to
  14.181 -	// the 1st word of destination.
  14.182 -	sub t2=8,t1
  14.183 -(p14)	sub word1=word1,t1
  14.184 -	;;
  14.185 -	sub len1=len,word1				// resulting len
  14.186 -(p15)	shl rshift=t1,3					// in bits
  14.187 -(p14)	shl rshift=t2,3
  14.188 -	;;
  14.189 -(p14)	sub len1=len1,t1
  14.190 -	adds cnt=-1,word1
  14.191 -	;;
  14.192 -	sub lshift=64,rshift
  14.193 -	mov ar.ec=PIPE_DEPTH
  14.194 -	mov pr.rot=1<<16	// p16=true all others are false
  14.195 -	mov ar.lc=cnt
  14.196 -	;;
  14.197 -2:
  14.198 -	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
  14.199 -	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  14.200 -	br.ctop.dptk.few 2b
  14.201 -	;;
  14.202 -	clrrrb
  14.203 -	;;
  14.204 -.word_copy_user:
  14.205 -	cmp.gtu p9,p0=16,len1
  14.206 -(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
  14.207 -	;;
  14.208 -	shr.u cnt=len1,3		// number of 64-bit words
  14.209 -	;;
  14.210 -	adds cnt=-1,cnt
  14.211 -	;;
  14.212 -	.pred.rel "mutex", p14, p15
  14.213 -(p14)	sub src1=src1,t2
  14.214 -(p15)	sub src1=src1,t1
  14.215 -	//
  14.216 -	// Now both src1 and dst1 point to an 8-byte aligned address. And
  14.217 -	// we have more than 8 bytes to copy.
  14.218 -	//
  14.219 -	mov ar.lc=cnt
  14.220 -	mov ar.ec=PIPE_DEPTH
  14.221 -	mov pr.rot=1<<16	// p16=true all others are false
  14.222 -	;;
  14.223 -3:
  14.224 -	//
  14.225 -	// The pipeline consists of 3 stages:
  14.226 -	// 1 (p16):	Load a word from src1
  14.227 -	// 2 (EPI_1):	Shift right pair, saving to tmp
  14.228 -	// 3 (EPI):	Store tmp to dst1
  14.229 -	//
  14.230 -	// To make it simple, use at least 2 (p16) loops to set up val1[n]
  14.231 -	// because we need 2 back-to-back val1[] to get tmp.
  14.232 -	// Note that this implies EPI_1 must be p18 or greater.
  14.233 -	//
  14.234 -
  14.235 -#define EPI_1		p[PIPE_DEPTH-2]
  14.236 -#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
  14.237 -#define CASE(pred, shift)	\
  14.238 -	(pred)	br.cond.spnt .copy_user_bit##shift
  14.239 -#define BODY(rshift)						\
  14.240 -.copy_user_bit##rshift:						\
  14.241 -1:								\
  14.242 -	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
  14.243 -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
  14.244 -	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
  14.245 -(p16)	mov val1[0]=r0;						\
  14.246 -	br.ctop.dptk 1b;					\
  14.247 -	;;							\
  14.248 -	br.cond.sptk.many .diff_align_do_tail;			\
  14.249 -2:								\
  14.250 -(EPI)	st8 [dst1]=tmp,8;					\
  14.251 -(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
  14.252 -3:								\
  14.253 -(p16)	mov val1[1]=r0;						\
  14.254 -(p16)	mov val1[0]=r0;						\
  14.255 -	br.ctop.dptk 2b;					\
  14.256 -	;;							\
  14.257 -	br.cond.sptk.many .failure_in2
  14.258 -
  14.259 -	//
  14.260 -	// Since the 'shrp' instruction requires a fixed (immediate)
  14.261 -	// count specifying the bits to shift, we need to provide the
  14.262 -	// 7 cases below.
  14.263 -	//
  14.264 -	SWITCH(p6, 8)
  14.265 -	SWITCH(p7, 16)
  14.266 -	SWITCH(p8, 24)
  14.267 -	SWITCH(p9, 32)
  14.268 -	SWITCH(p10, 40)
  14.269 -	SWITCH(p11, 48)
  14.270 -	SWITCH(p12, 56)
  14.271 -	;;
  14.272 -	CASE(p6, 8)
  14.273 -	CASE(p7, 16)
  14.274 -	CASE(p8, 24)
  14.275 -	CASE(p9, 32)
  14.276 -	CASE(p10, 40)
  14.277 -	CASE(p11, 48)
  14.278 -	CASE(p12, 56)
  14.279 -	;;
  14.280 -	BODY(8)
  14.281 -	BODY(16)
  14.282 -	BODY(24)
  14.283 -	BODY(32)
  14.284 -	BODY(40)
  14.285 -	BODY(48)
  14.286 -	BODY(56)
  14.287 -	;;
  14.288 -.diff_align_do_tail:
  14.289 -	.pred.rel "mutex", p14, p15
  14.290 -(p14)	sub src1=src1,t1
  14.291 -(p14)	adds dst1=-8,dst1
  14.292 -(p15)	sub dst1=dst1,t1
  14.293 -	;;
  14.294 -4:
  14.295 -	// Tail correction.
  14.296 -	//
  14.297 -	// The problem with this pipelined loop is that the last word is not
  14.298 -	// loaded and thus part of the last word written is not correct.
  14.299 -	// To fix that, we simply copy the tail byte by byte.
  14.300 -
  14.301 -	sub len1=endsrc,src1,1
  14.302 -	clrrrb
  14.303 -	;;
  14.304 -	mov ar.ec=PIPE_DEPTH
  14.305 -	mov pr.rot=1<<16	// p16=true all others are false
  14.306 -	mov ar.lc=len1
  14.307 -	;;
  14.308 -5:
  14.309 -	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  14.310 -	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  14.311 -	br.ctop.dptk.few 5b
  14.312 -	;;
  14.313 -	mov ar.lc=saved_lc
  14.314 -	mov pr=saved_pr,0xffffffffffff0000
  14.315 -	mov ar.pfs=saved_pfs
  14.316 -	br.ret.sptk.many rp
  14.317 -
  14.318 -	//
  14.319 -	// Beginning of long memcpy (i.e. > 16 bytes)
  14.320 -	//
  14.321 -.long_copy_user:
  14.322 -	tbit.nz p6,p7=src1,0	// odd alignment
  14.323 -	and tmp=7,tmp
  14.324 -	;;
  14.325 -	cmp.eq p10,p8=r0,tmp
  14.326 -	mov len1=len		// copy because of rotation
  14.327 -(p8)	br.cond.dpnt .diff_align_copy_user
  14.328 -	;;
  14.329 -	// At this point we know we have more than 16 bytes to copy
  14.330 -	// and also that both src and dest have the same alignment
  14.331 -	// which may not be the one we want. So for now we must move
  14.332 -	// forward slowly until we reach 16byte alignment: no need to
  14.333 -	// worry about reaching the end of buffer.
  14.334 -	//
  14.335 -	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
  14.336 -(p6)	adds len1=-1,len1;;
  14.337 -	tbit.nz p7,p0=src1,1
  14.338 -	;;
  14.339 -	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
  14.340 -(p7)	adds len1=-2,len1;;
  14.341 -	tbit.nz p8,p0=src1,2
  14.342 -	;;
  14.343 -	//
  14.344 -	// Stop bit not required after ld4 because if we fail on ld4
  14.345 -	// we have never executed the ld1, therefore st1 is not executed.
  14.346 -	//
  14.347 -	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
  14.348 -	;;
  14.349 -	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
  14.350 -	tbit.nz p9,p0=src1,3
  14.351 -	;;
  14.352 -	//
  14.353 -	// Stop bit not required after ld8 because if we fail on ld8
  14.354 -	// we have never executed the ld2, therefore st2 is not executed.
  14.355 -	//
  14.356 -	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
  14.357 -	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
  14.358 -(p8)	adds len1=-4,len1
  14.359 -	;;
  14.360 -	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
  14.361 -(p9)	adds len1=-8,len1;;
  14.362 -	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
  14.363 -	;;
  14.364 -	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
  14.365 -	tbit.nz p6,p0=len1,3
  14.366 -	cmp.eq p7,p0=r0,cnt
  14.367 -	adds tmp=-1,cnt			// br.ctop is repeat/until
  14.368 -(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
  14.369 -	;;
  14.370 -	adds src2=8,src1
  14.371 -	adds dst2=8,dst1
  14.372 -	mov ar.lc=tmp
  14.373 -	;;
  14.374 -	//
  14.375 -	// 16bytes/iteration
  14.376 -	//
  14.377 -2:
  14.378 -	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
  14.379 -(p16)	ld8 val2[0]=[src2],16
  14.380 -
  14.381 -	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
  14.382 -(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
  14.383 -	br.ctop.dptk 2b
  14.384 -	;;			// RAW on src1 when fall through from loop
  14.385 -	//
  14.386 -	// Tail correction based on len only
  14.387 -	//
  14.388 -	// No matter where we come from (loop or test) the src1 pointer
  14.389 -	// is 16 byte aligned AND we have less than 16 bytes to copy.
  14.390 -	//
  14.391 -.dotail:
  14.392 -	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
  14.393 -	tbit.nz p7,p0=len1,2
  14.394 -	;;
  14.395 -	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
  14.396 -	tbit.nz p8,p0=len1,1
  14.397 -	;;
  14.398 -	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
  14.399 -	tbit.nz p9,p0=len1,0
  14.400 -	;;
  14.401 -	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
  14.402 -	;;
  14.403 -	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
  14.404 -	mov ar.lc=saved_lc
  14.405 -	;;
  14.406 -	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
  14.407 -	mov pr=saved_pr,0xffffffffffff0000
  14.408 -	;;
  14.409 -	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
  14.410 -	mov ar.pfs=saved_pfs
  14.411 -	;;
  14.412 -	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
  14.413 -	br.ret.sptk.many rp
  14.414 -
  14.415 -
  14.416 -	//
  14.417 -	// Here we handle the case where the byte by byte copy fails
  14.418 -	// on the load.
  14.419 -	// Several factors make the zeroing of the rest of the buffer kind of
  14.420 -	// tricky:
  14.421 -	//	- the pipeline: loads/stores are not in sync (pipeline)
  14.422 -	//
  14.423 -	//	  In the same loop iteration, the dst1 pointer does not directly
  14.424 -	//	  reflect where the faulty load was.
  14.425 -	//
  14.426 -	//	- pipeline effect
  14.427 -	//	  When you get a fault on load, you may have valid data from
  14.428 -	//	  previous loads not yet stored, still in transit. Such data
  14.429 -	//	  must be stored normally before moving on to zeroing the rest.
  14.430 -	//
  14.431 -	//	- single/multi dispersal independence.
  14.432 -	//
  14.433 -	// solution:
  14.434 -	//	- we don't disrupt the pipeline, i.e. data in transit in
  14.435 -	//	  the software pipeline will eventually be moved to memory.
  14.436 -	//	  We simply replace the load with a simple mov and keep the
  14.437 -	//	  pipeline going. We can't really do this inline because
  14.438 -	//	  p16 is always reset to 1 when lc > 0.
  14.439 -	//
  14.440 -.failure_in_pipe1:
  14.441 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  14.442 -1:
  14.443 -(p16)	mov val1[0]=r0
  14.444 -(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
  14.445 -	br.ctop.dptk 1b
  14.446 -	;;
  14.447 -	mov pr=saved_pr,0xffffffffffff0000
  14.448 -	mov ar.lc=saved_lc
  14.449 -	mov ar.pfs=saved_pfs
  14.450 -	br.ret.sptk.many rp
  14.451 -
  14.452 -	//
  14.453 -	// This is the case where the byte by byte copy fails on the load
  14.454 -	// when we copy the head. We need to finish the pipeline and copy
  14.455 -	// zeros for the rest of the destination. Since this happens
  14.456 -	// at the top we still need to fill the body and tail.
  14.457 -.failure_in_pipe2:
  14.458 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  14.459 -2:
  14.460 -(p16)	mov val1[0]=r0
  14.461 -(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
  14.462 -	br.ctop.dptk 2b
  14.463 -	;;
  14.464 -	sub len=enddst,dst1,1		// precompute len
  14.465 -	br.cond.dptk.many .failure_in1bis
  14.466 -	;;
  14.467 -
  14.468 -	//
  14.469 -	// Here we handle the head & tail part when we check for alignment.
  14.470 -	// The following code handles only the load failures. The
  14.471 -	// main difficulty comes from the fact that loads/stores are
  14.472 -	// scheduled. So when you fail on a load, the stores corresponding
  14.473 -	// to previous successful loads must be executed.
  14.474 -	//
  14.475 -	// However some simplifications are possible given the way
  14.476 -	// things work.
  14.477 -	//
  14.478 -	// 1) HEAD
  14.479 -	// Theory of operation:
  14.480 -	//
  14.481 -	//  Page A   | Page B
  14.482 -	//  ---------|-----
  14.483 -	//          1|8 x
  14.484 -	//	  1 2|8 x
  14.485 -	//	    4|8 x
  14.486 -	//	  1 4|8 x
  14.487 -	//        2 4|8 x
  14.488 -	//      1 2 4|8 x
  14.489 -	//	     |1
  14.490 -	//	     |2 x
  14.491 -	//	     |4 x
  14.492 -	//
  14.493 -	// page_size >= 4k (2^12).  (x means 4, 2, 1)
  14.494 -	// Here we suppose Page A exists and Page B does not.
  14.495 -	//
  14.496 -	// As we move towards eight byte alignment we may encounter faults.
  14.497 -	// The numbers on each page show the size of the load (current alignment).
  14.498 -	//
  14.499 -	// Key point:
  14.500 -	//	- if you fail on 1, 2, 4 then you have never executed any smaller
  14.501 -	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
  14.502 -	//	  before.
  14.503 -	//
  14.504 -	// This allows us to simplify the cleanup code, because basically you
  14.505 -	// only have to worry about "pending" stores in the case of a failing
  14.506 -	// ld8(). Given the way the code is written today, this means only
  14.507 -	// worry about st2, st4. There we can use the information encapsulated
  14.508 -	// into the predicates.
  14.509 -	//
  14.510 -	// Other key point:
  14.511 -	//	- if you fail on the ld8 in the head, it means you went straight
  14.512 -	//	  to it, i.e. 8byte alignment within a nonexistent page.
  14.513 -	// Again this comes from the fact that if you crossed just for the ld8 then
  14.514 -	// you are 8byte aligned but also 16byte aligned, therefore you would
  14.515 -	// either go for the 16byte copy loop OR the ld8 in the tail part.
  14.516 -	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
  14.517 -	// because it would mean you had 15bytes to copy in which case you
  14.518 -	// would have defaulted to the byte by byte copy.
  14.519 -	//
  14.520 -	//
  14.521 -	// 2) TAIL
  14.522 -	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
  14.523 -	// aligned.
  14.524 -	//
  14.525 -	// Key point:
  14.526 -	// This means that we either:
  14.527 -	//		- are right on a page boundary
  14.528 -	//	OR
  14.529 -	//		- are at more than 16 bytes from a page boundary with
  14.530 -	//		  at most 15 bytes to copy: no chance of crossing.
  14.531 -	//
  14.532 -	// This allows us to assume that if we fail on a load we cannot possibly have
  14.533 -	// executed any of the previous (tail) ones, so we don't need to do
  14.534 -	// any stores. For instance, if we fail on ld2, this means we had
  14.535 -	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
  14.536 -	//
  14.537 -	// This means that we are in a situation similar to a fault in the
  14.538 -	// head part. That's nice!
  14.539 -	//
  14.540 -.failure_in1:
  14.541 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  14.542 -	sub len=endsrc,src1,1
  14.543 -	//
  14.544 -	// we know that ret0 can never be zero at this point
  14.545 -	// because we failed while trying to do a load, i.e. there is still
  14.546 -	// some work to do.
  14.547 -	// The failure_in1bis and length problem is taken care of at the
  14.548 -	// calling side.
  14.549 -	//
  14.550 -	;;
  14.551 -.failure_in1bis:		// from (.failure_in3)
  14.552 -	mov ar.lc=len		// Continue with a stupid byte store.
  14.553 -	;;
  14.554 -5:
  14.555 -	st1 [dst1]=r0,1
  14.556 -	br.cloop.dptk 5b
  14.557 -	;;
  14.558 -	mov pr=saved_pr,0xffffffffffff0000
  14.559 -	mov ar.lc=saved_lc
  14.560 -	mov ar.pfs=saved_pfs
  14.561 -	br.ret.sptk.many rp
  14.562 -
  14.563 -	//
  14.564 -	// Here we simply restart the loop but instead
  14.565 -	// of doing loads we fill the pipeline with zeroes.
  14.566 -	// We can't simply store r0 because we may have valid
  14.567 -	// data in transit in the pipeline.
  14.568 -	// ar.lc and ar.ec are setup correctly at this point
  14.569 -	//
  14.570 -	// we MUST use src1/endsrc here and not dst1/enddst because
  14.571 -	// of the pipeline effect.
  14.572 -	//
  14.573 -.failure_in3:
  14.574 -	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  14.575 -	;;
  14.576 -2:
  14.577 -(p16)	mov val1[0]=r0
  14.578 -(p16)	mov val2[0]=r0
  14.579 -(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
  14.580 -(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
  14.581 -	br.ctop.dptk 2b
  14.582 -	;;
  14.583 -	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
  14.584 -	sub len=enddst,dst1,1		// precompute len
  14.585 -(p6)	br.cond.dptk .failure_in1bis
  14.586 -	;;
  14.587 -	mov pr=saved_pr,0xffffffffffff0000
  14.588 -	mov ar.lc=saved_lc
  14.589 -	mov ar.pfs=saved_pfs
  14.590 -	br.ret.sptk.many rp
  14.591 -
  14.592 -.failure_in2:
  14.593 -	sub ret0=endsrc,src1
  14.594 -	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
  14.595 -	sub len=enddst,dst1,1		// precompute len
  14.596 -(p6)	br.cond.dptk .failure_in1bis
  14.597 -	;;
  14.598 -	mov pr=saved_pr,0xffffffffffff0000
  14.599 -	mov ar.lc=saved_lc
  14.600 -	mov ar.pfs=saved_pfs
  14.601 -	br.ret.sptk.many rp
  14.602 -
  14.603 -	//
  14.604 -	// handling of failures on stores: that's the easy part
  14.605 -	//
  14.606 -.failure_out:
  14.607 -	sub ret0=enddst,dst1
  14.608 -	mov pr=saved_pr,0xffffffffffff0000
  14.609 -	mov ar.lc=saved_lc
  14.610 -
  14.611 -	mov ar.pfs=saved_pfs
  14.612 -	br.ret.sptk.many rp
  14.613 -END(__copy_user)
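The core of the misaligned path being deleted is the shrp merge in BODY(rshift): each stored word is stitched together from two back-to-back source words. In C terms (a sketch; it assumes rshift is one of the 8..56 bit counts the SWITCH/CASE dispatch selects, since a shift by 64-0 bits would be undefined in C, and the aligned path handles rshift == 0 anyway):

	void merge_copy_sketch(unsigned long *dst, const unsigned long *src,
			       long nwords, int rshift)
	{
		unsigned long older = src[0];	/* plays val1[PIPE_DEPTH-1] */
		long i;

		/* note: reads nwords+1 source words, one word of look-ahead */
		for (i = 0; i < nwords; i++) {
			unsigned long newer = src[i + 1];	/* val1[PIPE_DEPTH-2] */
			/* shrp: shift the 128-bit pair newer:older right by rshift */
			dst[i] = (older >> rshift) | (newer << (64 - rshift));
			older = newer;
		}
	}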
    15.1 --- a/xen/arch/ia64/linux/lib/csum_partial_copy.c	Tue Aug 30 17:51:51 2005 -0600
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,151 +0,0 @@
    15.4 -/*
    15.5 - * Network Checksum & Copy routine
    15.6 - *
    15.7 - * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
    15.8 - *	Stephane Eranian <eranian@hpl.hp.com>
    15.9 - *
   15.10 - * Most of the code has been imported from Linux/Alpha
   15.11 - */
   15.12 -
   15.13 -#include <linux/module.h>
   15.14 -#include <linux/types.h>
   15.15 -#include <linux/string.h>
   15.16 -
   15.17 -#include <asm/uaccess.h>
   15.18 -
   15.19 -/*
   15.20 - * XXX Fixme: those 2 inlines are meant for debugging and will go away
   15.21 - */
   15.22 -static inline unsigned short
   15.23 -from64to16(unsigned long x)
   15.24 -{
   15.25 -	/* add up 32-bit words for 33 bits */
   15.26 -	x = (x & 0xffffffff) + (x >> 32);
   15.27 -	/* add up 16-bit and 17-bit words for 17+c bits */
   15.28 -	x = (x & 0xffff) + (x >> 16);
   15.29 -	/* add up 16-bit and 2-bit for 16+c bit */
   15.30 -	x = (x & 0xffff) + (x >> 16);
   15.31 -	/* add up carry.. */
   15.32 -	x = (x & 0xffff) + (x >> 16);
   15.33 -	return x;
   15.34 -}
   15.35 -
   15.36 -static inline
   15.37 -unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
   15.38 -{
   15.39 -	int odd, count;
   15.40 -	unsigned long result = (unsigned long)psum;
   15.41 -
   15.42 -	if (len <= 0)
   15.43 -		goto out;
   15.44 -	odd = 1 & (unsigned long) buff;
   15.45 -	if (odd) {
   15.46 -		result = *buff << 8;
   15.47 -		len--;
   15.48 -		buff++;
   15.49 -	}
   15.50 -	count = len >> 1;		/* nr of 16-bit words.. */
   15.51 -	if (count) {
   15.52 -		if (2 & (unsigned long) buff) {
   15.53 -			result += *(unsigned short *) buff;
   15.54 -			count--;
   15.55 -			len -= 2;
   15.56 -			buff += 2;
   15.57 -		}
   15.58 -		count >>= 1;		/* nr of 32-bit words.. */
   15.59 -		if (count) {
   15.60 -			if (4 & (unsigned long) buff) {
   15.61 -				result += *(unsigned int *) buff;
   15.62 -				count--;
   15.63 -				len -= 4;
   15.64 -				buff += 4;
   15.65 -			}
   15.66 -			count >>= 1;	/* nr of 64-bit words.. */
   15.67 -			if (count) {
   15.68 -				unsigned long carry = 0;
   15.69 -				do {
   15.70 -					unsigned long w = *(unsigned long *) buff;
   15.71 -					count--;
   15.72 -					buff += 8;
   15.73 -					result += carry;
   15.74 -					result += w;
   15.75 -					carry = (w > result);
   15.76 -				} while (count);
   15.77 -				result += carry;
   15.78 -				result = (result & 0xffffffff) + (result >> 32);
   15.79 -			}
   15.80 -			if (len & 4) {
   15.81 -				result += *(unsigned int *) buff;
   15.82 -				buff += 4;
   15.83 -			}
   15.84 -		}
   15.85 -		if (len & 2) {
   15.86 -			result += *(unsigned short *) buff;
   15.87 -			buff += 2;
   15.88 -		}
   15.89 -	}
   15.90 -	if (len & 1)
   15.91 -		result += *buff;
   15.92 -
   15.93 -	result = from64to16(result);
   15.94 -
   15.95 -	if (odd)
   15.96 -		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
   15.97 -
   15.98 -out:
   15.99 -	return result;
  15.100 -}
  15.101 -
  15.102 -/*
  15.103 - * XXX Fixme
  15.104 - *
  15.105 - * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
  15.106 - * But it's very tricky to get right even in C.
  15.107 - */
  15.108 -extern unsigned long do_csum(const unsigned char *, long);
  15.109 -
  15.110 -static unsigned int
  15.111 -do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
  15.112 -				int len, unsigned int psum, int *errp)
  15.113 -{
  15.114 -	unsigned long result;
  15.115 -
  15.116 -	/* XXX Fixme
  15.117 -	 * for now we separate the copy from checksum for obvious
  15.118 -	 * alignment difficulties. Look at the Alpha code and you'll be
  15.119 -	 * scared.
  15.120 -	 */
  15.121 -
  15.122 -	if (__copy_from_user(dst, src, len) != 0 && errp)
  15.123 -		*errp = -EFAULT;
  15.124 -
  15.125 -	result = do_csum(dst, len);
  15.126 -
  15.127 -	/* add in old sum, and carry.. */
  15.128 -	result += psum;
  15.129 -	/* 32+c bits -> 32 bits */
  15.130 -	result = (result & 0xffffffff) + (result >> 32);
  15.131 -	return result;
  15.132 -}
  15.133 -
  15.134 -unsigned int
  15.135 -csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
  15.136 -			     int len, unsigned int sum, int *errp)
  15.137 -{
  15.138 -	if (!access_ok(VERIFY_READ, src, len)) {
  15.139 -		*errp = -EFAULT;
  15.140 -		memset(dst, 0, len);
  15.141 -		return sum;
  15.142 -	}
  15.143 -
  15.144 -	return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
  15.145 -}
  15.146 -
  15.147 -unsigned int
  15.148 -csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
  15.149 -			  int len, unsigned int sum)
  15.150 -{
  15.151 -	return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
  15.152 -}
  15.153 -
  15.154 -EXPORT_SYMBOL(csum_partial_copy_nocheck);
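A worked example of the fold that from64to16() above performs (the numbers are illustrative only): unsigned long is 64-bit on ia64, so each step both narrows the sum and re-adds the carry it shifted out.

	/* fold_example(0x1fffeffffUL):
	 *   0x1fffeffff -> 0xfffeffff + 0x1 = 0xffff0000
	 *   0xffff0000  -> 0x0000 + 0xffff  = 0x0000ffff
	 *   stays 0xffff through the last two rounds
	 */
	static unsigned short fold_example(unsigned long x)
	{
		x = (x & 0xffffffffUL) + (x >> 32);
		x = (x & 0xffff) + (x >> 16);
		x = (x & 0xffff) + (x >> 16);
		x = (x & 0xffff) + (x >> 16);
		return x;
	}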
    16.1 --- a/xen/arch/ia64/linux/lib/dec_and_lock.c	Tue Aug 30 17:51:51 2005 -0600
    16.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.3 @@ -1,42 +0,0 @@
    16.4 -/*
    16.5 - * Copyright (C) 2003 Jerome Marchand, Bull S.A.
    16.6 - *	Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com>
    16.7 - *
    16.8 - * This file is released under the GPLv2, or at your option any later version.
    16.9 - *
   16.10 - * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction.  This
   16.11 - * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
   16.12 - */
   16.13 -
   16.14 -#include <linux/compiler.h>
   16.15 -#include <linux/module.h>
   16.16 -#include <linux/spinlock.h>
   16.17 -#include <asm/atomic.h>
   16.18 -
   16.19 -/*
   16.20 - * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock.  Both of these
   16.21 - * operations have to be done atomically, so that the count doesn't drop to zero without
   16.22 - * acquiring the spinlock first.
   16.23 - */
   16.24 -int
   16.25 -_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
   16.26 -{
   16.27 -	int old, new;
   16.28 -
   16.29 -	do {
   16.30 -		old = atomic_read(refcount);
   16.31 -		new = old - 1;
   16.32 -
   16.33 -		if (unlikely (old == 1)) {
   16.34 -			/* oops, we may be decrementing to zero, do it the slow way... */
   16.35 -			spin_lock(lock);
   16.36 -			if (atomic_dec_and_test(refcount))
   16.37 -				return 1;
   16.38 -			spin_unlock(lock);
   16.39 -			return 0;
   16.40 -		}
   16.41 -	} while (cmpxchg(&refcount->counter, old, new) != old);
   16.42 -	return 0;
   16.43 -}
   16.44 -
   16.45 -EXPORT_SYMBOL(_atomic_dec_and_lock);
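For context, this is the caller pattern _atomic_dec_and_lock() exists to serve, sketched with hypothetical names (struct obj, obj_lock, and the kfree() cleanup are illustrative, not from this tree):

	struct obj {
		atomic_t	refcount;
		/* ... payload ... */
	};

	static spinlock_t obj_lock;	/* protects whatever publishes obj */

	void put_obj(struct obj *obj)
	{
		/* returns 1, with obj_lock held, only when the count hit zero */
		if (atomic_dec_and_lock(&obj->refcount, &obj_lock)) {
			/* unpublish obj while the lock still excludes new lookups */
			spin_unlock(&obj_lock);
			kfree(obj);
		}
	}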
    17.1 --- a/xen/arch/ia64/linux/lib/do_csum.S	Tue Aug 30 17:51:51 2005 -0600
    17.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.3 @@ -1,323 +0,0 @@
    17.4 -/*
    17.5 - *
    17.6 - * Optimized version of the standard do_csum() function
    17.7 - *
    17.8 - * Return: a 64bit quantity containing the 16bit Internet checksum
    17.9 - *
   17.10 - * Inputs:
   17.11 - *	in0: address of buffer to checksum (char *)
   17.12 - *	in1: length of the buffer (int)
   17.13 - *
   17.14 - * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
   17.15 - *	Stephane Eranian <eranian@hpl.hp.com>
   17.16 - *
   17.17 - * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
   17.18 - *		Data locality study on the checksum buffer.
   17.19 - *		More optimization cleanup - remove excessive stop bits.
   17.20 - * 02/04/08	David Mosberger <davidm@hpl.hp.com>
   17.21 - *		More cleanup and tuning.
   17.22 - * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
   17.23 - *		Clean up and optimize and the software pipeline, loading two
   17.24 - *		Clean up and optimize the software pipeline, loading two
   17.25 - *		for the loop. Support the cases where load latency = 1 or 2.
   17.26 - *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
   17.27 - */
   17.28 -
   17.29 -#include <asm/asmmacro.h>
   17.30 -
   17.31 -//
   17.32 -// Theory of operations:
   17.33 -//	The goal is to go as quickly as possible to the point where
   17.34 -//	we can checksum 16 bytes/loop. Before reaching that point we must
   17.35 -//	take care of incorrect alignment of first byte.
   17.36 -//
   17.37 -//	The code hereafter also takes care of the "tail" part of the buffer
   17.38 -//	before entering the core loop, if any. The checksum is a sum so it
   17.39 -//	allows us to commute operations. So we do the "head" and "tail"
   17.40 -//	first to finish at full speed in the body. Once we get the head and
   17.41 -//	tail values, we feed them into the pipeline, very handy initialization.
   17.42 -//
   17.43 -//	Of course we deal with the special case where the whole buffer fits
   17.44 -//	into one 8 byte word. In this case we have only one entry in the pipeline.
   17.45 -//
   17.46 -//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
   17.47 -//	possible load latency and also to accommodate the head and tail.
   17.48 -//
   17.49 -//	The end of the function deals with folding the checksum from 64bits
   17.50 -//	down to 16bits taking care of the carry.
   17.51 -//
   17.52 -//	This version avoids synchronization in the core loop by also using a
   17.53 -//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
   17.54 -//
   17.55 -//	 wordx[] (x=1,2)
   17.56 -//	|---|
   17.57 -//      |   | 0			: new value loaded in pipeline
   17.58 -//	|---|
   17.59 -//      |   | -			: in transit data
   17.60 -//	|---|
   17.61 -//      |   | LOAD_LATENCY	: current value to add to checksum
   17.62 -//	|---|
   17.63 -//      |   | LOAD_LATENCY+1	: previous value added to checksum
   17.64 -//      |---|			(previous iteration)
   17.65 -//
   17.66 -//	resultx[] (x=1,2)
   17.67 -//	|---|
   17.68 -//      |   | 0			: initial value
   17.69 -//	|---|
   17.70 -//      |   | LOAD_LATENCY-1	: new checksum
   17.71 -//	|---|
   17.72 -//      |   | LOAD_LATENCY	: previous value of checksum
   17.73 -//	|---|
   17.74 -//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
   17.75 -//      |---|
   17.76 -//
   17.77 -//
   17.78 -//	See RFC1071 "Computing the Internet Checksum" for various techniques for
   17.79 -//	calculating the Internet checksum.
   17.80 -//
   17.81 -// NOT YET DONE:
   17.82 -//	- Maybe another algorithm which would take care of the folding at the
   17.83 -//	  end in a different manner
   17.84 -//	- Work with people more knowledgeable than me on the network stack
   17.85 -//	  to figure out if we could not split the function depending on the
   17.86 -//	  type of packet or alignment we get. Like the ip_fast_csum() routine
   17.87 -//	  where we know we have at least 20bytes worth of data to checksum.
   17.88 -//	- Do a better job of handling small packets.
   17.89 -//	- Note on prefetching: it was found that under various loads, i.e. ftp read/write,
   17.90 -//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
   17.91 -//	  on the data that buffer points to (partly because the checksum is often preceded by
   17.92 -//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
   17.93 -//	  the data is already in the cache.
   17.94 -//
   17.95 -
   17.96 -#define saved_pfs	r11
   17.97 -#define hmask		r16
   17.98 -#define tmask		r17
   17.99 -#define first1		r18
  17.100 -#define firstval	r19
  17.101 -#define firstoff	r20
  17.102 -#define last		r21
  17.103 -#define lastval		r22
  17.104 -#define lastoff		r23
  17.105 -#define saved_lc	r24
  17.106 -#define saved_pr	r25
  17.107 -#define tmp1		r26
  17.108 -#define tmp2		r27
  17.109 -#define tmp3		r28
  17.110 -#define carry1		r29
  17.111 -#define carry2		r30
  17.112 -#define first2		r31
  17.113 -
  17.114 -#define buf		in0
  17.115 -#define len		in1
  17.116 -
  17.117 -#define LOAD_LATENCY	2	// XXX fix me
  17.118 -
  17.119 -#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
  17.120 -# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
  17.121 -#endif
  17.122 -
  17.123 -#define PIPE_DEPTH			(LOAD_LATENCY+2)
  17.124 -#define ELD	p[LOAD_LATENCY]		// end of load
  17.125 -#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
  17.126 -
  17.127 -// unsigned long do_csum(unsigned char *buf,long len)
  17.128 -
  17.129 -GLOBAL_ENTRY(do_csum)
  17.130 -	.prologue
  17.131 -	.save ar.pfs, saved_pfs
  17.132 -	alloc saved_pfs=ar.pfs,2,16,0,16
  17.133 -	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
  17.134 -	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
  17.135 -	mov ret0=r0		// in case we have zero length
  17.136 -	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
  17.137 -	;;
  17.138 -	add tmp1=buf,len	// last byte's address
  17.139 -	.save pr, saved_pr
  17.140 -	mov saved_pr=pr		// preserve predicates (rotation)
  17.141 -(p6)	br.ret.spnt.many rp	// return if zero or negative length
  17.142 -
  17.143 -	mov hmask=-1		// initialize head mask
  17.144 -	tbit.nz p15,p0=buf,0	// is buf an odd address?
  17.145 -	and first1=-8,buf	// 8-byte align down address of first1 element
  17.146 -
  17.147 -	and firstoff=7,buf	// how many bytes off for first1 element
  17.148 -	mov tmask=-1		// initialize tail mask
  17.149 -
  17.150 -	;;
  17.151 -	adds tmp2=-1,tmp1	// last-1
  17.152 -	and lastoff=7,tmp1	// how many bytes off for last element
  17.153 -	;;
  17.154 -	sub tmp1=8,lastoff	// complement to lastoff
  17.155 -	and last=-8,tmp2	// address of word containing last byte
  17.156 -	;;
  17.157 -	sub tmp3=last,first1	// tmp3=distance from first1 to last
  17.158 -	.save ar.lc, saved_lc
  17.159 -	mov saved_lc=ar.lc	// save lc
  17.160 -	cmp.eq p8,p9=last,first1	// everything fits in one word ?
  17.161 -
  17.162 -	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
  17.163 -	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
  17.164 -	shl tmp2=firstoff,3	// number of bits
  17.165 -	;;
  17.166 -(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
  17.167 -	shl tmp1=tmp1,3		// number of bits
  17.168 -(p9)	adds tmp3=-8,tmp3	// effectively loaded
  17.169 -	;;
  17.170 -(p8)	mov lastval=r0		// we don't need lastval if first1==last
  17.171 -	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
  17.172 -	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
  17.173 -	;;
  17.174 -	.body
  17.175 -#define count tmp3
  17.176 -
  17.177 -(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
  17.178 -(p9)	and word2[0]=lastval,tmask	// mask lastval as appropriate
  17.179 -	shr.u count=count,3	// how many 8-byte?
  17.180 -	;;
  17.181 -	// If count is odd, finish this 8-byte word so that we can
  17.182 -	// load two back-to-back 8-byte words per loop thereafter.
  17.183 -	and word1[0]=firstval,hmask	// and mask it as appropriate
  17.184 -	tbit.nz p10,p11=count,0		// if (count is odd)
  17.185 -	;;
  17.186 -(p8)	mov result1[0]=word1[0]
  17.187 -(p9)	add result1[0]=word1[0],word2[0]
  17.188 -	;;
  17.189 -	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
  17.190 -	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
  17.191 -	;;
  17.192 -(p6)	adds result1[0]=1,result1[0]
  17.193 -(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
  17.194 -(p11)	br.cond.dptk .do_csum16		// if (count is even)
  17.195 -
  17.196 -	// Here count is odd.
  17.197 -	ld8 word1[1]=[first1],8		// load an 8-byte word
  17.198 -	cmp.eq p9,p10=1,count		// if (count == 1)
  17.199 -	adds count=-1,count		// loaded an 8-byte word
  17.200 -	;;
  17.201 -	add result1[0]=result1[0],word1[1]
  17.202 -	;;
  17.203 -	cmp.ltu p6,p0=result1[0],word1[1]
  17.204 -	;;
  17.205 -(p6)	adds result1[0]=1,result1[0]
  17.206 -(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
  17.207 -	// Fall through to calculate the checksum, feeding result1[0] as
  17.208 -	// the initial value in result1[0].
  17.209 -	//
  17.210 -	// Calculate the checksum loading two 8-byte words per loop.
  17.211 -	//
  17.212 -.do_csum16:
  17.213 -	add first2=8,first1
  17.214 -	shr.u count=count,1	// we do 16 bytes per loop
  17.215 -	;;
  17.216 -	adds count=-1,count
  17.217 -	mov carry1=r0
  17.218 -	mov carry2=r0
  17.219 -	brp.loop.imp 1f,2f
  17.220 -	;;
  17.221 -	mov ar.ec=PIPE_DEPTH
  17.222 -	mov ar.lc=count	// set lc
  17.223 -	mov pr.rot=1<<16
  17.224 -	// result1[0] must be initialized in advance.
  17.225 -	mov result2[0]=r0
  17.226 -	;;
  17.227 -	.align 32
  17.228 -1:
  17.229 -(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
  17.230 -(pC1[1])adds carry1=1,carry1
  17.231 -(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
  17.232 -(pC2[1])adds carry2=1,carry2
  17.233 -(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
  17.234 -(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
  17.235 -2:
  17.236 -(p[0])	ld8 word1[0]=[first1],16
  17.237 -(p[0])	ld8 word2[0]=[first2],16
  17.238 -	br.ctop.sptk 1b
  17.239 -	;;
  17.240 -	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
  17.241 -(pC1[1])adds carry1=1,carry1	// since we miss the last one
  17.242 -(pC2[1])adds carry2=1,carry2
  17.243 -	;;
  17.244 -	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
  17.245 -	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
  17.246 -	;;
  17.247 -	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
  17.248 -	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
  17.249 -	;;
  17.250 -(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
  17.251 -(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
  17.252 -	;;
  17.253 -	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
  17.254 -	;;
  17.255 -	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
  17.256 -	;;
  17.257 -(p6)	adds result1[0]=1,result1[0]
  17.258 -	;;
  17.259 -.do_csum_exit:
  17.260 -	//
  17.261 -	// now fold 64 into 16 bits taking care of carry
  17.262 -	// that's not very good because it has lots of sequentiality
  17.263 -	//
  17.264 -	mov tmp3=0xffff
  17.265 -	zxt4 tmp1=result1[0]
  17.266 -	shr.u tmp2=result1[0],32
  17.267 -	;;
  17.268 -	add result1[0]=tmp1,tmp2
  17.269 -	;;
  17.270 -	and tmp1=result1[0],tmp3
  17.271 -	shr.u tmp2=result1[0],16
  17.272 -	;;
  17.273 -	add result1[0]=tmp1,tmp2
  17.274 -	;;
  17.275 -	and tmp1=result1[0],tmp3
  17.276 -	shr.u tmp2=result1[0],16
  17.277 -	;;
  17.278 -	add result1[0]=tmp1,tmp2
  17.279 -	;;
  17.280 -	and tmp1=result1[0],tmp3
  17.281 -	shr.u tmp2=result1[0],16
  17.282 -	;;
  17.283 -	add ret0=tmp1,tmp2
  17.284 -	mov pr=saved_pr,0xffffffffffff0000
  17.285 -	;;
  17.286 -	// if buf was odd then swap bytes
  17.287 -	mov ar.pfs=saved_pfs		// restore ar.ec
  17.288 -(p15)	mux1 ret0=ret0,@rev		// reverse word
  17.289 -	;;
  17.290 -	mov ar.lc=saved_lc
  17.291 -(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
  17.292 -	br.ret.sptk.many rp
  17.293 -
  17.294 -//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
  17.295 -//	not much better than the original. So keep the original there so that
  17.296 -//	someone else can challenge.
  17.297 -//
  17.298 -//	shr.u word1[0]=result1[0],32
  17.299 -//	zxt4 result1[0]=result1[0]
  17.300 -//	;;
  17.301 -//	add result1[0]=result1[0],word1[0]
  17.302 -//	;;
  17.303 -//	zxt2 result2[0]=result1[0]
  17.304 -//	extr.u word1[0]=result1[0],16,16
  17.305 -//	shr.u carry1=result1[0],32
  17.306 -//	;;
  17.307 -//	add result2[0]=result2[0],word1[0]
  17.308 -//	;;
  17.309 -//	add result2[0]=result2[0],carry1
  17.310 -//	;;
  17.311 -//	extr.u ret0=result2[0],16,16
  17.312 -//	;;
  17.313 -//	add ret0=ret0,result2[0]
  17.314 -//	;;
  17.315 -//	zxt2 ret0=ret0
  17.316 -//	mov ar.pfs=saved_pfs		 // restore ar.ec
  17.317 -//	mov pr=saved_pr,0xffffffffffff0000
  17.318 -//	;;
  17.319 -//	// if buf was odd then swap bytes
  17.320 -//	mov ar.lc=saved_lc
  17.321 -//(p15)	mux1 ret0=ret0,@rev		// reverse word
  17.322 -//	;;
  17.323 -//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
  17.324 -//	br.ret.sptk.many rp
  17.325 -
  17.326 -END(do_csum)
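The head/tail masking that do_csum sets up before its pipelined loop can be stated compactly in C. A sketch of the head side only (mine, not from the changeset), relying on ia64 being little-endian:

	unsigned long masked_head_word(const unsigned char *buf)
	{
		unsigned long first = (unsigned long)buf & ~7UL;	/* "first1"   */
		unsigned long off   = (unsigned long)buf & 7UL;		/* "firstoff" */
		unsigned long word  = *(const unsigned long *)first;
		unsigned long hmask = ~0UL << (off * 8);	/* shl hmask=hmask,tmp2 */

		return word & hmask;	/* bytes below buf contribute nothing */
	}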
    18.1 --- a/xen/arch/ia64/linux/lib/flush.S	Tue Aug 30 17:51:51 2005 -0600
    18.2 +++ b/xen/arch/ia64/linux/lib/flush.S	Wed Aug 31 14:32:27 2005 -0600
    18.3 @@ -1,39 +1,61 @@
    18.4  /*
    18.5   * Cache flushing routines.
    18.6   *
    18.7 - * Copyright (C) 1999-2001 Hewlett-Packard Co
    18.8 - * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com>
    18.9 + * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
   18.10 + *	David Mosberger-Tang <davidm@hpl.hp.com>
   18.11 + *
   18.12 + * 05/28/05 Zoltan Menyhart	Dynamic stride size
   18.13   */
   18.14 +
   18.15  #include <asm/asmmacro.h>
   18.16 -#include <asm/page.h>
   18.17 +
   18.18  
   18.19  	/*
   18.20  	 * flush_icache_range(start,end)
   18.21 -	 *	Must flush range from start to end-1 but nothing else (need to
   18.22 +	 *
   18.23 +	 *	Make i-cache(s) coherent with d-caches.
   18.24 +	 *
   18.25 +	 *	Must deal with range from start to end-1 but nothing else (need to
   18.26  	 *	be careful not to touch addresses that may be unmapped).
   18.27 +	 *
   18.28 +	 *	Note: "in0" and "in1" are preserved for debugging purposes.
   18.29  	 */
   18.30  GLOBAL_ENTRY(flush_icache_range)
   18.31 +
   18.32  	.prologue
   18.33 -	alloc r2=ar.pfs,2,0,0,0
   18.34 -	sub r8=in1,in0,1
   18.35 +	alloc	r2=ar.pfs,2,0,0,0
   18.36 +	movl	r3=ia64_i_cache_stride_shift
   18.37 +	mov	r21=1
   18.38  	;;
   18.39 -	shr.u r8=r8,5			// we flush 32 bytes per iteration
   18.40 -	.save ar.lc, r3
   18.41 -	mov r3=ar.lc			// save ar.lc
   18.42 +	ld8	r20=[r3]		// r20: stride shift
   18.43 +	sub	r22=in1,r0,1		// last byte address
   18.44 +	;;
   18.45 +	shr.u	r23=in0,r20		// start / (stride size)
   18.46 +	shr.u	r22=r22,r20		// (last byte address) / (stride size)
   18.47 +	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
   18.48 +	;;
   18.49 +	sub	r8=r22,r23		// number of strides - 1
   18.50 +	shl	r24=r23,r20		// r24: addresses for "fc.i" =
   18.51 +					//	"start" rounded down to stride boundary
   18.52 +	.save	ar.lc,r3
   18.53 +	mov	r3=ar.lc		// save ar.lc
   18.54  	;;
   18.55  
   18.56  	.body
   18.57 -
   18.58 -	mov ar.lc=r8
   18.59 +	mov	ar.lc=r8
   18.60  	;;
   18.61 -.Loop:	fc in0				// issuable on M0 only
   18.62 -	add in0=32,in0
   18.63 +	/*
   18.64 +	 * 32 byte aligned loop, even number of (actually 2) bundles
   18.65 +	 */
   18.66 +.Loop:	fc.i	r24			// issuable on M0 only
   18.67 +	add	r24=r21,r24		// we flush "stride size" bytes per iteration
   18.68 +	nop.i	0
   18.69  	br.cloop.sptk.few .Loop
   18.70  	;;
   18.71  	sync.i
   18.72  	;;
   18.73  	srlz.i
   18.74  	;;
   18.75 -	mov ar.lc=r3			// restore ar.lc
   18.76 +	mov	ar.lc=r3		// restore ar.lc
   18.77  	br.ret.sptk.many rp
   18.78  END(flush_icache_range)
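The change above replaces the hard-coded 32-byte step with a stride discovered at run time. Rendered as C for clarity (a sketch; the inline asm form is illustrative, and the assumption that ia64_i_cache_stride_shift is initialized elsewhere from PAL cache information is mine):

	extern unsigned long ia64_i_cache_stride_shift;

	void flush_icache_range_sketch(unsigned long start, unsigned long end)
	{
		unsigned long stride = 1UL << ia64_i_cache_stride_shift;
		unsigned long addr = start & ~(stride - 1);	/* round down */

		for (; addr < end; addr += stride)
			asm volatile ("fc.i %0" :: "r" (addr) : "memory");
		asm volatile (";;\n\tsync.i\n\t;;\n\tsrlz.i\n\t;;" ::: "memory");
	}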
    19.1 --- a/xen/arch/ia64/linux/lib/io.c	Tue Aug 30 17:51:51 2005 -0600
    19.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.3 @@ -1,165 +0,0 @@
    19.4 -#include <linux/config.h>
    19.5 -#include <linux/module.h>
    19.6 -#include <linux/types.h>
    19.7 -
    19.8 -#include <asm/io.h>
    19.9 -
   19.10 -/*
   19.11 - * Copy data from IO memory space to "real" memory space.
   19.12 - * This needs to be optimized.
   19.13 - */
   19.14 -void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
   19.15 -{
   19.16 -	char *dst = to;
   19.17 -
   19.18 -	while (count) {
   19.19 -		count--;
   19.20 -		*dst++ = readb(from++);
   19.21 -	}
   19.22 -}
   19.23 -EXPORT_SYMBOL(memcpy_fromio);
   19.24 -
   19.25 -/*
   19.26 - * Copy data from "real" memory space to IO memory space.
   19.27 - * This needs to be optimized.
   19.28 - */
   19.29 -void memcpy_toio(volatile void __iomem *to, const void *from, long count)
   19.30 -{
   19.31 -	const char *src = from;
   19.32 -
   19.33 -	while (count) {
   19.34 -		count--;
   19.35 -		writeb(*src++, to++);
   19.36 -	}
   19.37 -}
   19.38 -EXPORT_SYMBOL(memcpy_toio);
   19.39 -
   19.40 -/*
   19.41 - * "memset" on IO memory space.
   19.42 - * This needs to be optimized.
   19.43 - */
   19.44 -void memset_io(volatile void __iomem *dst, int c, long count)
   19.45 -{
   19.46 -	unsigned char ch = (char)(c & 0xff);
   19.47 -
   19.48 -	while (count) {
   19.49 -		count--;
   19.50 -		writeb(ch, dst);
   19.51 -		dst++;
   19.52 -	}
   19.53 -}
   19.54 -EXPORT_SYMBOL(memset_io);
   19.55 -
   19.56 -#ifdef CONFIG_IA64_GENERIC
   19.57 -
   19.58 -#undef __ia64_inb
   19.59 -#undef __ia64_inw
   19.60 -#undef __ia64_inl
   19.61 -#undef __ia64_outb
   19.62 -#undef __ia64_outw
   19.63 -#undef __ia64_outl
   19.64 -#undef __ia64_readb
   19.65 -#undef __ia64_readw
   19.66 -#undef __ia64_readl
   19.67 -#undef __ia64_readq
   19.68 -#undef __ia64_readb_relaxed
   19.69 -#undef __ia64_readw_relaxed
   19.70 -#undef __ia64_readl_relaxed
   19.71 -#undef __ia64_readq_relaxed
   19.72 -#undef __ia64_writeb
   19.73 -#undef __ia64_writew
   19.74 -#undef __ia64_writel
   19.75 -#undef __ia64_writeq
   19.76 -#undef __ia64_mmiowb
   19.77 -
   19.78 -unsigned int
   19.79 -__ia64_inb (unsigned long port)
   19.80 -{
   19.81 -	return ___ia64_inb(port);
   19.82 -}
   19.83 -
   19.84 -unsigned int
   19.85 -__ia64_inw (unsigned long port)
   19.86 -{
   19.87 -	return ___ia64_inw(port);
   19.88 -}
   19.89 -
   19.90 -unsigned int
   19.91 -__ia64_inl (unsigned long port)
   19.92 -{
   19.93 -	return ___ia64_inl(port);
   19.94 -}
   19.95 -
   19.96 -void
   19.97 -__ia64_outb (unsigned char val, unsigned long port)
   19.98 -{
   19.99 -	___ia64_outb(val, port);
  19.100 -}
  19.101 -
  19.102 -void
  19.103 -__ia64_outw (unsigned short val, unsigned long port)
  19.104 -{
  19.105 -	___ia64_outw(val, port);
  19.106 -}
  19.107 -
  19.108 -void
  19.109 -__ia64_outl (unsigned int val, unsigned long port)
  19.110 -{
  19.111 -	___ia64_outl(val, port);
  19.112 -}
  19.113 -
  19.114 -unsigned char
  19.115 -__ia64_readb (void __iomem *addr)
  19.116 -{
  19.117 -	return ___ia64_readb (addr);
  19.118 -}
  19.119 -
  19.120 -unsigned short
  19.121 -__ia64_readw (void __iomem *addr)
  19.122 -{
  19.123 -	return ___ia64_readw (addr);
  19.124 -}
  19.125 -
  19.126 -unsigned int
  19.127 -__ia64_readl (void __iomem *addr)
  19.128 -{
  19.129 -	return ___ia64_readl (addr);
  19.130 -}
  19.131 -
  19.132 -unsigned long
  19.133 -__ia64_readq (void __iomem *addr)
  19.134 -{
  19.135 -	return ___ia64_readq (addr);
  19.136 -}
  19.137 -
  19.138 -unsigned char
  19.139 -__ia64_readb_relaxed (void __iomem *addr)
  19.140 -{
  19.141 -	return ___ia64_readb (addr);
  19.142 -}
  19.143 -
  19.144 -unsigned short
  19.145 -__ia64_readw_relaxed (void __iomem *addr)
  19.146 -{
  19.147 -	return ___ia64_readw (addr);
  19.148 -}
  19.149 -
  19.150 -unsigned int
  19.151 -__ia64_readl_relaxed (void __iomem *addr)
  19.152 -{
  19.153 -	return ___ia64_readl (addr);
  19.154 -}
  19.155 -
  19.156 -unsigned long
  19.157 -__ia64_readq_relaxed (void __iomem *addr)
  19.158 -{
  19.159 -	return ___ia64_readq (addr);
  19.160 -}
  19.161 -
  19.162 -void
  19.163 -__ia64_mmiowb(void)
  19.164 -{
  19.165 -	___ia64_mmiowb();
  19.166 -}
  19.167 -
  19.168 -#endif /* CONFIG_IA64_GENERIC */
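These helpers look like a slow memcpy(), but the byte-at-a-time readb()/writeb() loop is the point: device memory cannot safely be touched with the wide, possibly reordered accesses an optimized memcpy() would generate. A hedged usage sketch ('mmio' standing in for some ioremap()ed region):

	void snapshot_device_buffer(void *dst, const volatile void __iomem *mmio,
				    long count)
	{
		memcpy_fromio(dst, mmio, count);	/* one readb() per byte */
	}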
    20.1 --- a/xen/arch/ia64/linux/lib/ip_fast_csum.S	Tue Aug 30 17:51:51 2005 -0600
    20.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.3 @@ -1,90 +0,0 @@
    20.4 -/*
    20.5 - * Optimized version of the ip_fast_csum() function
    20.6 - * Used for calculating IP header checksum
    20.7 - *
    20.8 - * Return: 16bit checksum, complemented
    20.9 - *
   20.10 - * Inputs:
   20.11 - *      in0: address of buffer to checksum (char *)
   20.12 - *      in1: length of the buffer (int)
   20.13 - *
   20.14 - * Copyright (C) 2002 Intel Corp.
   20.15 - * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
   20.16 - */
   20.17 -
   20.18 -#include <asm/asmmacro.h>
   20.19 -
   20.20 -/*
   20.21 - * Since we know that most likely this function is called with buf aligned
   20.22 - * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
   20.23 - * versus calling the generic version of do_csum, which has lots of overhead in
   20.24 - * handling various alignments and sizes.  However, due to the lack of constraints
   20.25 - * put on the function input argument, cases with alignment not on 4-byte or
   20.26 - * size not equal to 20 bytes will be handled by the generic do_csum function.
   20.27 - */
   20.28 -
   20.29 -#define in0	r32
   20.30 -#define in1	r33
   20.31 -#define ret0	r8
   20.32 -
   20.33 -GLOBAL_ENTRY(ip_fast_csum)
   20.34 -	.prologue
   20.35 -	.body
   20.36 -	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
   20.37 -	and	r14=3,in0	// is it aligned on 4-byte?
   20.38 -	add	r15=4,in0	// second source pointer
   20.39 -	;;
   20.40 -	cmp.ne.or.andcm p6,p7=r14,r0
   20.41 -	;;
   20.42 -(p7)	ld4	r20=[in0],8
   20.43 -(p7)	ld4	r21=[r15],8
   20.44 -(p6)	br.spnt	.generic
   20.45 -	;;
   20.46 -	ld4	r22=[in0],8
   20.47 -	ld4	r23=[r15],8
   20.48 -	;;
   20.49 -	ld4	r24=[in0]
   20.50 -	add	r20=r20,r21
   20.51 -	add	r22=r22,r23
   20.52 -	;;
   20.53 -	add	r20=r20,r22
   20.54 -	;;
   20.55 -	add	r20=r20,r24
   20.56 -	;;
   20.57 -	shr.u	ret0=r20,16	// now need to add the carry
   20.58 -	zxt2	r20=r20
   20.59 -	;;
   20.60 -	add	r20=ret0,r20
   20.61 -	;;
   20.62 -	shr.u	ret0=r20,16	// add carry again
   20.63 -	zxt2	r20=r20
   20.64 -	;;
   20.65 -	add	r20=ret0,r20
   20.66 -	;;
   20.67 -	shr.u	ret0=r20,16
   20.68 -	zxt2	r20=r20
   20.69 -	;;
   20.70 -	add	r20=ret0,r20
   20.71 -	;;
   20.72 -	andcm	ret0=-1,r20
   20.73 -	.restore sp		// reset frame state
   20.74 -	br.ret.sptk.many b0
   20.75 -	;;
   20.76 -
   20.77 -.generic:
   20.78 -	.prologue
   20.79 -	.save ar.pfs, r35
   20.80 -	alloc	r35=ar.pfs,2,2,2,0
   20.81 -	.save rp, r34
   20.82 -	mov	r34=b0
   20.83 -	.body
   20.84 -	dep.z	out1=in1,2,30
   20.85 -	mov	out0=in0
   20.86 -	;;
   20.87 -	br.call.sptk.many b0=do_csum
   20.88 -	;;
   20.89 -	andcm	ret0=-1,ret0
   20.90 -	mov	ar.pfs=r35
   20.91 -	mov	b0=r34
   20.92 -	br.ret.sptk.many b0
   20.93 -END(ip_fast_csum)
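In C, the fast path above amounts to five aligned 32-bit loads, an end-around-carry fold, and a complement (a sketch mirroring the asm's three fold steps; other sizes and alignments fall back to do_csum exactly as the branch to .generic does):

	unsigned short ip_fast_csum_sketch(const void *iph)
	{
		const unsigned int *p = iph;
		unsigned long sum = 0;	/* 64-bit on ia64, so no overflow */
		int i;

		for (i = 0; i < 5; i++)		/* 20-byte header, no options */
			sum += p[i];
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries ... */
		sum = (sum & 0xffff) + (sum >> 16);	/* ... three rounds is  */
		sum = (sum & 0xffff) + (sum >> 16);	/* ... always enough    */
		return ~sum & 0xffff;
	}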
    21.1 --- a/xen/arch/ia64/linux/lib/memcpy.S	Tue Aug 30 17:51:51 2005 -0600
    21.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.3 @@ -1,301 +0,0 @@
    21.4 -/*
    21.5 - *
    21.6 - * Optimized version of the standard memcpy() function
    21.7 - *
    21.8 - * Inputs:
    21.9 - * 	in0:	destination address
   21.10 - *	in1:	source address
   21.11 - *	in2:	number of bytes to copy
   21.12 - * Output:
   21.13 - * 	no return value
   21.14 - *
   21.15 - * Copyright (C) 2000-2001 Hewlett-Packard Co
   21.16 - *	Stephane Eranian <eranian@hpl.hp.com>
   21.17 - *	David Mosberger-Tang <davidm@hpl.hp.com>
   21.18 - */
   21.19 -#include <asm/asmmacro.h>
   21.20 -
   21.21 -GLOBAL_ENTRY(memcpy)
   21.22 -
   21.23 -#	define MEM_LAT	21		/* latency to memory */
   21.24 -
   21.25 -#	define dst	r2
   21.26 -#	define src	r3
   21.27 -#	define retval	r8
   21.28 -#	define saved_pfs r9
   21.29 -#	define saved_lc	r10
   21.30 -#	define saved_pr	r11
   21.31 -#	define cnt	r16
   21.32 -#	define src2	r17
   21.33 -#	define t0	r18
   21.34 -#	define t1	r19
   21.35 -#	define t2	r20
   21.36 -#	define t3	r21
   21.37 -#	define t4	r22
   21.38 -#	define src_end	r23
   21.39 -
   21.40 -#	define N	(MEM_LAT + 4)
   21.41 -#	define Nrot	((N + 7) & ~7)
   21.42 -
   21.43 -	/*
   21.44 -	 * First, check if everything (src, dst, len) is a multiple of eight.  If
   21.45 -	 * so, we handle everything with no taken branches (other than the loop
   21.46 -	 * itself) and a small icache footprint.  Otherwise, we jump off to
   21.47 -	 * the more general copy routine handling arbitrary
   21.48 -	 * sizes/alignment etc.
   21.49 -	 */
   21.50 -	.prologue
   21.51 -	.save ar.pfs, saved_pfs
   21.52 -	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
   21.53 -	.save ar.lc, saved_lc
   21.54 -	mov saved_lc=ar.lc
   21.55 -	or t0=in0,in1
   21.56 -	;;
   21.57 -
   21.58 -	or t0=t0,in2
   21.59 -	.save pr, saved_pr
   21.60 -	mov saved_pr=pr
   21.61 -
   21.62 -	.body
   21.63 -
   21.64 -	cmp.eq p6,p0=in2,r0	// zero length?
   21.65 -	mov retval=in0		// return dst
   21.66 -(p6)	br.ret.spnt.many rp	// zero length, return immediately
   21.67 -	;;
   21.68 -
   21.69 -	mov dst=in0		// copy because of rotation
   21.70 -	shr.u cnt=in2,3		// number of 8-byte words to copy
   21.71 -	mov pr.rot=1<<16
   21.72 -	;;
   21.73 -
   21.74 -	adds cnt=-1,cnt		// br.ctop is repeat/until
   21.75 -	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
   21.76 -	mov ar.ec=N
   21.77 -	;;
   21.78 -
   21.79 -	and t0=0x7,t0
   21.80 -	mov ar.lc=cnt
   21.81 -	;;
   21.82 -	cmp.ne p6,p0=t0,r0
   21.83 -
   21.84 -	mov src=in1		// copy because of rotation
   21.85 -(p7)	br.cond.spnt.few .memcpy_short
   21.86 -(p6)	br.cond.spnt.few .memcpy_long
   21.87 -	;;
   21.88 -	nop.m	0
   21.89 -	;;
   21.90 -	nop.m	0
   21.91 -	nop.i	0
   21.92 -	;;
   21.93 -	nop.m	0
   21.94 -	;;
   21.95 -	.rotr val[N]
   21.96 -	.rotp p[N]
   21.97 -	.align 32
   21.98 -1: { .mib
   21.99 -(p[0])	ld8 val[0]=[src],8
  21.100 -	nop.i 0
  21.101 -	brp.loop.imp 1b, 2f
  21.102 -}
  21.103 -2: { .mfb
  21.104 -(p[N-1])st8 [dst]=val[N-1],8
  21.105 -	nop.f 0
  21.106 -	br.ctop.dptk.few 1b
  21.107 -}
  21.108 -	;;
  21.109 -	mov ar.lc=saved_lc
  21.110 -	mov pr=saved_pr,-1
  21.111 -	mov ar.pfs=saved_pfs
  21.112 -	br.ret.sptk.many rp
  21.113 -
  21.114 -	/*
   21.115 -	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
  21.116 -	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
  21.117 -	 * get used very often (gcc inlines small copies) and due to atomicity
  21.118 -	 * issues, we want to avoid read-modify-write of entire words.
  21.119 -	 */
  21.120 -	.align 32
  21.121 -.memcpy_short:
  21.122 -	adds cnt=-1,in2		// br.ctop is repeat/until
  21.123 -	mov ar.ec=MEM_LAT
  21.124 -	brp.loop.imp 1f, 2f
  21.125 -	;;
  21.126 -	mov ar.lc=cnt
  21.127 -	;;
  21.128 -	nop.m	0
  21.129 -	;;
  21.130 -	nop.m	0
  21.131 -	nop.i	0
  21.132 -	;;
  21.133 -	nop.m	0
  21.134 -	;;
  21.135 -	nop.m	0
  21.136 -	;;
  21.137 -	/*
  21.138 -	 * It is faster to put a stop bit in the loop here because it makes
  21.139 -	 * the pipeline shorter (and latency is what matters on short copies).
  21.140 -	 */
  21.141 -	.align 32
  21.142 -1: { .mib
  21.143 -(p[0])	ld1 val[0]=[src],1
  21.144 -	nop.i 0
  21.145 -	brp.loop.imp 1b, 2f
  21.146 -} ;;
  21.147 -2: { .mfb
  21.148 -(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
  21.149 -	nop.f 0
  21.150 -	br.ctop.dptk.few 1b
  21.151 -} ;;
  21.152 -	mov ar.lc=saved_lc
  21.153 -	mov pr=saved_pr,-1
  21.154 -	mov ar.pfs=saved_pfs
  21.155 -	br.ret.sptk.many rp
  21.156 -
  21.157 -	/*
  21.158 -	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
  21.159 -	 * an overriding concern here, but throughput is.  We first do
  21.160 -	 * sub-word copying until the destination is aligned, then we check
  21.161 -	 * if the source is also aligned.  If so, we do a simple load/store-loop
  21.162 -	 * until there are less than 8 bytes left over and then we do the tail,
  21.163 -	 * by storing the last few bytes using sub-word copying.  If the source
  21.164 -	 * is not aligned, we branch off to the non-congruent loop.
  21.165 -	 *
  21.166 -	 *   stage:   op:
  21.167 -	 *         0  ld
  21.168 -	 *	   :
  21.169 -	 * MEM_LAT+3  shrp
  21.170 -	 * MEM_LAT+4  st
  21.171 -	 *
  21.172 -	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
  21.173 -	 * seems to introduce an unavoidable bubble in the pipeline so the overall
  21.174 -	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
   21.175 -	 * of 4 bytes/cycle.  Still not bad.
  21.176 -	 */
  21.177 -#	undef N
  21.178 -#	undef Nrot
  21.179 -#	define N	(MEM_LAT + 5)		/* number of stages */
  21.180 -#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
  21.181 -
  21.182 -#define LOG_LOOP_SIZE	6
  21.183 -
  21.184 -.memcpy_long:
  21.185 -	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
  21.186 -	and t0=-8,src		// t0 = src & ~7
  21.187 -	and t2=7,src		// t2 = src & 7
  21.188 -	;;
  21.189 -	ld8 t0=[t0]		// t0 = 1st source word
  21.190 -	adds src2=7,src		// src2 = (src + 7)
  21.191 -	sub t4=r0,dst		// t4 = -dst
  21.192 -	;;
  21.193 -	and src2=-8,src2	// src2 = (src + 7) & ~7
  21.194 -	shl t2=t2,3		// t2 = 8*(src & 7)
  21.195 -	shl t4=t4,3		// t4 = 8*(dst & 7)
  21.196 -	;;
  21.197 -	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
  21.198 -	sub t3=64,t2		// t3 = 64-8*(src & 7)
  21.199 -	shr.u t0=t0,t2
  21.200 -	;;
  21.201 -	add src_end=src,in2
  21.202 -	shl t1=t1,t3
  21.203 -	mov pr=t4,0x38		// (p5,p4,p3)=(dst & 7)
  21.204 -	;;
  21.205 -	or t0=t0,t1
  21.206 -	mov cnt=r0
  21.207 -	adds src_end=-1,src_end
  21.208 -	;;
  21.209 -(p3)	st1 [dst]=t0,1
  21.210 -(p3)	shr.u t0=t0,8
  21.211 -(p3)	adds cnt=1,cnt
  21.212 -	;;
  21.213 -(p4)	st2 [dst]=t0,2
  21.214 -(p4)	shr.u t0=t0,16
  21.215 -(p4)	adds cnt=2,cnt
  21.216 -	;;
  21.217 -(p5)	st4 [dst]=t0,4
  21.218 -(p5)	adds cnt=4,cnt
  21.219 -	and src_end=-8,src_end	// src_end = last word of source buffer
  21.220 -	;;
  21.221 -
   21.222 -	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
  21.223 -
  21.224 -1:{	add src=cnt,src			// make src point to remainder of source buffer
  21.225 -	sub cnt=in2,cnt			// cnt = number of bytes left to copy
  21.226 -	mov t4=ip
  21.227 -  }	;;
  21.228 -	and src2=-8,src			// align source pointer
  21.229 -	adds t4=.memcpy_loops-1b,t4
  21.230 -	mov ar.ec=N
  21.231 -
  21.232 -	and t0=7,src			// t0 = src & 7
  21.233 -	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
  21.234 -	shl cnt=cnt,3			// move bits 0-2 to 3-5
  21.235 -	;;
  21.236 -
  21.237 -	.rotr val[N+1], w[2]
  21.238 -	.rotp p[N]
  21.239 -
  21.240 -	cmp.ne p6,p0=t0,r0		// is src aligned, too?
   21.241 -	shl t0=t0,LOG_LOOP_SIZE		// t0 = 64*(src & 7), offset into .memcpy_loops
  21.242 -	adds t2=-1,t2			// br.ctop is repeat/until
  21.243 -	;;
  21.244 -	add t4=t0,t4
   21.245 -	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
  21.246 -	mov ar.lc=t2
  21.247 -	;;
  21.248 -	nop.m	0
  21.249 -	;;
  21.250 -	nop.m	0
  21.251 -	nop.i	0
  21.252 -	;;
  21.253 -	nop.m	0
  21.254 -	;;
  21.255 -(p6)	ld8 val[1]=[src2],8		// prime the pump...
  21.256 -	mov b6=t4
  21.257 -	br.sptk.few b6
  21.258 -	;;
  21.259 -
  21.260 -.memcpy_tail:
  21.261 -	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
  21.262 -	// less than 8) and t0 contains the last few bytes of the src buffer:
  21.263 -(p5)	st4 [dst]=t0,4
  21.264 -(p5)	shr.u t0=t0,32
  21.265 -	mov ar.lc=saved_lc
  21.266 -	;;
  21.267 -(p4)	st2 [dst]=t0,2
  21.268 -(p4)	shr.u t0=t0,16
  21.269 -	mov ar.pfs=saved_pfs
  21.270 -	;;
  21.271 -(p3)	st1 [dst]=t0
  21.272 -	mov pr=saved_pr,-1
  21.273 -	br.ret.sptk.many rp
  21.274 -
  21.275 -///////////////////////////////////////////////////////
  21.276 -	.align 64
  21.277 -
  21.278 -#define COPY(shift,index)									\
  21.279 - 1: { .mib											\
  21.280 -	(p[0])		ld8 val[0]=[src2],8;							\
  21.281 -	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
  21.282 -			brp.loop.imp 1b, 2f							\
  21.283 -    };												\
  21.284 - 2: { .mfb											\
  21.285 -	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
  21.286 -			nop.f 0;								\
  21.287 -			br.ctop.dptk.few 1b;							\
  21.288 -    };												\
  21.289 -			;;									\
  21.290 -			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
  21.291 -			;;									\
  21.292 -			shrp t0=val[N-1],val[N-index],shift;					\
  21.293 -			br .memcpy_tail
  21.294 -.memcpy_loops:
  21.295 -	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
  21.296 -	COPY(8, 0)
  21.297 -	COPY(16, 0)
  21.298 -	COPY(24, 0)
  21.299 -	COPY(32, 0)
  21.300 -	COPY(40, 0)
  21.301 -	COPY(48, 0)
  21.302 -	COPY(56, 0)
  21.303 -
  21.304 -END(memcpy)
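
The .memcpy_long path above handles a source/destination alignment mismatch by pairing aligned 8-byte loads with shrp. A hedged C sketch of that merge step (illustrative names; little-endian, as on this target):

	#include <stdint.h>

	/* Given two consecutive aligned 8-byte words and the source
	 * misalignment in bytes, extract the unaligned word that straddles
	 * them -- the same job shrp does in the loop above. */
	static uint64_t merge_unaligned(uint64_t lo, uint64_t hi, unsigned misalign)
	{
		unsigned shift = misalign * 8;	/* bytes -> bits, as shl t2=t2,3 */

		if (shift == 0)			/* avoid the undefined 64-bit shift */
			return lo;
		return (lo >> shift) | (hi << (64 - shift));
	}
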
    22.1 --- a/xen/arch/ia64/linux/lib/memcpy_mck.S	Tue Aug 30 17:51:51 2005 -0600
    22.2 +++ b/xen/arch/ia64/linux/lib/memcpy_mck.S	Wed Aug 31 14:32:27 2005 -0600
    22.3 @@ -75,6 +75,7 @@ GLOBAL_ENTRY(memcpy)
    22.4  	mov	f6=f0
    22.5  	br.cond.sptk .common_code
    22.6  	;;
    22.7 +END(memcpy)
    22.8  GLOBAL_ENTRY(__copy_user)
    22.9  	.prologue
   22.10  // check dest alignment
   22.11 @@ -300,7 +301,7 @@ EK(.ex_handler,	(p[D])	st8 [dst1] = t15,
   22.12  	add	src_pre_mem=0,src0	// prefetch src pointer
   22.13  	add	dst_pre_mem=0,dst0	// prefetch dest pointer
   22.14  	and	src0=-8,src0		// 1st src pointer
   22.15 -(p7)	mov	ar.lc = r21
   22.16 +(p7)	mov	ar.lc = cnt
   22.17  (p8)	mov	ar.lc = r0
   22.18  	;;
   22.19  	TEXT_ALIGN(32)
   22.20 @@ -524,7 +525,6 @@ EK(.ex_handler,  (p17)	st8	[dst1]=r39,8)
   22.21  #undef B
   22.22  #undef C
   22.23  #undef D
   22.24 -END(memcpy)
   22.25  
   22.26  /*
   22.27   * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
    23.1 --- a/xen/arch/ia64/linux/lib/memset.S	Tue Aug 30 17:51:51 2005 -0600
    23.2 +++ b/xen/arch/ia64/linux/lib/memset.S	Wed Aug 31 14:32:27 2005 -0600
    23.3 @@ -57,10 +57,10 @@ GLOBAL_ENTRY(memset)
    23.4  { .mmi
    23.5  	.prologue
    23.6  	alloc	tmp = ar.pfs, 3, 0, 0, 0
    23.7 -	.body
    23.8  	lfetch.nt1 [dest]			//
    23.9  	.save   ar.lc, save_lc
   23.10  	mov.i	save_lc = ar.lc
   23.11 +	.body
   23.12  } { .mmi
   23.13  	mov	ret0 = dest			// return value
   23.14  	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
    24.1 --- a/xen/arch/ia64/linux/lib/strlen_user.S	Tue Aug 30 17:51:51 2005 -0600
    24.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.3 @@ -1,198 +0,0 @@
    24.4 -/*
    24.5 - * Optimized version of the strlen_user() function
    24.6 - *
    24.7 - * Inputs:
    24.8 - *	in0	address of buffer
    24.9 - *
   24.10 - * Outputs:
   24.11 - *	ret0	0 in case of fault, strlen(buffer)+1 otherwise
   24.12 - *
   24.13 - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
   24.14 - *	David Mosberger-Tang <davidm@hpl.hp.com>
   24.15 - *	Stephane Eranian <eranian@hpl.hp.com>
   24.16 - *
   24.17 - * 01/19/99 S.Eranian heavily enhanced version (see details below)
   24.18 - * 09/24/99 S.Eranian added speculation recovery code
   24.19 - */
   24.20 -
   24.21 -#include <asm/asmmacro.h>
   24.22 -
   24.23 -//
   24.24 -// int strlen_user(char *)
   24.25 -// ------------------------
   24.26 -// Returns:
   24.27 -//	- length of string + 1
   24.28 -//	- 0 in case an exception is raised
   24.29 -//
    24.30 -// This is an enhanced version of the basic strlen_user. It includes a
   24.31 -// combination of compute zero index (czx), parallel comparisons, speculative
   24.32 -// loads and loop unroll using rotating registers.
   24.33 -//
   24.34 -// General Ideas about the algorithm:
    24.35 -//	  The goal is to look at the string in chunks of 8 bytes,
    24.36 -//	  so we need to do a few extra checks at the beginning because the
    24.37 -//	  string may not be 8-byte aligned. In this case we load the 8-byte
   24.38 -//	  quantity which includes the start of the string and mask the unused
   24.39 -//	  bytes with 0xff to avoid confusing czx.
   24.40 -//	  We use speculative loads and software pipelining to hide memory
   24.41 -//	  latency and do read ahead safely. This way we defer any exception.
   24.42 -//
   24.43 -//	  Because we don't want the kernel to be relying on particular
   24.44 -//	  settings of the DCR register, we provide recovery code in case
   24.45 -//	  speculation fails. The recovery code is going to "redo" the work using
   24.46 -//	  only normal loads. If we still get a fault then we return an
    24.47 -//	  error (ret0=0). Otherwise we return strlen+1 as usual.
   24.48 -//	  The fact that speculation may fail can be caused, for instance, by
   24.49 -//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
   24.50 -//	  a NaT bit will be set if the translation is not present. The normal
   24.51 -//	  load, on the other hand, will cause the translation to be inserted
   24.52 -//	  if the mapping exists.
   24.53 -//
   24.54 -//	  It should be noted that we execute recovery code only when we need
   24.55 -//	  to use the data that has been speculatively loaded: we don't execute
   24.56 -//	  recovery code on pure read ahead data.
   24.57 -//
   24.58 -// Remarks:
   24.59 -//	- the cmp r0,r0 is used as a fast way to initialize a predicate
   24.60 -//	  register to 1. This is required to make sure that we get the parallel
   24.61 -//	  compare correct.
   24.62 -//
   24.63 -//	- we don't use the epilogue counter to exit the loop but we need to set
   24.64 -//	  it to zero beforehand.
   24.65 -//
    24.66 -//	- after the loop we must test for NaT values because neither the
    24.67 -//	  czx nor the cmp instruction raises a NaT consumption fault. We must be
    24.68 -//	  careful not to look too far for a NaT we don't care about.
   24.69 -//	  For instance we don't need to look at a NaT in val2 if the zero byte
   24.70 -//	  was in val1.
   24.71 -//
   24.72 -//	- Clearly performance tuning is required.
   24.73 -//
   24.74 -
   24.75 -#define saved_pfs	r11
   24.76 -#define	tmp		r10
   24.77 -#define base		r16
   24.78 -#define orig		r17
   24.79 -#define saved_pr	r18
   24.80 -#define src		r19
   24.81 -#define mask		r20
   24.82 -#define val		r21
   24.83 -#define val1		r22
   24.84 -#define val2		r23
   24.85 -
   24.86 -GLOBAL_ENTRY(__strlen_user)
   24.87 -	.prologue
   24.88 -	.save ar.pfs, saved_pfs
   24.89 -	alloc saved_pfs=ar.pfs,11,0,0,8
   24.90 -
   24.91 -	.rotr v[2], w[2]	// declares our 4 aliases
   24.92 -
   24.93 -	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
    24.94 -	mov orig=in0		// keep track of initial byte address
   24.95 -	dep src=0,in0,0,3	// src=8byte-aligned in0 address
   24.96 -	.save pr, saved_pr
   24.97 -	mov saved_pr=pr		// preserve predicates (rotation)
   24.98 -	;;
   24.99 -
  24.100 -	.body
  24.101 -
   24.102 -	ld8.s v[1]=[src],8	// load the initial 8 bytes (must speculate)
  24.103 -	shl tmp=tmp,3		// multiply by 8bits/byte
  24.104 -	mov mask=-1		// our mask
  24.105 -	;;
  24.106 -	ld8.s w[1]=[src],8	// load next 8 bytes in 2nd pipeline
  24.107 -	cmp.eq p6,p0=r0,r0	// sets p6 (required because of // cmp.and)
  24.108 -	sub tmp=64,tmp		// how many bits to shift our mask on the right
  24.109 -	;;
   24.110 -	shr.u	mask=mask,tmp	// zero enough bits to preserve v[1]'s valid part
  24.111 -	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
  24.112 -	;;
  24.113 -	add base=-16,src	// keep track of aligned base
  24.114 -	chk.s v[1], .recover	// if already NaT, then directly skip to recover
  24.115 -	or v[1]=v[1],mask	// now we have a safe initial byte pattern
  24.116 -	;;
  24.117 -1:
  24.118 -	ld8.s v[0]=[src],8	// speculatively load next
  24.119 -	czx1.r val1=v[1]	// search 0 byte from right
  24.120 -	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
  24.121 -	;;
  24.122 -	ld8.s w[0]=[src],8	// speculatively load next to next
  24.123 -	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
   24.124 -	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
  24.125 -(p6)	br.wtop.dptk.few 1b	// loop until p6 == 0
  24.126 -	;;
  24.127 -	//
   24.128 -	// We must try the recovery code iff
  24.129 -	// val1_is_nat || (val1==8 && val2_is_nat)
  24.130 -	//
  24.131 -	// XXX Fixme
  24.132 -	//	- there must be a better way of doing the test
  24.133 -	//
   24.134 -	cmp.eq  p8,p9=8,val1	// p8 = no zero byte in val1 (disambiguate)
  24.135 -	tnat.nz p6,p7=val1	// test NaT on val1
  24.136 -(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
  24.137 -	;;
  24.138 -	//
  24.139 -	// if we come here p7 is true, i.e., initialized for // cmp
  24.140 -	//
   24.141 -	cmp.eq.and  p7,p0=8,val1	// val1==8?
   24.142 -	tnat.nz.and p7,p0=val2	// test NaT on val2
  24.143 -(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
  24.144 -	;;
  24.145 -(p8)	mov val1=val2		// val2 contains the value
  24.146 -(p8)	adds src=-16,src	// correct position when 3 ahead
  24.147 -(p9)	adds src=-24,src	// correct position when 4 ahead
  24.148 -	;;
  24.149 -	sub ret0=src,orig	// distance from origin
  24.150 -	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
  24.151 -	mov pr=saved_pr,0xffffffffffff0000
  24.152 -	;;
  24.153 -	sub ret0=ret0,tmp	// length=now - back -1
  24.154 -	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  24.155 -	br.ret.sptk.many rp	// end of normal execution
  24.156 -
  24.157 -	//
  24.158 -	// Outlined recovery code when speculation failed
  24.159 -	//
  24.160 -	// This time we don't use speculation and rely on the normal exception
   24.161 -	// mechanism. That's why the loop is not as good as the previous one
  24.162 -	// because read ahead is not possible
  24.163 -	//
  24.164 -	// XXX Fixme
  24.165 -	//	- today we restart from the beginning of the string instead
  24.166 -	//	  of trying to continue where we left off.
  24.167 -	//
  24.168 -.recover:
  24.169 -	EX(.Lexit1, ld8 val=[base],8)	// load the initial bytes
  24.170 -	;;
  24.171 -	or val=val,mask			// remask first bytes
  24.172 -	cmp.eq p0,p6=r0,r0		// nullify first ld8 in loop
  24.173 -	;;
  24.174 -	//
  24.175 -	// ar.ec is still zero here
  24.176 -	//
  24.177 -2:
  24.178 -	EX(.Lexit1, (p6) ld8 val=[base],8)
  24.179 -	;;
  24.180 -	czx1.r val1=val		// search 0 byte from right
  24.181 -	;;
  24.182 -	cmp.eq p6,p0=8,val1	// val1==8 ?
  24.183 -(p6)	br.wtop.dptk.few 2b	// loop until p6 == 0
  24.184 -	;;
  24.185 -	sub ret0=base,orig	// distance from base
  24.186 -	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
  24.187 -	mov pr=saved_pr,0xffffffffffff0000
  24.188 -	;;
  24.189 -	sub ret0=ret0,tmp	// length=now - back -1
  24.190 -	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  24.191 -	br.ret.sptk.many rp	// end of successful recovery code
  24.192 -
  24.193 -	//
  24.194 -	// We failed even on the normal load (called from exception handler)
  24.195 -	//
  24.196 -.Lexit1:
  24.197 -	mov ret0=0
  24.198 -	mov pr=saved_pr,0xffffffffffff0000
  24.199 -	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  24.200 -	br.ret.sptk.many rp
  24.201 -END(__strlen_user)
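
The deleted routine scans 8 bytes per iteration, using czx1.r to locate the zero byte. As a portable illustration (not the kernel's code), the same word-at-a-time idea in C, assuming a little-endian, 8-byte-aligned start; note the deleted routine returns this length plus one (counting the NUL), or 0 on fault:

	#include <stdint.h>
	#include <stddef.h>

	/* Portable stand-in for the czx1.r scan above: find the first zero
	 * byte by examining the string 8 bytes at a time. */
	static size_t strlen_sketch(const char *s)
	{
		const uint64_t *p = (const uint64_t *)s;
		uint64_t v = *p++;
		size_t n = 0;

		/* haszero(v): true iff some byte of v is zero */
		while (!((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL)) {
			v = *p++;
			n += 8;
		}
		while (v & 0xff) {	/* czx1.r yields this index directly */
			v >>= 8;
			n++;
		}
		return n;
	}
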
    25.1 --- a/xen/arch/ia64/linux/lib/strncpy_from_user.S	Tue Aug 30 17:51:51 2005 -0600
    25.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.3 @@ -1,44 +0,0 @@
    25.4 -/*
    25.5 - * Just like strncpy() except that if a fault occurs during copying,
    25.6 - * -EFAULT is returned.
    25.7 - *
    25.8 - * Inputs:
    25.9 - *	in0:	address of destination buffer
   25.10 - *	in1:	address of string to be copied
   25.11 - *	in2:	length of buffer in bytes
   25.12 - * Outputs:
   25.13 - *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
   25.14 - *
   25.15 - * Copyright (C) 1998-2001 Hewlett-Packard Co
   25.16 - * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
   25.17 - *
   25.18 - * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
    25.19 - *			 Andreas Schwab <schwab@suse.de>).
   25.20 - */
   25.21 -
   25.22 -#include <asm/asmmacro.h>
   25.23 -
   25.24 -GLOBAL_ENTRY(__strncpy_from_user)
   25.25 -	alloc r2=ar.pfs,3,0,0,0
   25.26 -	mov r8=0
   25.27 -	mov r9=in1
   25.28 -	;;
   25.29 -	add r10=in1,in2
   25.30 -	cmp.eq p6,p0=r0,in2
   25.31 -(p6)	br.ret.spnt.many rp
   25.32 -
   25.33 -	// XXX braindead copy loop---this needs to be optimized
   25.34 -.Loop1:
   25.35 -	EX(.Lexit, ld1 r8=[in1],1)
   25.36 -	;;
   25.37 -	EX(.Lexit, st1 [in0]=r8,1)
   25.38 -	cmp.ne p6,p7=r8,r0
   25.39 -	;;
   25.40 -(p6)	cmp.ne.unc p8,p0=in1,r10
   25.41 -(p8)	br.cond.dpnt.few .Loop1
   25.42 -	;;
   25.43 -(p6)	mov r8=in2		// buffer filled up---return buffer length
   25.44 -(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
   25.45 -[.Lexit:]
   25.46 -	br.ret.sptk.many rp
   25.47 -END(__strncpy_from_user)
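
For clarity, the deleted routine's return contract restated as a hedged C sketch, with the EX() fault handling elided (a faulting access would yield -EFAULT instead):

	#include <stddef.h>

	/* Sketch of the contract above: copy up to n bytes, stopping after
	 * a NUL; return the string length (excluding the NUL) if one was
	 * copied, or n if the buffer filled up first. */
	static long strncpy_from_user_sketch(char *dst, const char *src, size_t n)
	{
		size_t i;

		for (i = 0; i < n; i++) {
			dst[i] = src[i];	/* EX(.Lexit, ld1/st1) in the asm */
			if (src[i] == '\0')
				return i;	/* length, excluding NUL */
		}
		return n;			/* buffer filled up */
	}
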
    26.1 --- a/xen/arch/ia64/linux/lib/strnlen_user.S	Tue Aug 30 17:51:51 2005 -0600
    26.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.3 @@ -1,45 +0,0 @@
    26.4 -/*
    26.5 - * Returns 0 if exception before NUL or reaching the supplied limit (N),
    26.6 - * a value greater than N if the string is longer than the limit, else
     26.7 - * strlen+1.
    26.8 - *
    26.9 - * Inputs:
   26.10 - *	in0:	address of buffer
   26.11 - *	in1:	string length limit N
   26.12 - * Outputs:
   26.13 - *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
   26.14 - *
   26.15 - * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
   26.16 - */
   26.17 -
   26.18 -#include <asm/asmmacro.h>
   26.19 -
   26.20 -GLOBAL_ENTRY(__strnlen_user)
   26.21 -	.prologue
   26.22 -	alloc r2=ar.pfs,2,0,0,0
   26.23 -	.save ar.lc, r16
   26.24 -	mov r16=ar.lc			// preserve ar.lc
   26.25 -
   26.26 -	.body
   26.27 -
   26.28 -	add r3=-1,in1
   26.29 -	;;
   26.30 -	mov ar.lc=r3
   26.31 -	mov r9=0
   26.32 -	;;
   26.33 -	// XXX braindead strlen loop---this needs to be optimized
   26.34 -.Loop1:
   26.35 -	EXCLR(.Lexit, ld1 r8=[in0],1)
   26.36 -	add r9=1,r9
   26.37 -	;;
   26.38 -	cmp.eq p6,p0=r8,r0
   26.39 -(p6)	br.cond.dpnt .Lexit
   26.40 -	br.cloop.dptk.few .Loop1
   26.41 -
   26.42 -	add r9=1,in1			// NUL not found---return N+1
   26.43 -	;;
   26.44 -.Lexit:
   26.45 -	mov r8=r9
   26.46 -	mov ar.lc=r16			// restore ar.lc
   26.47 -	br.ret.sptk.many rp
   26.48 -END(__strnlen_user)
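
Likewise, the semantics of the loop above in plain C (fault path elided; the EXCLR() wrapper would force a 0 return on a faulting load):

	#include <stddef.h>

	/* Contract sketch for the deleted __strnlen_user: strlen+1 if a NUL
	 * is found within the limit, otherwise a value greater than N. */
	static size_t strnlen_user_sketch(const char *s, size_t limit)
	{
		size_t i;

		for (i = 0; i < limit; i++)
			if (s[i] == '\0')
				return i + 1;	/* strlen + 1, counting the NUL */
		return limit + 1;		/* NUL not found: return N + 1 */
	}
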
    27.1 --- a/xen/arch/ia64/linux/lib/xor.S	Tue Aug 30 17:51:51 2005 -0600
    27.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.3 @@ -1,184 +0,0 @@
    27.4 -/*
    27.5 - * arch/ia64/lib/xor.S
    27.6 - *
    27.7 - * Optimized RAID-5 checksumming functions for IA-64.
    27.8 - *
    27.9 - * This program is free software; you can redistribute it and/or modify
   27.10 - * it under the terms of the GNU General Public License as published by
   27.11 - * the Free Software Foundation; either version 2, or (at your option)
   27.12 - * any later version.
   27.13 - *
   27.14 - * You should have received a copy of the GNU General Public License
   27.15 - * (for example /usr/src/linux/COPYING); if not, write to the Free
   27.16 - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   27.17 - */
   27.18 -
   27.19 -#include <asm/asmmacro.h>
   27.20 -
   27.21 -GLOBAL_ENTRY(xor_ia64_2)
   27.22 -	.prologue
   27.23 -	.fframe 0
   27.24 -	.save ar.pfs, r31
   27.25 -	alloc r31 = ar.pfs, 3, 0, 13, 16
   27.26 -	.save ar.lc, r30
   27.27 -	mov r30 = ar.lc
   27.28 -	.save pr, r29
   27.29 -	mov r29 = pr
   27.30 -	;;
   27.31 -	.body
   27.32 -	mov r8 = in1
   27.33 -	mov ar.ec = 6 + 2
   27.34 -	shr in0 = in0, 3
   27.35 -	;;
   27.36 -	adds in0 = -1, in0
   27.37 -	mov r16 = in1
   27.38 -	mov r17 = in2
   27.39 -	;;
   27.40 -	mov ar.lc = in0
   27.41 -	mov pr.rot = 1 << 16
   27.42 -	;;
   27.43 -	.rotr s1[6+1], s2[6+1], d[2]
   27.44 -	.rotp p[6+2]
   27.45 -0:
   27.46 -(p[0])	ld8.nta s1[0] = [r16], 8
   27.47 -(p[0])	ld8.nta s2[0] = [r17], 8
   27.48 -(p[6])	xor d[0] = s1[6], s2[6]
   27.49 -(p[6+1])st8.nta [r8] = d[1], 8
   27.50 -	nop.f 0
   27.51 -	br.ctop.dptk.few 0b
   27.52 -	;;
   27.53 -	mov ar.lc = r30
   27.54 -	mov pr = r29, -1
   27.55 -	br.ret.sptk.few rp
   27.56 -END(xor_ia64_2)
   27.57 -
   27.58 -GLOBAL_ENTRY(xor_ia64_3)
   27.59 -	.prologue
   27.60 -	.fframe 0
   27.61 -	.save ar.pfs, r31
   27.62 -	alloc r31 = ar.pfs, 4, 0, 20, 24
   27.63 -	.save ar.lc, r30
   27.64 -	mov r30 = ar.lc
   27.65 -	.save pr, r29
   27.66 -	mov r29 = pr
   27.67 -	;;
   27.68 -	.body
   27.69 -	mov r8 = in1
   27.70 -	mov ar.ec = 6 + 2
   27.71 -	shr in0 = in0, 3
   27.72 -	;;
   27.73 -	adds in0 = -1, in0
   27.74 -	mov r16 = in1
   27.75 -	mov r17 = in2
   27.76 -	;;
   27.77 -	mov r18 = in3
   27.78 -	mov ar.lc = in0
   27.79 -	mov pr.rot = 1 << 16
   27.80 -	;;
   27.81 -	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
   27.82 -	.rotp p[6+2]
   27.83 -0:
   27.84 -(p[0])	ld8.nta s1[0] = [r16], 8
   27.85 -(p[0])	ld8.nta s2[0] = [r17], 8
   27.86 -(p[6])	xor d[0] = s1[6], s2[6]
   27.87 -	;;
   27.88 -(p[0])	ld8.nta s3[0] = [r18], 8
   27.89 -(p[6+1])st8.nta [r8] = d[1], 8
   27.90 -(p[6])	xor d[0] = d[0], s3[6]
   27.91 -	br.ctop.dptk.few 0b
   27.92 -	;;
   27.93 -	mov ar.lc = r30
   27.94 -	mov pr = r29, -1
   27.95 -	br.ret.sptk.few rp
   27.96 -END(xor_ia64_3)
   27.97 -
   27.98 -GLOBAL_ENTRY(xor_ia64_4)
   27.99 -	.prologue
  27.100 -	.fframe 0
  27.101 -	.save ar.pfs, r31
  27.102 -	alloc r31 = ar.pfs, 5, 0, 27, 32
  27.103 -	.save ar.lc, r30
  27.104 -	mov r30 = ar.lc
  27.105 -	.save pr, r29
  27.106 -	mov r29 = pr
  27.107 -	;;
  27.108 -	.body
  27.109 -	mov r8 = in1
  27.110 -	mov ar.ec = 6 + 2
  27.111 -	shr in0 = in0, 3
  27.112 -	;;
  27.113 -	adds in0 = -1, in0
  27.114 -	mov r16 = in1
  27.115 -	mov r17 = in2
  27.116 -	;;
  27.117 -	mov r18 = in3
  27.118 -	mov ar.lc = in0
  27.119 -	mov pr.rot = 1 << 16
  27.120 -	mov r19 = in4
  27.121 -	;;
  27.122 -	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
  27.123 -	.rotp p[6+2]
  27.124 -0:
  27.125 -(p[0])	ld8.nta s1[0] = [r16], 8
  27.126 -(p[0])	ld8.nta s2[0] = [r17], 8
  27.127 -(p[6])	xor d[0] = s1[6], s2[6]
  27.128 -(p[0])	ld8.nta s3[0] = [r18], 8
  27.129 -(p[0])	ld8.nta s4[0] = [r19], 8
  27.130 -(p[6])	xor r20 = s3[6], s4[6]
  27.131 -	;;
  27.132 -(p[6+1])st8.nta [r8] = d[1], 8
  27.133 -(p[6])	xor d[0] = d[0], r20
  27.134 -	br.ctop.dptk.few 0b
  27.135 -	;;
  27.136 -	mov ar.lc = r30
  27.137 -	mov pr = r29, -1
  27.138 -	br.ret.sptk.few rp
  27.139 -END(xor_ia64_4)
  27.140 -
  27.141 -GLOBAL_ENTRY(xor_ia64_5)
  27.142 -	.prologue
  27.143 -	.fframe 0
  27.144 -	.save ar.pfs, r31
  27.145 -	alloc r31 = ar.pfs, 6, 0, 34, 40
  27.146 -	.save ar.lc, r30
  27.147 -	mov r30 = ar.lc
  27.148 -	.save pr, r29
  27.149 -	mov r29 = pr
  27.150 -	;;
  27.151 -	.body
  27.152 -	mov r8 = in1
  27.153 -	mov ar.ec = 6 + 2
  27.154 -	shr in0 = in0, 3
  27.155 -	;;
  27.156 -	adds in0 = -1, in0
  27.157 -	mov r16 = in1
  27.158 -	mov r17 = in2
  27.159 -	;;
  27.160 -	mov r18 = in3
  27.161 -	mov ar.lc = in0
  27.162 -	mov pr.rot = 1 << 16
  27.163 -	mov r19 = in4
  27.164 -	mov r20 = in5
  27.165 -	;;
  27.166 -	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
  27.167 -	.rotp p[6+2]
  27.168 -0:
  27.169 -(p[0])	ld8.nta s1[0] = [r16], 8
  27.170 -(p[0])	ld8.nta s2[0] = [r17], 8
  27.171 -(p[6])	xor d[0] = s1[6], s2[6]
  27.172 -(p[0])	ld8.nta s3[0] = [r18], 8
  27.173 -(p[0])	ld8.nta s4[0] = [r19], 8
  27.174 -(p[6])	xor r21 = s3[6], s4[6]
  27.175 -	;;
  27.176 -(p[0])	ld8.nta s5[0] = [r20], 8
  27.177 -(p[6+1])st8.nta [r8] = d[1], 8
  27.178 -(p[6])	xor d[0] = d[0], r21
  27.179 -	;;
  27.180 -(p[6])	  xor d[0] = d[0], s5[6]
  27.181 -	nop.f 0
  27.182 -	br.ctop.dptk.few 0b
  27.183 -	;;
  27.184 -	mov ar.lc = r30
  27.185 -	mov pr = r29, -1
  27.186 -	br.ret.sptk.few rp
  27.187 -END(xor_ia64_5)
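
Each xor_ia64_N above is a software-pipelined version of a simple word-wise XOR; xor_ia64_2, for example, reduces to this C sketch (the byte count must be a multiple of 8, per the shr in0,3 above):

	#include <stddef.h>

	/* C equivalent of xor_ia64_2: XOR the second buffer into the first,
	 * 8 bytes at a time. xor_ia64_3..5 fold in further source buffers
	 * the same way. */
	static void xor_2_sketch(size_t bytes, unsigned long *p1,
				 const unsigned long *p2)
	{
		size_t i;

		for (i = 0; i < bytes / sizeof(unsigned long); i++)
			p1[i] ^= p2[i];
	}
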
    28.1 --- a/xen/arch/ia64/linux/minstate.h	Tue Aug 30 17:51:51 2005 -0600
    28.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.3 @@ -1,254 +0,0 @@
    28.4 -#include <linux/config.h>
    28.5 -
    28.6 -#include <asm/cache.h>
    28.7 -
    28.8 -#include "entry.h"
    28.9 -
   28.10 -/*
   28.11 - * For ivt.s we want to access the stack virtually so we don't have to disable translation
   28.12 - * on interrupts.
   28.13 - *
   28.14 - *  On entry:
   28.15 - *	r1:	pointer to current task (ar.k6)
   28.16 - */
   28.17 -#define MINSTATE_START_SAVE_MIN_VIRT								\
   28.18 -(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
   28.19 -	;;											\
   28.20 -(pUStk)	mov.m r24=ar.rnat;									\
   28.21 -(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
   28.22 -(pKStk) mov r1=sp;					/* get sp  */				\
   28.23 -	;;											\
   28.24 -(pUStk) lfetch.fault.excl.nt1 [r22];								\
   28.25 -(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
   28.26 -(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
   28.27 -	;;											\
   28.28 -(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
   28.29 -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
   28.30 -	;;											\
   28.31 -(pUStk)	mov r18=ar.bsp;										\
   28.32 -(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
   28.33 -
   28.34 -#define MINSTATE_END_SAVE_MIN_VIRT								\
   28.35 -	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
   28.36 -	;;
   28.37 -
   28.38 -/*
   28.39 - * For mca_asm.S we want to access the stack physically since the state is saved before we
   28.40 - * go virtual and don't want to destroy the iip or ipsr.
   28.41 - */
   28.42 -#define MINSTATE_START_SAVE_MIN_PHYS								\
   28.43 -(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;								\
   28.44 -(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;							\
   28.45 -(pKStk) ld8 r3 = [r3];;										\
   28.46 -(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;						\
   28.47 -(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;						\
   28.48 -(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
   28.49 -(pUStk)	addl r22=IA64_RBS_OFFSET,r1;		/* compute base of register backing store */	\
   28.50 -	;;											\
   28.51 -(pUStk)	mov r24=ar.rnat;									\
   28.52 -(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
   28.53 -(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
   28.54 -(pUStk)	dep r22=-1,r22,61,3;			/* compute kernel virtual addr of RBS */	\
   28.55 -	;;											\
   28.56 -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
   28.57 -(pUStk)	mov ar.bspstore=r22;			/* switch to kernel RBS */			\
   28.58 -	;;											\
   28.59 -(pUStk)	mov r18=ar.bsp;										\
   28.60 -(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
   28.61 -
   28.62 -#define MINSTATE_END_SAVE_MIN_PHYS								\
   28.63 -	dep r12=-1,r12,61,3;		/* make sp a kernel virtual address */			\
   28.64 -	;;
   28.65 -
   28.66 -#ifdef MINSTATE_VIRT
   28.67 -# define MINSTATE_GET_CURRENT(reg)	\
   28.68 -		movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
   28.69 -		ld8 reg=[reg]
   28.70 -# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_VIRT
   28.71 -# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_VIRT
   28.72 -#endif
   28.73 -
   28.74 -#ifdef MINSTATE_PHYS
   28.75 -# define MINSTATE_GET_CURRENT(reg)	mov reg=IA64_KR(CURRENT);; tpa reg=reg
   28.76 -# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_PHYS
   28.77 -# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_PHYS
   28.78 -#endif
   28.79 -
   28.80 -/*
   28.81 - * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
   28.82 - * the minimum state necessary that allows us to turn psr.ic back
   28.83 - * on.
   28.84 - *
   28.85 - * Assumed state upon entry:
   28.86 - *	psr.ic: off
   28.87 - *	r31:	contains saved predicates (pr)
   28.88 - *
   28.89 - * Upon exit, the state is as follows:
   28.90 - *	psr.ic: off
   28.91 - *	 r2 = points to &pt_regs.r16
   28.92 - *	 r8 = contents of ar.ccv
   28.93 - *	 r9 = contents of ar.csd
   28.94 - *	r10 = contents of ar.ssd
   28.95 - *	r11 = FPSR_DEFAULT
   28.96 - *	r12 = kernel sp (kernel virtual address)
   28.97 - *	r13 = points to current task_struct (kernel virtual address)
   28.98 - *	p15 = TRUE if psr.i is set in cr.ipsr
   28.99 - *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
  28.100 - *		preserved
  28.101 - *
  28.102 - * Note that psr.ic is NOT turned on by this macro.  This is so that
  28.103 - * we can pass interruption state as arguments to a handler.
  28.104 - */
  28.105 -#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
  28.106 -	MINSTATE_GET_CURRENT(r16);	/* M (or M;;I) */					\
  28.107 -	mov r27=ar.rsc;			/* M */							\
  28.108 -	mov r20=r1;			/* A */							\
  28.109 -	mov r25=ar.unat;		/* M */							\
  28.110 -	mov r29=cr.ipsr;		/* M */							\
  28.111 -	mov r26=ar.pfs;			/* I */							\
  28.112 -	mov r28=cr.iip;			/* M */							\
  28.113 -	mov r21=ar.fpsr;		/* M */							\
  28.114 -	COVER;				/* B;; (or nothing) */					\
  28.115 -	;;											\
  28.116 -	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
  28.117 -	;;											\
  28.118 -	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
  28.119 -	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
  28.120 -	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
  28.121 -	/* switch from user to kernel RBS: */							\
  28.122 -	;;											\
  28.123 -	invala;				/* M */							\
  28.124 -	SAVE_IFS;										\
  28.125 -	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
  28.126 -	;;											\
  28.127 -	MINSTATE_START_SAVE_MIN									\
  28.128 -	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
  28.129 -	adds r16=PT(CR_IPSR),r1;								\
  28.130 -	;;											\
  28.131 -	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
  28.132 -	st8 [r16]=r29;		/* save cr.ipsr */						\
  28.133 -	;;											\
  28.134 -	lfetch.fault.excl.nt1 [r17];								\
  28.135 -	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
  28.136 -	mov r29=b0										\
  28.137 -	;;											\
  28.138 -	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
  28.139 -	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
  28.140 -(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
  28.141 -	;;											\
  28.142 -.mem.offset 0,0; st8.spill [r16]=r8,16;								\
  28.143 -.mem.offset 8,0; st8.spill [r17]=r9,16;								\
  28.144 -        ;;											\
  28.145 -.mem.offset 0,0; st8.spill [r16]=r10,24;							\
  28.146 -.mem.offset 8,0; st8.spill [r17]=r11,24;							\
  28.147 -        ;;											\
  28.148 -	st8 [r16]=r28,16;	/* save cr.iip */						\
  28.149 -	st8 [r17]=r30,16;	/* save cr.ifs */						\
  28.150 -(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
  28.151 -	mov r8=ar.ccv;										\
  28.152 -	mov r9=ar.csd;										\
  28.153 -	mov r10=ar.ssd;										\
  28.154 -	movl r11=FPSR_DEFAULT;   /* L-unit */							\
  28.155 -	;;											\
  28.156 -	st8 [r16]=r25,16;	/* save ar.unat */						\
  28.157 -	st8 [r17]=r26,16;	/* save ar.pfs */						\
  28.158 -	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
  28.159 -	;;											\
  28.160 -	st8 [r16]=r27,16;	/* save ar.rsc */						\
  28.161 -(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
  28.162 -(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
  28.163 -	;;			/* avoid RAW on r16 & r17 */					\
  28.164 -(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
  28.165 -	st8 [r17]=r31,16;	/* save predicates */						\
  28.166 -(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
  28.167 -	;;											\
  28.168 -	st8 [r16]=r29,16;	/* save b0 */							\
  28.169 -	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
  28.170 -	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
  28.171 -	;;											\
  28.172 -.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
  28.173 -.mem.offset 8,0; st8.spill [r17]=r12,16;							\
  28.174 -	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
  28.175 -	;;											\
  28.176 -.mem.offset 0,0; st8.spill [r16]=r13,16;							\
  28.177 -.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
  28.178 -	movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;					\
  28.179 -	ld8 r13=[r13];			/* establish 'current' */				\
  28.180 -	;;											\
  28.181 -.mem.offset 0,0; st8.spill [r16]=r15,16;							\
  28.182 -.mem.offset 8,0; st8.spill [r17]=r14,16;							\
  28.183 -	;;											\
  28.184 -.mem.offset 0,0; st8.spill [r16]=r2,16;								\
  28.185 -.mem.offset 8,0; st8.spill [r17]=r3,16;								\
  28.186 -	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
  28.187 -	;;											\
  28.188 -	EXTRA;											\
  28.189 -	movl r1=__gp;		/* establish kernel global pointer */				\
  28.190 -	;;											\
  28.191 -	MINSTATE_END_SAVE_MIN
  28.192 -
  28.193 -/*
  28.194 - * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
  28.195 - *
  28.196 - * Assumed state upon entry:
  28.197 - *	psr.ic: on
  28.198 - *	r2:	points to &pt_regs.r16
  28.199 - *	r3:	points to &pt_regs.r17
  28.200 - *	r8:	contents of ar.ccv
  28.201 - *	r9:	contents of ar.csd
  28.202 - *	r10:	contents of ar.ssd
  28.203 - *	r11:	FPSR_DEFAULT
  28.204 - *
  28.205 - * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
  28.206 - */
  28.207 -#define SAVE_REST				\
  28.208 -.mem.offset 0,0; st8.spill [r2]=r16,16;		\
  28.209 -.mem.offset 8,0; st8.spill [r3]=r17,16;		\
  28.210 -	;;					\
  28.211 -.mem.offset 0,0; st8.spill [r2]=r18,16;		\
  28.212 -.mem.offset 8,0; st8.spill [r3]=r19,16;		\
  28.213 -	;;					\
  28.214 -.mem.offset 0,0; st8.spill [r2]=r20,16;		\
  28.215 -.mem.offset 8,0; st8.spill [r3]=r21,16;		\
  28.216 -	mov r18=b6;				\
  28.217 -	;;					\
  28.218 -.mem.offset 0,0; st8.spill [r2]=r22,16;		\
  28.219 -.mem.offset 8,0; st8.spill [r3]=r23,16;		\
  28.220 -	mov r19=b7;				\
  28.221 -	;;					\
  28.222 -.mem.offset 0,0; st8.spill [r2]=r24,16;		\
  28.223 -.mem.offset 8,0; st8.spill [r3]=r25,16;		\
  28.224 -	;;					\
  28.225 -.mem.offset 0,0; st8.spill [r2]=r26,16;		\
  28.226 -.mem.offset 8,0; st8.spill [r3]=r27,16;		\
  28.227 -	;;					\
  28.228 -.mem.offset 0,0; st8.spill [r2]=r28,16;		\
  28.229 -.mem.offset 8,0; st8.spill [r3]=r29,16;		\
  28.230 -	;;					\
  28.231 -.mem.offset 0,0; st8.spill [r2]=r30,16;		\
  28.232 -.mem.offset 8,0; st8.spill [r3]=r31,32;		\
  28.233 -	;;					\
  28.234 -	mov ar.fpsr=r11;	/* M-unit */	\
  28.235 -	st8 [r2]=r8,8;		/* ar.ccv */	\
  28.236 -	adds r24=PT(B6)-PT(F7),r3;		\
  28.237 -	;;					\
  28.238 -	stf.spill [r2]=f6,32;			\
  28.239 -	stf.spill [r3]=f7,32;			\
  28.240 -	;;					\
  28.241 -	stf.spill [r2]=f8,32;			\
  28.242 -	stf.spill [r3]=f9,32;			\
  28.243 -	;;					\
  28.244 -	stf.spill [r2]=f10;			\
  28.245 -	stf.spill [r3]=f11;			\
  28.246 -	adds r25=PT(B7)-PT(F11),r3;		\
  28.247 -	;;					\
  28.248 -	st8 [r24]=r18,16;       /* b6 */	\
  28.249 -	st8 [r25]=r19,16;       /* b7 */	\
  28.250 -	;;					\
  28.251 -	st8 [r24]=r9;        	/* ar.csd */	\
  28.252 -	st8 [r25]=r10;      	/* ar.ssd */	\
  28.253 -	;;
  28.254 -
  28.255 -#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(cover, mov r30=cr.ifs,)
  28.256 -#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
  28.257 -#define SAVE_MIN		DO_SAVE_MIN(     , mov r30=r0, )
    29.1 --- a/xen/arch/ia64/linux/pcdp.h	Tue Aug 30 17:51:51 2005 -0600
    29.2 +++ b/xen/arch/ia64/linux/pcdp.h	Wed Aug 31 14:32:27 2005 -0600
    29.3 @@ -2,7 +2,7 @@
    29.4   * Definitions for PCDP-defined console devices
    29.5   *
    29.6   * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
    29.7 - * v2.0:  http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf
    29.8 + * v2.0:  http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
    29.9   *
   29.10   * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
   29.11   *	Khalid Aziz <khalid.aziz@hp.com>
   29.12 @@ -52,11 +52,36 @@ struct pcdp_uart {
   29.13  	u32				clock_rate;
   29.14  	u8				pci_prog_intfc;
   29.15  	u8				flags;
   29.16 -};
   29.17 +	u16				conout_index;
   29.18 +	u32				reserved;
   29.19 +} __attribute__((packed));
   29.20 +
   29.21 +#define PCDP_IF_PCI	1
   29.22 +
   29.23 +/* pcdp_if_pci.trans */
   29.24 +#define PCDP_PCI_TRANS_IOPORT	0x02
   29.25 +#define PCDP_PCI_TRANS_MMIO	0x01
   29.26 +
   29.27 +struct pcdp_if_pci {
   29.28 +	u8			interconnect;
   29.29 +	u8			reserved;
   29.30 +	u16			length;
   29.31 +	u8			segment;
   29.32 +	u8			bus;
   29.33 +	u8			dev;
   29.34 +	u8			fun;
   29.35 +	u16			dev_id;
   29.36 +	u16			vendor_id;
   29.37 +	u32			acpi_interrupt;
   29.38 +	u64			mmio_tra;
   29.39 +	u64			ioport_tra;
   29.40 +	u8			flags;
   29.41 +	u8			trans;
   29.42 +} __attribute__((packed));
   29.43  
   29.44  struct pcdp_vga {
   29.45  	u8			count;		/* address space descriptors */
   29.46 -};
   29.47 +} __attribute__((packed));
   29.48  
   29.49  /* pcdp_device.flags */
   29.50  #define PCDP_PRIMARY_CONSOLE	1
   29.51 @@ -66,7 +91,9 @@ struct pcdp_device {
   29.52  	u8			flags;
   29.53  	u16			length;
   29.54  	u16			efi_index;
   29.55 -};
   29.56 +	/* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
   29.57 +	/* next data is device specific type (currently only pcdp_vga) */
   29.58 +} __attribute__((packed));
   29.59  
   29.60  struct pcdp {
   29.61  	u8			signature[4];
   29.62 @@ -81,4 +108,4 @@ struct pcdp {
   29.63  	u32			num_uarts;
   29.64  	struct pcdp_uart	uart[0];	/* actual size is num_uarts */
   29.65  	/* remainder of table is pcdp_device structures */
   29.66 -};
   29.67 +} __attribute__((packed));
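
The packed attributes added above matter because these structs are overlaid directly on the firmware-provided PCDP table, which has no compiler-inserted padding. A small userspace illustration with hypothetical struct names:

	#include <stdint.h>
	#include <stdio.h>

	/* Without the attribute the compiler may pad between members, so
	 * the struct would no longer match the byte-exact table layout. */
	struct sample_unpacked { uint8_t type; uint32_t clock_rate; };
	struct sample_packed   { uint8_t type; uint32_t clock_rate; } __attribute__((packed));

	int main(void)
	{
		/* Typically prints 8 and 5. */
		printf("%zu %zu\n", sizeof(struct sample_unpacked),
		       sizeof(struct sample_packed));
		return 0;
	}
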
    30.1 --- a/xen/arch/ia64/pdb-stub.c	Tue Aug 30 17:51:51 2005 -0600
    30.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.3 @@ -1,59 +0,0 @@
    30.4 -
    30.5 -/*
    30.6 - * pervasive debugger
    30.7 - * www.cl.cam.ac.uk/netos/pdb
    30.8 - *
    30.9 - * alex ho
   30.10 - * 2004
   30.11 - * university of cambridge computer laboratory
   30.12 - *
   30.13 - * code adapted originally from kgdb, nemesis, & gdbserver
   30.14 - */
   30.15 -
   30.16 -#include <xen/lib.h>
   30.17 -#include <xen/sched.h>
   30.18 -#include <asm/ptrace.h>
   30.19 -#include <xen/keyhandler.h> 
   30.20 -#include <asm/processor.h>
   30.21 -#include <asm/pdb.h>
   30.22 -#include <xen/list.h>
   30.23 -#include <xen/serial.h>
   30.24 -
   30.25 -#define __PDB_GET_VAL 1
   30.26 -#define __PDB_SET_VAL 2
   30.27 -
   30.28 -/*
   30.29 - * Read or write memory in an address space
   30.30 - */
   30.31 -int pdb_change_values(u_char *buffer, int length,
   30.32 -		      unsigned long cr3, unsigned long addr, int rw)
   30.33 -{
   30.34 -	dummy();
   30.35 -	return 0;
   30.36 -}
   30.37 -
   30.38 -/*
   30.39 - * Set memory in a domain's address space
   30.40 - * Set "length" bytes at "address" from "domain" to the values in "buffer".
   30.41 - * Return the number of bytes set, 0 if there was a problem.
   30.42 - */
   30.43 -
   30.44 -int pdb_set_values(u_char *buffer, int length,
   30.45 -		   unsigned long cr3, unsigned long addr)
   30.46 -{
   30.47 -    int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL);
   30.48 -    return count;
   30.49 -}
   30.50 -
   30.51 -/*
   30.52 - * Read memory from a domain's address space.
   30.53 - * Fetch "length" bytes at "address" from "domain" into "buffer".
   30.54 - * Return the number of bytes read, 0 if there was a problem.
   30.55 - */
   30.56 -
   30.57 -int pdb_get_values(u_char *buffer, int length,
   30.58 -		   unsigned long cr3, unsigned long addr)
   30.59 -{
   30.60 -  return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL);
   30.61 -}
   30.62 -
    31.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.2 +++ b/xen/include/asm-ia64/linux/sort.h	Wed Aug 31 14:32:27 2005 -0600
    31.3 @@ -0,0 +1,10 @@
    31.4 +#ifndef _LINUX_SORT_H
    31.5 +#define _LINUX_SORT_H
    31.6 +
    31.7 +#include <linux/types.h>
    31.8 +
    31.9 +void sort(void *base, size_t num, size_t size,
   31.10 +	  int (*cmp)(const void *, const void *),
   31.11 +	  void (*swap)(void *, void *, int));
   31.12 +
   31.13 +#endif
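
A hedged usage sketch of the sort() interface introduced above (cmp_int and sort_example are illustrative, not in the tree; the prototype is restated so the sketch stands alone):

	#include <stddef.h>

	/* Prototype as declared in sort.h above. */
	void sort(void *base, size_t num, size_t size,
		  int (*cmp)(const void *, const void *),
		  void (*swap)(void *, void *, int));

	static int cmp_int(const void *a, const void *b)
	{
		return *(const int *)a - *(const int *)b;
	}

	static void sort_example(void)
	{
		int v[] = { 3, 1, 2 };

		/* Passing a NULL swap is assumed here to select the
		 * implementation's generic byte-wise swap, as in the Linux
		 * lib/sort.c this header fronts. */
		sort(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), cmp_int, NULL);
	}
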