ia64/xen-unstable
changeset 6454:b7276814008c
Begin updating to 2.6.13 base
author          djm@kirby.fc.hp.com
date            Wed Aug 31 14:32:27 2005 -0600 (2005-08-31)
parents         44316ce83277
children        4e4f1db8ea94
files           xen/arch/ia64/Makefile xen/arch/ia64/linux-xen/minstate.h xen/arch/ia64/linux-xen/setup.c xen/arch/ia64/linux-xen/sort.c xen/arch/ia64/linux/README.origin xen/arch/ia64/linux/extable.c xen/arch/ia64/linux/ia64_ksyms.c xen/arch/ia64/linux/irq_lsapic.c xen/arch/ia64/linux/lib/flush.S xen/arch/ia64/linux/lib/memcpy_mck.S xen/arch/ia64/linux/lib/memset.S xen/arch/ia64/linux/pcdp.h xen/include/asm-ia64/linux/sort.h
line diff
1.1 --- a/xen/arch/ia64/Makefile Tue Aug 30 17:51:51 2005 -0600 1.2 +++ b/xen/arch/ia64/Makefile Wed Aug 31 14:32:27 2005 -0600 1.3 @@ -1,19 +1,22 @@ 1.4 include $(BASEDIR)/Rules.mk 1.5 1.6 -VPATH = linux linux-xen 1.7 +VPATH = linux linux-xen linux/lib 1.8 +#VPATH = linux-xen linux/lib 1.9 1.10 # libs-y += arch/ia64/lib/lib.a 1.11 1.12 OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \ 1.13 - xenmisc.o pdb-stub.o acpi.o hypercall.o \ 1.14 + xenmisc.o acpi.o hypercall.o \ 1.15 machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \ 1.16 idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \ 1.17 xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \ 1.18 - extable.o linuxextable.o xenirq.o xentime.o \ 1.19 + extable.o linuxextable.o sort.o xenirq.o xentime.o \ 1.20 regionreg.o entry.o unaligned.o privop.o vcpu.o \ 1.21 irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \ 1.22 grant_table.o sn_console.o 1.23 1.24 +#OBJS += idiv64.o idiv32.o \ 1.25 + 1.26 # TMP holder to contain *.0 moved out of CONFIG_VTI 1.27 OBJS += vmx_init.o 1.28 1.29 @@ -22,6 +25,13 @@ OBJS += vmx_virt.o vmx_vcpu.o vmx_proces 1.30 vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \ 1.31 vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o 1.32 endif 1.33 + 1.34 +# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib) 1.35 +OBJS += bitop.o clear_page.o flush.o copy_page_mck.o \ 1.36 + memset.o strlen.o memcpy_mck.o \ 1.37 + __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ 1.38 + __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o 1.39 + 1.40 # perfmon.o 1.41 # unwind.o needed for kernel unwinding (rare) 1.42 1.43 @@ -30,8 +40,8 @@ OBJS := $(subst $(TARGET_ARCH)/asm-offse 1.44 # remove following line if not privifying in memory 1.45 # OBJS += privify.o 1.46 1.47 -default: $(OBJS) head.o ia64lib.o xen.lds.s 1.48 - $(LD) -r -o arch.o $(OBJS) ia64lib.o 1.49 +default: $(OBJS) head.o xen.lds.s 1.50 + $(LD) -r -o arch.o $(OBJS) 1.51 $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \ 1.52 -Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms 1.53 $(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET) 1.54 @@ -79,12 +89,29 @@ xen.lds.s: xen.lds.S 1.55 $(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \ 1.56 -o xen.lds.s xen.lds.S 1.57 1.58 -ia64lib.o: 1.59 - $(MAKE) -C linux/lib && cp linux/lib/ia64lib.o . 1.60 +# variants of divide/modulo 1.61 +# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib) 1.62 +__divdi3.o: idiv64.S 1.63 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< 1.64 +__udivdi3.o: idiv64.S 1.65 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< 1.66 +__moddi3.o: idiv64.S 1.67 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< 1.68 +__umoddi3.o: idiv64.S 1.69 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< 1.70 +__divsi3.o: idiv32.S 1.71 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< 1.72 +__udivsi3.o: idiv32.S 1.73 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< 1.74 +__modsi3.o: idiv32.S 1.75 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< 1.76 +__umodsi3.o: idiv32.S 1.77 + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< 1.78 + 1.79 1.80 clean: 1.81 rm -f *.o *~ core xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s 1.82 rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h 1.83 - rm -f lib/*.o 1.84 + rm -f linux/lib/*.o 1.85 1.86 .PHONY: default clean
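For reference, the eight libgcc-style integer routines in the rules above come from just two assembly sources, with -DUNSIGNED and -DMODULO selecting the flavour at compile time. A hedged C sketch of that convention (illustrative only; the real selection happens via the same #ifdefs in idiv32.S/idiv64.S):

```c
/* Illustrative only: how one source builds four objects per width. */
#ifdef UNSIGNED
typedef unsigned long long op_t;        /* __udivdi3 / __umoddi3 builds */
#else
typedef long long op_t;                 /* __divdi3 / __moddi3 builds */
#endif

op_t idiv_entry(op_t a, op_t b)
{
#ifdef MODULO
	return a % b;                   /* -DMODULO: modulo flavour */
#else
	return a / b;                   /* default: divide flavour */
#endif
}
```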
2.1 --- a/xen/arch/ia64/lib/Makefile Tue Aug 30 17:51:51 2005 -0600 2.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 2.3 @@ -1,44 +0,0 @@ 2.4 -# 2.5 -# Makefile for ia64-specific library routines.. 2.6 -# 2.7 - 2.8 -include $(BASEDIR)/Rules.mk 2.9 - 2.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ 2.11 - __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ 2.12 - bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ 2.13 - clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ 2.14 - flush.o ip_fast_csum.o do_csum.o copy_user.o \ 2.15 - memset.o strlen.o memcpy.o 2.16 - 2.17 -default: $(OBJS) 2.18 - $(LD) -r -o ia64lib.o $(OBJS) 2.19 - 2.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__ 2.21 - 2.22 -__divdi3.o: idiv64.S 2.23 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< 2.24 - 2.25 -__udivdi3.o: idiv64.S 2.26 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< 2.27 - 2.28 -__moddi3.o: idiv64.S 2.29 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< 2.30 - 2.31 -__umoddi3.o: idiv64.S 2.32 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< 2.33 - 2.34 -__divsi3.o: idiv32.S 2.35 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< 2.36 - 2.37 -__udivsi3.o: idiv32.S 2.38 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< 2.39 - 2.40 -__modsi3.o: idiv32.S 2.41 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< 2.42 - 2.43 -__umodsi3.o: idiv32.S 2.44 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< 2.45 - 2.46 -clean: 2.47 - rm -f *.o *~
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/xen/arch/ia64/linux-xen/minstate.h Wed Aug 31 14:32:27 2005 -0600 3.3 @@ -0,0 +1,254 @@ 3.4 +#include <linux/config.h> 3.5 + 3.6 +#include <asm/cache.h> 3.7 + 3.8 +#include "entry.h" 3.9 + 3.10 +/* 3.11 + * For ivt.s we want to access the stack virtually so we don't have to disable translation 3.12 + * on interrupts. 3.13 + * 3.14 + * On entry: 3.15 + * r1: pointer to current task (ar.k6) 3.16 + */ 3.17 +#define MINSTATE_START_SAVE_MIN_VIRT \ 3.18 +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ 3.19 + ;; \ 3.20 +(pUStk) mov.m r24=ar.rnat; \ 3.21 +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ 3.22 +(pKStk) mov r1=sp; /* get sp */ \ 3.23 + ;; \ 3.24 +(pUStk) lfetch.fault.excl.nt1 [r22]; \ 3.25 +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ 3.26 +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ 3.27 + ;; \ 3.28 +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ 3.29 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ 3.30 + ;; \ 3.31 +(pUStk) mov r18=ar.bsp; \ 3.32 +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ 3.33 + 3.34 +#define MINSTATE_END_SAVE_MIN_VIRT \ 3.35 + bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ 3.36 + ;; 3.37 + 3.38 +/* 3.39 + * For mca_asm.S we want to access the stack physically since the state is saved before we 3.40 + * go virtual and don't want to destroy the iip or ipsr. 3.41 + */ 3.42 +#define MINSTATE_START_SAVE_MIN_PHYS \ 3.43 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ 3.44 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ 3.45 +(pKStk) ld8 r3 = [r3];; \ 3.46 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ 3.47 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ 3.48 +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ 3.49 +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ 3.50 + ;; \ 3.51 +(pUStk) mov r24=ar.rnat; \ 3.52 +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ 3.53 +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ 3.54 +(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ 3.55 + ;; \ 3.56 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ 3.57 +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ 3.58 + ;; \ 3.59 +(pUStk) mov r18=ar.bsp; \ 3.60 +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ 3.61 + 3.62 +#define MINSTATE_END_SAVE_MIN_PHYS \ 3.63 + dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ 3.64 + ;; 3.65 + 3.66 +#ifdef MINSTATE_VIRT 3.67 +# define MINSTATE_GET_CURRENT(reg) \ 3.68 + movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\ 3.69 + ld8 reg=[reg] 3.70 +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT 3.71 +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT 3.72 +#endif 3.73 + 3.74 +#ifdef MINSTATE_PHYS 3.75 +# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg 3.76 +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS 3.77 +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS 3.78 +#endif 3.79 + 3.80 +/* 3.81 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves 3.82 + * the minimum state necessary that allows us to turn psr.ic back 3.83 + * on. 
3.84 + * 3.85 + * Assumed state upon entry: 3.86 + * psr.ic: off 3.87 + * r31: contains saved predicates (pr) 3.88 + * 3.89 + * Upon exit, the state is as follows: 3.90 + * psr.ic: off 3.91 + * r2 = points to &pt_regs.r16 3.92 + * r8 = contents of ar.ccv 3.93 + * r9 = contents of ar.csd 3.94 + * r10 = contents of ar.ssd 3.95 + * r11 = FPSR_DEFAULT 3.96 + * r12 = kernel sp (kernel virtual address) 3.97 + * r13 = points to current task_struct (kernel virtual address) 3.98 + * p15 = TRUE if psr.i is set in cr.ipsr 3.99 + * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15: 3.100 + * preserved 3.101 + * 3.102 + * Note that psr.ic is NOT turned on by this macro. This is so that 3.103 + * we can pass interruption state as arguments to a handler. 3.104 + */ 3.105 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ 3.106 + MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ 3.107 + mov r27=ar.rsc; /* M */ \ 3.108 + mov r20=r1; /* A */ \ 3.109 + mov r25=ar.unat; /* M */ \ 3.110 + mov r29=cr.ipsr; /* M */ \ 3.111 + mov r26=ar.pfs; /* I */ \ 3.112 + mov r28=cr.iip; /* M */ \ 3.113 + mov r21=ar.fpsr; /* M */ \ 3.114 + COVER; /* B;; (or nothing) */ \ 3.115 + ;; \ 3.116 + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ 3.117 + ;; \ 3.118 + ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \ 3.119 + st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \ 3.120 + adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \ 3.121 + /* switch from user to kernel RBS: */ \ 3.122 + ;; \ 3.123 + invala; /* M */ \ 3.124 + SAVE_IFS; \ 3.125 + cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \ 3.126 + ;; \ 3.127 + MINSTATE_START_SAVE_MIN \ 3.128 + adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ 3.129 + adds r16=PT(CR_IPSR),r1; \ 3.130 + ;; \ 3.131 + lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \ 3.132 + st8 [r16]=r29; /* save cr.ipsr */ \ 3.133 + ;; \ 3.134 + lfetch.fault.excl.nt1 [r17]; \ 3.135 + tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ 3.136 + mov r29=b0 \ 3.137 + ;; \ 3.138 + adds r16=PT(R8),r1; /* initialize first base pointer */ \ 3.139 + adds r17=PT(R9),r1; /* initialize second base pointer */ \ 3.140 +(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ 3.141 + ;; \ 3.142 +.mem.offset 0,0; st8.spill [r16]=r8,16; \ 3.143 +.mem.offset 8,0; st8.spill [r17]=r9,16; \ 3.144 + ;; \ 3.145 +.mem.offset 0,0; st8.spill [r16]=r10,24; \ 3.146 +.mem.offset 8,0; st8.spill [r17]=r11,24; \ 3.147 + ;; \ 3.148 + st8 [r16]=r28,16; /* save cr.iip */ \ 3.149 + st8 [r17]=r30,16; /* save cr.ifs */ \ 3.150 +(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \ 3.151 + mov r8=ar.ccv; \ 3.152 + mov r9=ar.csd; \ 3.153 + mov r10=ar.ssd; \ 3.154 + movl r11=FPSR_DEFAULT; /* L-unit */ \ 3.155 + ;; \ 3.156 + st8 [r16]=r25,16; /* save ar.unat */ \ 3.157 + st8 [r17]=r26,16; /* save ar.pfs */ \ 3.158 + shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \ 3.159 + ;; \ 3.160 + st8 [r16]=r27,16; /* save ar.rsc */ \ 3.161 +(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \ 3.162 +(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \ 3.163 + ;; /* avoid RAW on r16 & r17 */ \ 3.164 +(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \ 3.165 + st8 [r17]=r31,16; /* save predicates */ \ 3.166 +(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \ 3.167 + ;; \ 3.168 + st8 [r16]=r29,16; /* save b0 */ \ 3.169 + st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \ 3.170 + cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \ 3.171 + ;; \ 3.172 +.mem.offset 0,0; st8.spill [r16]=r20,16; /* save 
original r1 */ \ 3.173 +.mem.offset 8,0; st8.spill [r17]=r12,16; \ 3.174 + adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \ 3.175 + ;; \ 3.176 +.mem.offset 0,0; st8.spill [r16]=r13,16; \ 3.177 +.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ 3.178 + movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; \ 3.179 + ld8 r13=[r13]; /* establish 'current' */ \ 3.180 + ;; \ 3.181 +.mem.offset 0,0; st8.spill [r16]=r15,16; \ 3.182 +.mem.offset 8,0; st8.spill [r17]=r14,16; \ 3.183 + ;; \ 3.184 +.mem.offset 0,0; st8.spill [r16]=r2,16; \ 3.185 +.mem.offset 8,0; st8.spill [r17]=r3,16; \ 3.186 + adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ 3.187 + ;; \ 3.188 + EXTRA; \ 3.189 + movl r1=__gp; /* establish kernel global pointer */ \ 3.190 + ;; \ 3.191 + MINSTATE_END_SAVE_MIN 3.192 + 3.193 +/* 3.194 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on). 3.195 + * 3.196 + * Assumed state upon entry: 3.197 + * psr.ic: on 3.198 + * r2: points to &pt_regs.r16 3.199 + * r3: points to &pt_regs.r17 3.200 + * r8: contents of ar.ccv 3.201 + * r9: contents of ar.csd 3.202 + * r10: contents of ar.ssd 3.203 + * r11: FPSR_DEFAULT 3.204 + * 3.205 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST. 3.206 + */ 3.207 +#define SAVE_REST \ 3.208 +.mem.offset 0,0; st8.spill [r2]=r16,16; \ 3.209 +.mem.offset 8,0; st8.spill [r3]=r17,16; \ 3.210 + ;; \ 3.211 +.mem.offset 0,0; st8.spill [r2]=r18,16; \ 3.212 +.mem.offset 8,0; st8.spill [r3]=r19,16; \ 3.213 + ;; \ 3.214 +.mem.offset 0,0; st8.spill [r2]=r20,16; \ 3.215 +.mem.offset 8,0; st8.spill [r3]=r21,16; \ 3.216 + mov r18=b6; \ 3.217 + ;; \ 3.218 +.mem.offset 0,0; st8.spill [r2]=r22,16; \ 3.219 +.mem.offset 8,0; st8.spill [r3]=r23,16; \ 3.220 + mov r19=b7; \ 3.221 + ;; \ 3.222 +.mem.offset 0,0; st8.spill [r2]=r24,16; \ 3.223 +.mem.offset 8,0; st8.spill [r3]=r25,16; \ 3.224 + ;; \ 3.225 +.mem.offset 0,0; st8.spill [r2]=r26,16; \ 3.226 +.mem.offset 8,0; st8.spill [r3]=r27,16; \ 3.227 + ;; \ 3.228 +.mem.offset 0,0; st8.spill [r2]=r28,16; \ 3.229 +.mem.offset 8,0; st8.spill [r3]=r29,16; \ 3.230 + ;; \ 3.231 +.mem.offset 0,0; st8.spill [r2]=r30,16; \ 3.232 +.mem.offset 8,0; st8.spill [r3]=r31,32; \ 3.233 + ;; \ 3.234 + mov ar.fpsr=r11; /* M-unit */ \ 3.235 + st8 [r2]=r8,8; /* ar.ccv */ \ 3.236 + adds r24=PT(B6)-PT(F7),r3; \ 3.237 + ;; \ 3.238 + stf.spill [r2]=f6,32; \ 3.239 + stf.spill [r3]=f7,32; \ 3.240 + ;; \ 3.241 + stf.spill [r2]=f8,32; \ 3.242 + stf.spill [r3]=f9,32; \ 3.243 + ;; \ 3.244 + stf.spill [r2]=f10; \ 3.245 + stf.spill [r3]=f11; \ 3.246 + adds r25=PT(B7)-PT(F11),r3; \ 3.247 + ;; \ 3.248 + st8 [r24]=r18,16; /* b6 */ \ 3.249 + st8 [r25]=r19,16; /* b7 */ \ 3.250 + ;; \ 3.251 + st8 [r24]=r9; /* ar.csd */ \ 3.252 + st8 [r25]=r10; /* ar.ssd */ \ 3.253 + ;; 3.254 + 3.255 +#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) 3.256 +#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) 3.257 +#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
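For readers tracing the st8/st8.spill pairs in DO_SAVE_MIN (two base pointers, r16/r17, each striding 16 bytes), here is a hedged reconstruction of the region of pt_regs it fills, in store order. Field names are illustrative; the authoritative layout is asm-ia64's pt_regs, and the b6/b7/ar.csd/ar.ssd slots ahead of r8 are filled later by SAVE_REST:

```c
/* Hedged sketch of the pt_regs area DO_SAVE_MIN fills, in store order. */
struct do_save_min_area_sketch {
	unsigned long r8, r9, r10, r11;   /* scratch GRs, spilled first */
	unsigned long cr_ipsr;            /* stored before the spills */
	unsigned long cr_iip, cr_ifs;
	unsigned long ar_unat, ar_pfs;
	unsigned long ar_rsc, ar_rnat;    /* ar_rnat only on user entry */
	unsigned long ar_bspstore, pr;    /* ar_bspstore only on user entry */
	unsigned long b0, loadrs;         /* loadrs = (RSE.ndirty*8) << 16 */
	unsigned long r1, r12;            /* original r1 and old sp */
	unsigned long r13, ar_fpsr;       /* 'current' and fpsr */
	unsigned long r15, r14, r2, r3;   /* remaining scratch GRs */
};
```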
4.1 --- a/xen/arch/ia64/linux-xen/setup.c Tue Aug 30 17:51:51 2005 -0600 4.2 +++ b/xen/arch/ia64/linux-xen/setup.c Wed Aug 31 14:32:27 2005 -0600 4.3 @@ -4,10 +4,15 @@ 4.4 * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co 4.5 * David Mosberger-Tang <davidm@hpl.hp.com> 4.6 * Stephane Eranian <eranian@hpl.hp.com> 4.7 - * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com> 4.8 + * Copyright (C) 2000, 2004 Intel Corp 4.9 + * Rohit Seth <rohit.seth@intel.com> 4.10 + * Suresh Siddha <suresh.b.siddha@intel.com> 4.11 + * Gordon Jin <gordon.jin@intel.com> 4.12 * Copyright (C) 1999 VA Linux Systems 4.13 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> 4.14 * 4.15 + * 12/26/04 S.Siddha, G.Jin, R.Seth 4.16 + * Add multi-threading and multi-core detection 4.17 * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo(). 4.18 * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map 4.19 * 03/31/00 R.Seth cpu_initialized and current->processor fixes 4.20 @@ -15,6 +20,7 @@ 4.21 * 02/01/00 R.Seth fixed get_cpuinfo for SMP 4.22 * 01/07/99 S.Eranian added the support for command line argument 4.23 * 06/24/99 W.Drummond added boot_cpu_data. 4.24 + * 05/28/05 Z. Menyhart Dynamic stride size for "flush_icache_range()" 4.25 */ 4.26 #include <linux/config.h> 4.27 #include <linux/module.h> 4.28 @@ -35,6 +41,10 @@ 4.29 #include <linux/serial_core.h> 4.30 #include <linux/efi.h> 4.31 #include <linux/initrd.h> 4.32 +#ifndef XEN 4.33 +#include <linux/platform.h> 4.34 +#include <linux/pm.h> 4.35 +#endif 4.36 4.37 #include <asm/ia32.h> 4.38 #include <asm/machvec.h> 4.39 @@ -51,8 +61,10 @@ 4.40 #include <asm/smp.h> 4.41 #include <asm/system.h> 4.42 #include <asm/unistd.h> 4.43 +#ifdef XEN 4.44 #include <asm/vmx.h> 4.45 #include <asm/io.h> 4.46 +#endif 4.47 4.48 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) 4.49 # error "struct cpuinfo_ia64 too big!" 4.50 @@ -64,12 +76,16 @@ EXPORT_SYMBOL(__per_cpu_offset); 4.51 #endif 4.52 4.53 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); 4.54 +#ifdef XEN 4.55 DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr); 4.56 +#endif 4.57 DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); 4.58 DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); 4.59 unsigned long ia64_cycles_per_usec; 4.60 struct ia64_boot_param *ia64_boot_param; 4.61 struct screen_info screen_info; 4.62 +unsigned long vga_console_iobase; 4.63 +unsigned long vga_console_membase; 4.64 4.65 unsigned long ia64_max_cacheline_size; 4.66 unsigned long ia64_iobase; /* virtual address for I/O accesses */ 4.67 @@ -78,7 +94,12 @@ struct io_space io_space[MAX_IO_SPACES]; 4.68 EXPORT_SYMBOL(io_space); 4.69 unsigned int num_io_spaces; 4.70 4.71 -unsigned char aux_device_present = 0xaa; /* XXX remove this when legacy I/O is gone */ 4.72 +/* 4.73 + * "flush_icache_range()" needs to know what processor dependent stride size to use 4.74 + * when it makes i-cache(s) coherent with d-caches. 4.75 + */ 4.76 +#define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */ 4.77 +unsigned long ia64_i_cache_stride_shift = ~0; 4.78 4.79 /* 4.80 * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). 
This 4.81 @@ -287,23 +308,25 @@ io_port_init (void) 4.82 static inline int __init 4.83 early_console_setup (char *cmdline) 4.84 { 4.85 + int earlycons = 0; 4.86 + 4.87 #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE 4.88 { 4.89 extern int sn_serial_console_early_setup(void); 4.90 if (!sn_serial_console_early_setup()) 4.91 - return 0; 4.92 + earlycons++; 4.93 } 4.94 #endif 4.95 #ifdef CONFIG_EFI_PCDP 4.96 if (!efi_setup_pcdp_console(cmdline)) 4.97 - return 0; 4.98 + earlycons++; 4.99 #endif 4.100 #ifdef CONFIG_SERIAL_8250_CONSOLE 4.101 if (!early_serial_console_init(cmdline)) 4.102 - return 0; 4.103 + earlycons++; 4.104 #endif 4.105 4.106 - return -1; 4.107 + return (earlycons) ? 0 : -1; 4.108 } 4.109 4.110 static inline void 4.111 @@ -315,7 +338,34 @@ mark_bsp_online (void) 4.112 #endif 4.113 } 4.114 4.115 -void __init 4.116 +#ifdef CONFIG_SMP 4.117 +static void 4.118 +check_for_logical_procs (void) 4.119 +{ 4.120 + pal_logical_to_physical_t info; 4.121 + s64 status; 4.122 + 4.123 + status = ia64_pal_logical_to_phys(0, &info); 4.124 + if (status == -1) { 4.125 + printk(KERN_INFO "No logical to physical processor mapping " 4.126 + "available\n"); 4.127 + return; 4.128 + } 4.129 + if (status) { 4.130 + printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n", 4.131 + status); 4.132 + return; 4.133 + } 4.134 + /* 4.135 + * Total number of siblings that BSP has. Though not all of them 4.136 + * may have booted successfully. The correct number of siblings 4.137 + * booted is in info.overview_num_log. 4.138 + */ 4.139 + smp_num_siblings = info.overview_tpc; 4.140 + smp_num_cpucores = info.overview_cpp; 4.141 +} 4.142 +#endif 4.143 + 4.144 #ifdef XEN 4.145 early_setup_arch (char **cmdline_p) 4.146 #else 4.147 @@ -398,6 +448,19 @@ late_setup_arch (char **cmdline_p) 4.148 4.149 #ifdef CONFIG_SMP 4.150 cpu_physical_id(0) = hard_smp_processor_id(); 4.151 + 4.152 + cpu_set(0, cpu_sibling_map[0]); 4.153 + cpu_set(0, cpu_core_map[0]); 4.154 + 4.155 + check_for_logical_procs(); 4.156 + if (smp_num_cpucores > 1) 4.157 + printk(KERN_INFO 4.158 + "cpu package is Multi-Core capable: number of cores=%d\n", 4.159 + smp_num_cpucores); 4.160 + if (smp_num_siblings > 1) 4.161 + printk(KERN_INFO 4.162 + "cpu package is Multi-Threading capable: number of siblings=%d\n", 4.163 + smp_num_siblings); 4.164 #endif 4.165 4.166 #ifdef XEN 4.167 @@ -505,12 +568,23 @@ show_cpuinfo (struct seq_file *m, void * 4.168 "cpu regs : %u\n" 4.169 "cpu MHz : %lu.%06lu\n" 4.170 "itc MHz : %lu.%06lu\n" 4.171 - "BogoMIPS : %lu.%02lu\n\n", 4.172 + "BogoMIPS : %lu.%02lu\n", 4.173 cpunum, c->vendor, family, c->model, c->revision, c->archrev, 4.174 features, c->ppn, c->number, 4.175 c->proc_freq / 1000000, c->proc_freq % 1000000, 4.176 c->itc_freq / 1000000, c->itc_freq % 1000000, 4.177 lpj*HZ/500000, (lpj*HZ/5000) % 100); 4.178 +#ifdef CONFIG_SMP 4.179 + seq_printf(m, "siblings : %u\n", c->num_log); 4.180 + if (c->threads_per_core > 1 || c->cores_per_socket > 1) 4.181 + seq_printf(m, 4.182 + "physical id: %u\n" 4.183 + "core id : %u\n" 4.184 + "thread id : %u\n", 4.185 + c->socket_id, c->core_id, c->thread_id); 4.186 +#endif 4.187 + seq_printf(m,"\n"); 4.188 + 4.189 return 0; 4.190 } 4.191 4.192 @@ -581,6 +655,14 @@ identify_cpu (struct cpuinfo_ia64 *c) 4.193 memcpy(c->vendor, cpuid.field.vendor, 16); 4.194 #ifdef CONFIG_SMP 4.195 c->cpu = smp_processor_id(); 4.196 + 4.197 + /* below default values will be overwritten by identify_siblings() 4.198 + * for Multi-Threading/Multi-Core capable cpu's 4.199 + */ 4.200 + c->threads_per_core = 
c->cores_per_socket = c->num_log = 1; 4.201 + c->socket_id = -1; 4.202 + 4.203 + identify_siblings(c); 4.204 #endif 4.205 c->ppn = cpuid.field.ppn; 4.206 c->number = cpuid.field.number; 4.207 @@ -611,6 +693,12 @@ setup_per_cpu_areas (void) 4.208 /* start_kernel() requires this... */ 4.209 } 4.210 4.211 +/* 4.212 + * Calculate the max. cache line size. 4.213 + * 4.214 + * In addition, the minimum of the i-cache stride sizes is calculated for 4.215 + * "flush_icache_range()". 4.216 + */ 4.217 static void 4.218 get_max_cacheline_size (void) 4.219 { 4.220 @@ -624,6 +712,8 @@ get_max_cacheline_size (void) 4.221 printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", 4.222 __FUNCTION__, status); 4.223 max = SMP_CACHE_BYTES; 4.224 + /* Safest setup for "flush_icache_range()" */ 4.225 + ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT; 4.226 goto out; 4.227 } 4.228 4.229 @@ -632,14 +722,31 @@ get_max_cacheline_size (void) 4.230 &cci); 4.231 if (status != 0) { 4.232 printk(KERN_ERR 4.233 - "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n", 4.234 + "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n", 4.235 __FUNCTION__, l, status); 4.236 max = SMP_CACHE_BYTES; 4.237 + /* The safest setup for "flush_icache_range()" */ 4.238 + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; 4.239 + cci.pcci_unified = 1; 4.240 } 4.241 line_size = 1 << cci.pcci_line_size; 4.242 if (line_size > max) 4.243 max = line_size; 4.244 - } 4.245 + if (!cci.pcci_unified) { 4.246 + status = ia64_pal_cache_config_info(l, 4.247 + /* cache_type (instruction)= */ 1, 4.248 + &cci); 4.249 + if (status != 0) { 4.250 + printk(KERN_ERR 4.251 + "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n", 4.252 + __FUNCTION__, l, status); 4.253 + /* The safest setup for "flush_icache_range()" */ 4.254 + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; 4.255 + } 4.256 + } 4.257 + if (cci.pcci_stride < ia64_i_cache_stride_shift) 4.258 + ia64_i_cache_stride_shift = cci.pcci_stride; 4.259 + } 4.260 out: 4.261 if (max > ia64_max_cacheline_size) 4.262 ia64_max_cacheline_size = max; 4.263 @@ -700,7 +807,17 @@ cpu_init (void) 4.264 ia64_set_kr(IA64_KR_FPU_OWNER, 0); 4.265 4.266 /* 4.267 - * Initialize default control register to defer all speculative faults. The 4.268 + * Initialize the page-table base register to a global 4.269 + * directory with all zeroes. This ensure that we can handle 4.270 + * TLB-misses to user address-space even before we created the 4.271 + * first user address-space. This may happen, e.g., due to 4.272 + * aggressive use of lfetch.fault. 4.273 + */ 4.274 + ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page))); 4.275 + 4.276 + /* 4.277 + * Initialize default control register to defer speculative faults except 4.278 + * for those arising from TLB misses, which are not deferred. The 4.279 * kernel MUST NOT depend on a particular setting of these bits (in other words, 4.280 * the kernel must have recovery code for all speculative accesses). Turn on 4.281 * dcr.lc as per recommendation by the architecture team. Most IA-32 apps 4.282 @@ -762,6 +879,9 @@ cpu_init (void) 4.283 /* size of physical stacked register partition plus 8 bytes: */ 4.284 __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8; 4.285 platform_cpu_init(); 4.286 +#ifndef XEN 4.287 + pm_idle = default_idle; 4.288 +#endif 4.289 } 4.290 4.291 void
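The new ia64_i_cache_stride_shift computed above is consumed by flush_icache_range(). A minimal sketch of that use, assuming a hypothetical flush_cache_line() helper standing in for the ia64 `fc` instruction (the real loop lives in lib/flush.S):

```c
extern unsigned long ia64_i_cache_stride_shift;
extern void flush_cache_line(unsigned long addr);  /* hypothetical stand-in for `fc` */

void flush_icache_range_sketch(unsigned long start, unsigned long end)
{
	unsigned long stride = 1UL << ia64_i_cache_stride_shift;
	unsigned long addr;

	/* A smaller-than-needed stride only costs extra flushes; a larger
	 * one would skip lines, hence the conservative 32-byte fallback
	 * (I_CACHE_STRIDE_SHIFT) whenever PAL gives no usable answer. */
	for (addr = start & ~(stride - 1); addr < end; addr += stride)
		flush_cache_line(addr);
}
```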
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/xen/arch/ia64/linux-xen/sort.c Wed Aug 31 14:32:27 2005 -0600 5.3 @@ -0,0 +1,122 @@ 5.4 +/* 5.5 + * A fast, small, non-recursive O(nlog n) sort for the Linux kernel 5.6 + * 5.7 + * Jan 23 2005 Matt Mackall <mpm@selenic.com> 5.8 + */ 5.9 + 5.10 +#include <linux/kernel.h> 5.11 +#include <linux/module.h> 5.12 +#ifdef XEN 5.13 +#include <linux/types.h> 5.14 +#endif 5.15 + 5.16 +void u32_swap(void *a, void *b, int size) 5.17 +{ 5.18 + u32 t = *(u32 *)a; 5.19 + *(u32 *)a = *(u32 *)b; 5.20 + *(u32 *)b = t; 5.21 +} 5.22 + 5.23 +void generic_swap(void *a, void *b, int size) 5.24 +{ 5.25 + char t; 5.26 + 5.27 + do { 5.28 + t = *(char *)a; 5.29 + *(char *)a++ = *(char *)b; 5.30 + *(char *)b++ = t; 5.31 + } while (--size > 0); 5.32 +} 5.33 + 5.34 +/* 5.35 + * sort - sort an array of elements 5.36 + * @base: pointer to data to sort 5.37 + * @num: number of elements 5.38 + * @size: size of each element 5.39 + * @cmp: pointer to comparison function 5.40 + * @swap: pointer to swap function or NULL 5.41 + * 5.42 + * This function does a heapsort on the given array. You may provide a 5.43 + * swap function optimized to your element type. 5.44 + * 5.45 + * Sorting time is O(n log n) both on average and worst-case. While 5.46 + * qsort is about 20% faster on average, it suffers from exploitable 5.47 + * O(n*n) worst-case behavior and extra memory requirements that make 5.48 + * it less suitable for kernel use. 5.49 + */ 5.50 + 5.51 +void sort(void *base, size_t num, size_t size, 5.52 + int (*cmp)(const void *, const void *), 5.53 + void (*swap)(void *, void *, int size)) 5.54 +{ 5.55 + /* pre-scale counters for performance */ 5.56 + int i = (num/2) * size, n = num * size, c, r; 5.57 + 5.58 + if (!swap) 5.59 + swap = (size == 4 ? u32_swap : generic_swap); 5.60 + 5.61 + /* heapify */ 5.62 + for ( ; i >= 0; i -= size) { 5.63 + for (r = i; r * 2 < n; r = c) { 5.64 + c = r * 2; 5.65 + if (c < n - size && cmp(base + c, base + c + size) < 0) 5.66 + c += size; 5.67 + if (cmp(base + r, base + c) >= 0) 5.68 + break; 5.69 + swap(base + r, base + c, size); 5.70 + } 5.71 + } 5.72 + 5.73 + /* sort */ 5.74 + for (i = n - size; i >= 0; i -= size) { 5.75 + swap(base, base + i, size); 5.76 + for (r = 0; r * 2 < i; r = c) { 5.77 + c = r * 2; 5.78 + if (c < i - size && cmp(base + c, base + c + size) < 0) 5.79 + c += size; 5.80 + if (cmp(base + r, base + c) >= 0) 5.81 + break; 5.82 + swap(base + r, base + c, size); 5.83 + } 5.84 + } 5.85 +} 5.86 + 5.87 +EXPORT_SYMBOL(sort); 5.88 + 5.89 +#if 0 5.90 +/* a simple boot-time regression test */ 5.91 + 5.92 +int cmpint(const void *a, const void *b) 5.93 +{ 5.94 + return *(int *)a - *(int *)b; 5.95 +} 5.96 + 5.97 +static int sort_test(void) 5.98 +{ 5.99 + int *a, i, r = 1; 5.100 + 5.101 + a = kmalloc(1000 * sizeof(int), GFP_KERNEL); 5.102 + BUG_ON(!a); 5.103 + 5.104 + printk("testing sort()\n"); 5.105 + 5.106 + for (i = 0; i < 1000; i++) { 5.107 + r = (r * 725861) % 6599; 5.108 + a[i] = r; 5.109 + } 5.110 + 5.111 + sort(a, 1000, sizeof(int), cmpint, NULL); 5.112 + 5.113 + for (i = 0; i < 999; i++) 5.114 + if (a[i] > a[i+1]) { 5.115 + printk("sort() failed!\n"); 5.116 + break; 5.117 + } 5.118 + 5.119 + kfree(a); 5.120 + 5.121 + return 0; 5.122 +} 5.123 + 5.124 +module_init(sort_test); 5.125 +#endif
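A minimal caller for the new sort(), shaped like the disabled boot-time test above; passing a NULL swap lets sort() pick u32_swap for 4-byte elements:

```c
#include <stddef.h>

/* comparator in the style of the test's cmpint() */
static int cmpint(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

void sort(void *base, size_t num, size_t size,
	  int (*cmp)(const void *, const void *),
	  void (*swap)(void *, void *, int));

void sort_ints(int *a, size_t n)
{
	sort(a, n, sizeof(int), cmpint, NULL);  /* NULL swap => u32_swap */
}
```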
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/ia64/linux/README.origin	Wed Aug 31 14:32:27 2005 -0600
@@ -0,0 +1,20 @@
+Source files in this directory are identical copies of linux-2.6.13 files:
+
+cmdline.c		-> linux/lib/cmdline.c
+efi_stub.S		-> linux/arch/ia64/efi_stub.S
+extable.c		-> linux/arch/ia64/mm/extable.c
+hpsim.S			-> linux/arch/ia64/hp/sim/hpsim.S
+ia64_ksyms.c		-> linux/arch/ia64/kernel/ia64_ksyms.c
+linuxextable.c		-> linux/kernel/extable.c
+machvec.c		-> linux/arch/ia64/kernel/machvec.c
+patch.c			-> linux/arch/ia64/kernel/patch.c
+pcdp.h			-> drivers/firmware/pcdp.h
+lib/bitop.c		-> linux/arch/ia64/lib/bitop.c
+lib/clear_page.S	-> linux/arch/ia64/lib/clear_page.S
+lib/copy_page_mck.S	-> linux/arch/ia64/lib/copy_page_mck.S
+lib/flush.S		-> linux/arch/ia64/lib/flush.S
+lib/idiv32.S		-> linux/arch/ia64/lib/idiv32.S
+lib/idiv64.S		-> linux/arch/ia64/lib/idiv64.S
+lib/memcpy_mck.S	-> linux/arch/ia64/lib/memcpy_mck.S
+lib/memset.S		-> linux/arch/ia64/lib/memset.S
+lib/strlen.S		-> linux/arch/ia64/lib/strlen.S
7.1 --- a/xen/arch/ia64/linux/extable.c Tue Aug 30 17:51:51 2005 -0600 7.2 +++ b/xen/arch/ia64/linux/extable.c Wed Aug 31 14:32:27 2005 -0600 7.3 @@ -6,29 +6,29 @@ 7.4 */ 7.5 7.6 #include <linux/config.h> 7.7 +#include <linux/sort.h> 7.8 7.9 #include <asm/uaccess.h> 7.10 #include <asm/module.h> 7.11 7.12 -static inline int 7.13 -compare_entries (struct exception_table_entry *l, struct exception_table_entry *r) 7.14 +static int cmp_ex(const void *a, const void *b) 7.15 { 7.16 + const struct exception_table_entry *l = a, *r = b; 7.17 u64 lip = (u64) &l->addr + l->addr; 7.18 u64 rip = (u64) &r->addr + r->addr; 7.19 7.20 + /* avoid overflow */ 7.21 + if (lip > rip) 7.22 + return 1; 7.23 if (lip < rip) 7.24 return -1; 7.25 - if (lip == rip) 7.26 - return 0; 7.27 - else 7.28 - return 1; 7.29 + return 0; 7.30 } 7.31 7.32 -static inline void 7.33 -swap_entries (struct exception_table_entry *l, struct exception_table_entry *r) 7.34 +static void swap_ex(void *a, void *b, int size) 7.35 { 7.36 + struct exception_table_entry *l = a, *r = b, tmp; 7.37 u64 delta = (u64) r - (u64) l; 7.38 - struct exception_table_entry tmp; 7.39 7.40 tmp = *l; 7.41 l->addr = r->addr + delta; 7.42 @@ -38,23 +38,20 @@ swap_entries (struct exception_table_ent 7.43 } 7.44 7.45 /* 7.46 - * Sort the exception table. It's usually already sorted, but there may be unordered 7.47 - * entries due to multiple text sections (such as the .init text section). Note that the 7.48 - * exception-table-entries contain location-relative addresses, which requires a bit of 7.49 - * care during sorting to avoid overflows in the offset members (e.g., it would not be 7.50 - * safe to make a temporary copy of an exception-table entry on the stack, because the 7.51 - * stack may be more than 2GB away from the exception-table). 7.52 + * Sort the exception table. It's usually already sorted, but there 7.53 + * may be unordered entries due to multiple text sections (such as the 7.54 + * .init text section). Note that the exception-table-entries contain 7.55 + * location-relative addresses, which requires a bit of care during 7.56 + * sorting to avoid overflows in the offset members (e.g., it would 7.57 + * not be safe to make a temporary copy of an exception-table entry on 7.58 + * the stack, because the stack may be more than 2GB away from the 7.59 + * exception-table). 7.60 */ 7.61 -void 7.62 -sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish) 7.63 +void sort_extable (struct exception_table_entry *start, 7.64 + struct exception_table_entry *finish) 7.65 { 7.66 - struct exception_table_entry *p, *q; 7.67 - 7.68 - /* insertion sort */ 7.69 - for (p = start + 1; p < finish; ++p) 7.70 - /* start .. p-1 is sorted; push p down to it's proper place */ 7.71 - for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q) 7.72 - swap_entries(&q[0], &q[-1]); 7.73 + sort(start, finish - start, sizeof(struct exception_table_entry), 7.74 + cmp_ex, swap_ex); 7.75 } 7.76 7.77 const struct exception_table_entry *
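The rewritten comparator returns via explicit compares rather than `lip - rip` because the entries hold location-relative addresses: the u64 difference can exceed the range of the int return type and flip the reported sign. A hedged sketch of the overflow-safe idiom in isolation:

```c
#include <stdint.h>

static int cmp_safe(uint64_t lip, uint64_t rip)
{
	/* "return lip - rip" truncated to int can report the wrong sign
	 * once the difference exceeds the int range; explicit compares
	 * cannot overflow. */
	if (lip > rip)
		return 1;
	if (lip < rip)
		return -1;
	return 0;
}
```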
--- a/xen/arch/ia64/linux/ia64_ksyms.c	Tue Aug 30 17:51:51 2005 -0600
+++ b/xen/arch/ia64/linux/ia64_ksyms.c	Wed Aug 31 14:32:27 2005 -0600
@@ -58,9 +58,6 @@ EXPORT_SYMBOL(__strlen_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(__strnlen_user);
 
-#include <asm/unistd.h>
-EXPORT_SYMBOL(__ia64_syscall);
-
 /* from arch/ia64/lib */
 extern void __divsi3(void);
 extern void __udivsi3(void);
10.1 --- a/xen/arch/ia64/linux/lib/Makefile Tue Aug 30 17:51:51 2005 -0600 10.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 10.3 @@ -1,44 +0,0 @@ 10.4 -# 10.5 -# Makefile for ia64-specific library routines.. 10.6 -# 10.7 - 10.8 -include $(BASEDIR)/Rules.mk 10.9 - 10.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ 10.11 - __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ 10.12 - bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ 10.13 - clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ 10.14 - flush.o ip_fast_csum.o do_csum.o copy_user.o \ 10.15 - memset.o strlen.o memcpy.o 10.16 - 10.17 -default: $(OBJS) 10.18 - $(LD) -r -o ia64lib.o $(OBJS) 10.19 - 10.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__ 10.21 - 10.22 -__divdi3.o: idiv64.S 10.23 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< 10.24 - 10.25 -__udivdi3.o: idiv64.S 10.26 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< 10.27 - 10.28 -__moddi3.o: idiv64.S 10.29 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< 10.30 - 10.31 -__umoddi3.o: idiv64.S 10.32 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< 10.33 - 10.34 -__divsi3.o: idiv32.S 10.35 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< 10.36 - 10.37 -__udivsi3.o: idiv32.S 10.38 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< 10.39 - 10.40 -__modsi3.o: idiv32.S 10.41 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< 10.42 - 10.43 -__umodsi3.o: idiv32.S 10.44 - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< 10.45 - 10.46 -clean: 10.47 - rm -f *.o *~
11.1 --- a/xen/arch/ia64/linux/lib/carta_random.S Tue Aug 30 17:51:51 2005 -0600 11.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 11.3 @@ -1,54 +0,0 @@ 11.4 -/* 11.5 - * Fast, simple, yet decent quality random number generator based on 11.6 - * a paper by David G. Carta ("Two Fast Implementations of the 11.7 - * `Minimal Standard' Random Number Generator," Communications of the 11.8 - * ACM, January, 1990). 11.9 - * 11.10 - * Copyright (C) 2002 Hewlett-Packard Co 11.11 - * David Mosberger-Tang <davidm@hpl.hp.com> 11.12 - */ 11.13 - 11.14 -#include <asm/asmmacro.h> 11.15 - 11.16 -#define a r2 11.17 -#define m r3 11.18 -#define lo r8 11.19 -#define hi r9 11.20 -#define t0 r16 11.21 -#define t1 r17 11.22 -#define seed r32 11.23 - 11.24 -GLOBAL_ENTRY(carta_random32) 11.25 - movl a = (16807 << 16) | 16807 11.26 - ;; 11.27 - pmpyshr2.u t0 = a, seed, 0 11.28 - pmpyshr2.u t1 = a, seed, 16 11.29 - ;; 11.30 - unpack2.l t0 = t1, t0 11.31 - dep m = -1, r0, 0, 31 11.32 - ;; 11.33 - zxt4 lo = t0 11.34 - shr.u hi = t0, 32 11.35 - ;; 11.36 - dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff) 11.37 - ;; 11.38 - shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16 11.39 - shr t1 = hi, 15 // t1 = (hi >> 15) 11.40 - ;; 11.41 - add lo = lo, t0 11.42 - ;; 11.43 - cmp.gtu p6, p0 = lo, m 11.44 - ;; 11.45 -(p6) and lo = lo, m 11.46 - ;; 11.47 -(p6) add lo = 1, lo 11.48 - ;; 11.49 - add lo = lo, t1 11.50 - ;; 11.51 - cmp.gtu p6, p0 = lo, m 11.52 - ;; 11.53 -(p6) and lo = lo, m 11.54 - ;; 11.55 -(p6) add lo = 1, lo 11.56 - br.ret.sptk.many rp 11.57 -END(carta_random32)
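The deleted routine implements the Park-Miller "minimal standard" recurrence, seed' = (16807 * seed) mod (2^31 - 1), using Carta's carry-folding trick instead of a division. A hedged C sketch of the same arithmetic (the assembly reaches the identical result via parallel 16-bit multiplies):

```c
static unsigned int carta_random32_sketch(unsigned int seed)
{
	unsigned long long p = 16807ULL * seed;   /* at most a 46-bit product */
	unsigned int lo = p & 0x7fffffff;         /* low 31 bits */
	unsigned int hi = p >> 31;                /* high 15 bits */
	unsigned int r  = lo + hi;                /* 2^31 == 1 (mod 2^31 - 1) */

	if (r > 0x7fffffff)                       /* fold the final carry */
		r = (r & 0x7fffffff) + 1;
	return r;
}
```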
12.1 --- a/xen/arch/ia64/linux/lib/checksum.c Tue Aug 30 17:51:51 2005 -0600 12.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 12.3 @@ -1,102 +0,0 @@ 12.4 -/* 12.5 - * Network checksum routines 12.6 - * 12.7 - * Copyright (C) 1999, 2003 Hewlett-Packard Co 12.8 - * Stephane Eranian <eranian@hpl.hp.com> 12.9 - * 12.10 - * Most of the code coming from arch/alpha/lib/checksum.c 12.11 - * 12.12 - * This file contains network checksum routines that are better done 12.13 - * in an architecture-specific manner due to speed.. 12.14 - */ 12.15 - 12.16 -#include <linux/module.h> 12.17 -#include <linux/string.h> 12.18 - 12.19 -#include <asm/byteorder.h> 12.20 - 12.21 -static inline unsigned short 12.22 -from64to16 (unsigned long x) 12.23 -{ 12.24 - /* add up 32-bit words for 33 bits */ 12.25 - x = (x & 0xffffffff) + (x >> 32); 12.26 - /* add up 16-bit and 17-bit words for 17+c bits */ 12.27 - x = (x & 0xffff) + (x >> 16); 12.28 - /* add up 16-bit and 2-bit for 16+c bit */ 12.29 - x = (x & 0xffff) + (x >> 16); 12.30 - /* add up carry.. */ 12.31 - x = (x & 0xffff) + (x >> 16); 12.32 - return x; 12.33 -} 12.34 - 12.35 -/* 12.36 - * computes the checksum of the TCP/UDP pseudo-header 12.37 - * returns a 16-bit checksum, already complemented. 12.38 - */ 12.39 -unsigned short int 12.40 -csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len, 12.41 - unsigned short proto, unsigned int sum) 12.42 -{ 12.43 - return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) + 12.44 - ((unsigned long) proto << 8)); 12.45 -} 12.46 - 12.47 -EXPORT_SYMBOL(csum_tcpudp_magic); 12.48 - 12.49 -unsigned int 12.50 -csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, 12.51 - unsigned short proto, unsigned int sum) 12.52 -{ 12.53 - unsigned long result; 12.54 - 12.55 - result = (saddr + daddr + sum + 12.56 - ((unsigned long) ntohs(len) << 16) + 12.57 - ((unsigned long) proto << 8)); 12.58 - 12.59 - /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ 12.60 - /* 64 to 33 */ 12.61 - result = (result & 0xffffffff) + (result >> 32); 12.62 - /* 33 to 32 */ 12.63 - result = (result & 0xffffffff) + (result >> 32); 12.64 - return result; 12.65 -} 12.66 - 12.67 -extern unsigned long do_csum (const unsigned char *, long); 12.68 - 12.69 -/* 12.70 - * computes the checksum of a memory block at buff, length len, 12.71 - * and adds in "sum" (32-bit) 12.72 - * 12.73 - * returns a 32-bit number suitable for feeding into itself 12.74 - * or csum_tcpudp_magic 12.75 - * 12.76 - * this function must be called with even lengths, except 12.77 - * for the last fragment, which may be odd 12.78 - * 12.79 - * it's best to have buff aligned on a 32-bit boundary 12.80 - */ 12.81 -unsigned int 12.82 -csum_partial (const unsigned char * buff, int len, unsigned int sum) 12.83 -{ 12.84 - unsigned long result = do_csum(buff, len); 12.85 - 12.86 - /* add in old sum, and carry.. */ 12.87 - result += sum; 12.88 - /* 32+c bits -> 32 bits */ 12.89 - result = (result & 0xffffffff) + (result >> 32); 12.90 - return result; 12.91 -} 12.92 - 12.93 -EXPORT_SYMBOL(csum_partial); 12.94 - 12.95 -/* 12.96 - * this routine is used for miscellaneous IP-like checksums, mainly 12.97 - * in icmp.c 12.98 - */ 12.99 -unsigned short 12.100 -ip_compute_csum (unsigned char * buff, int len) 12.101 -{ 12.102 - return ~do_csum(buff,len); 12.103 -} 12.104 - 12.105 -EXPORT_SYMBOL(ip_compute_csum);
13.1 --- a/xen/arch/ia64/linux/lib/clear_user.S Tue Aug 30 17:51:51 2005 -0600 13.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 13.3 @@ -1,209 +0,0 @@ 13.4 -/* 13.5 - * This routine clears to zero a linear memory buffer in user space. 13.6 - * 13.7 - * Inputs: 13.8 - * in0: address of buffer 13.9 - * in1: length of buffer in bytes 13.10 - * Outputs: 13.11 - * r8: number of bytes that didn't get cleared due to a fault 13.12 - * 13.13 - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co 13.14 - * Stephane Eranian <eranian@hpl.hp.com> 13.15 - */ 13.16 - 13.17 -#include <asm/asmmacro.h> 13.18 - 13.19 -// 13.20 -// arguments 13.21 -// 13.22 -#define buf r32 13.23 -#define len r33 13.24 - 13.25 -// 13.26 -// local registers 13.27 -// 13.28 -#define cnt r16 13.29 -#define buf2 r17 13.30 -#define saved_lc r18 13.31 -#define saved_pfs r19 13.32 -#define tmp r20 13.33 -#define len2 r21 13.34 -#define len3 r22 13.35 - 13.36 -// 13.37 -// Theory of operations: 13.38 -// - we check whether or not the buffer is small, i.e., less than 17 13.39 -// in which case we do the byte by byte loop. 13.40 -// 13.41 -// - Otherwise we go progressively from 1 byte store to 8byte store in 13.42 -// the head part, the body is a 16byte store loop and we finish we the 13.43 -// tail for the last 15 bytes. 13.44 -// The good point about this breakdown is that the long buffer handling 13.45 -// contains only 2 branches. 13.46 -// 13.47 -// The reason for not using shifting & masking for both the head and the 13.48 -// tail is to stay semantically correct. This routine is not supposed 13.49 -// to write bytes outside of the buffer. While most of the time this would 13.50 -// be ok, we can't tolerate a mistake. A classical example is the case 13.51 -// of multithreaded code were to the extra bytes touched is actually owned 13.52 -// by another thread which runs concurrently to ours. Another, less likely, 13.53 -// example is with device drivers where reading an I/O mapped location may 13.54 -// have side effects (same thing for writing). 13.55 -// 13.56 - 13.57 -GLOBAL_ENTRY(__do_clear_user) 13.58 - .prologue 13.59 - .save ar.pfs, saved_pfs 13.60 - alloc saved_pfs=ar.pfs,2,0,0,0 13.61 - cmp.eq p6,p0=r0,len // check for zero length 13.62 - .save ar.lc, saved_lc 13.63 - mov saved_lc=ar.lc // preserve ar.lc (slow) 13.64 - .body 13.65 - ;; // avoid WAW on CFM 13.66 - adds tmp=-1,len // br.ctop is repeat/until 13.67 - mov ret0=len // return value is length at this point 13.68 -(p6) br.ret.spnt.many rp 13.69 - ;; 13.70 - cmp.lt p6,p0=16,len // if len > 16 then long memset 13.71 - mov ar.lc=tmp // initialize lc for small count 13.72 -(p6) br.cond.dptk .long_do_clear 13.73 - ;; // WAR on ar.lc 13.74 - // 13.75 - // worst case 16 iterations, avg 8 iterations 13.76 - // 13.77 - // We could have played with the predicates to use the extra 13.78 - // M slot for 2 stores/iteration but the cost the initialization 13.79 - // the various counters compared to how long the loop is supposed 13.80 - // to last on average does not make this solution viable. 
13.81 - // 13.82 -1: 13.83 - EX( .Lexit1, st1 [buf]=r0,1 ) 13.84 - adds len=-1,len // countdown length using len 13.85 - br.cloop.dptk 1b 13.86 - ;; // avoid RAW on ar.lc 13.87 - // 13.88 - // .Lexit4: comes from byte by byte loop 13.89 - // len contains bytes left 13.90 -.Lexit1: 13.91 - mov ret0=len // faster than using ar.lc 13.92 - mov ar.lc=saved_lc 13.93 - br.ret.sptk.many rp // end of short clear_user 13.94 - 13.95 - 13.96 - // 13.97 - // At this point we know we have more than 16 bytes to copy 13.98 - // so we focus on alignment (no branches required) 13.99 - // 13.100 - // The use of len/len2 for countdown of the number of bytes left 13.101 - // instead of ret0 is due to the fact that the exception code 13.102 - // changes the values of r8. 13.103 - // 13.104 -.long_do_clear: 13.105 - tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) 13.106 - ;; 13.107 - EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned 13.108 -(p6) adds len=-1,len;; // sync because buf is modified 13.109 - tbit.nz p6,p0=buf,1 13.110 - ;; 13.111 - EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned 13.112 -(p6) adds len=-2,len;; 13.113 - tbit.nz p6,p0=buf,2 13.114 - ;; 13.115 - EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned 13.116 -(p6) adds len=-4,len;; 13.117 - tbit.nz p6,p0=buf,3 13.118 - ;; 13.119 - EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned 13.120 -(p6) adds len=-8,len;; 13.121 - shr.u cnt=len,4 // number of 128-bit (2x64bit) words 13.122 - ;; 13.123 - cmp.eq p6,p0=r0,cnt 13.124 - adds tmp=-1,cnt 13.125 -(p6) br.cond.dpnt .dotail // we have less than 16 bytes left 13.126 - ;; 13.127 - adds buf2=8,buf // setup second base pointer 13.128 - mov ar.lc=tmp 13.129 - ;; 13.130 - 13.131 - // 13.132 - // 16bytes/iteration core loop 13.133 - // 13.134 - // The second store can never generate a fault because 13.135 - // we come into the loop only when we are 16-byte aligned. 13.136 - // This means that if we cross a page then it will always be 13.137 - // in the first store and never in the second. 13.138 - // 13.139 - // 13.140 - // We need to keep track of the remaining length. A possible (optimistic) 13.141 - // way would be to use ar.lc and derive how many byte were left by 13.142 - // doing : left= 16*ar.lc + 16. this would avoid the addition at 13.143 - // every iteration. 13.144 - // However we need to keep the synchronization point. A template 13.145 - // M;;MB does not exist and thus we can keep the addition at no 13.146 - // extra cycle cost (use a nop slot anyway). It also simplifies the 13.147 - // (unlikely) error recovery code 13.148 - // 13.149 - 13.150 -2: EX(.Lexit3, st8 [buf]=r0,16 ) 13.151 - ;; // needed to get len correct when error 13.152 - st8 [buf2]=r0,16 13.153 - adds len=-16,len 13.154 - br.cloop.dptk 2b 13.155 - ;; 13.156 - mov ar.lc=saved_lc 13.157 - // 13.158 - // tail correction based on len only 13.159 - // 13.160 - // We alternate the use of len3,len2 to allow parallelism and correct 13.161 - // error handling. We also reuse p6/p7 to return correct value. 13.162 - // The addition of len2/len3 does not cost anything more compared to 13.163 - // the regular memset as we had empty slots. 
13.164 - // 13.165 -.dotail: 13.166 - mov len2=len // for parallelization of error handling 13.167 - mov len3=len 13.168 - tbit.nz p6,p0=len,3 13.169 - ;; 13.170 - EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes 13.171 -(p6) adds len3=-8,len2 13.172 - tbit.nz p7,p6=len,2 13.173 - ;; 13.174 - EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes 13.175 -(p7) adds len2=-4,len3 13.176 - tbit.nz p6,p7=len,1 13.177 - ;; 13.178 - EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes 13.179 -(p6) adds len3=-2,len2 13.180 - tbit.nz p7,p6=len,0 13.181 - ;; 13.182 - EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left 13.183 - mov ret0=r0 // success 13.184 - br.ret.sptk.many rp // end of most likely path 13.185 - 13.186 - // 13.187 - // Outlined error handling code 13.188 - // 13.189 - 13.190 - // 13.191 - // .Lexit3: comes from core loop, need restore pr/lc 13.192 - // len contains bytes left 13.193 - // 13.194 - // 13.195 - // .Lexit2: 13.196 - // if p6 -> coming from st8 or st2 : len2 contains what's left 13.197 - // if p7 -> coming from st4 or st1 : len3 contains what's left 13.198 - // We must restore lc/pr even though might not have been used. 13.199 -.Lexit2: 13.200 - .pred.rel "mutex", p6, p7 13.201 -(p6) mov len=len2 13.202 -(p7) mov len=len3 13.203 - ;; 13.204 - // 13.205 - // .Lexit4: comes from head, need not restore pr/lc 13.206 - // len contains bytes left 13.207 - // 13.208 -.Lexit3: 13.209 - mov ret0=len 13.210 - mov ar.lc=saved_lc 13.211 - br.ret.sptk.many rp 13.212 -END(__do_clear_user)
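The "theory of operations" comment above describes a head/body/tail breakdown for buffers longer than 16 bytes. A hedged C rendering of that plan, with the EX() fault-handling machinery omitted:

```c
#include <stddef.h>
#include <stdint.h>

void clear_user_sketch(char *buf, size_t len)
{
	if (len <= 16) {                  /* short case: byte-by-byte loop */
		while (len--)
			*buf++ = 0;
		return;
	}
	/* head: progress 1 -> 2 -> 4 -> 8 bytes up to 8-byte alignment,
	 * mirroring the tbit.nz ladder (never touches bytes outside buf) */
	if ((uintptr_t)buf & 1) { *buf = 0;              buf += 1; len -= 1; }
	if ((uintptr_t)buf & 2) { *(uint16_t *)buf = 0;  buf += 2; len -= 2; }
	if ((uintptr_t)buf & 4) { *(uint32_t *)buf = 0;  buf += 4; len -= 4; }
	if ((uintptr_t)buf & 8) { *(uint64_t *)buf = 0;  buf += 8; len -= 8; }
	/* body: 16 bytes per iteration (the buf/buf2 pair in the assembly) */
	while (len >= 16) {
		((uint64_t *)buf)[0] = 0;
		((uint64_t *)buf)[1] = 0;
		buf += 16; len -= 16;
	}
	/* tail: at most 15 bytes left */
	if (len & 8) { *(uint64_t *)buf = 0; buf += 8; }
	if (len & 4) { *(uint32_t *)buf = 0; buf += 4; }
	if (len & 2) { *(uint16_t *)buf = 0; buf += 2; }
	if (len & 1) { *buf = 0; }
}
```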
14.1 --- a/xen/arch/ia64/linux/lib/copy_page.S Tue Aug 30 17:51:51 2005 -0600 14.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 14.3 @@ -1,98 +0,0 @@ 14.4 -/* 14.5 - * 14.6 - * Optimized version of the standard copy_page() function 14.7 - * 14.8 - * Inputs: 14.9 - * in0: address of target page 14.10 - * in1: address of source page 14.11 - * Output: 14.12 - * no return value 14.13 - * 14.14 - * Copyright (C) 1999, 2001 Hewlett-Packard Co 14.15 - * Stephane Eranian <eranian@hpl.hp.com> 14.16 - * David Mosberger <davidm@hpl.hp.com> 14.17 - * 14.18 - * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies. 14.19 - */ 14.20 -#include <asm/asmmacro.h> 14.21 -#include <asm/page.h> 14.22 - 14.23 -#define PIPE_DEPTH 3 14.24 -#define EPI p[PIPE_DEPTH-1] 14.25 - 14.26 -#define lcount r16 14.27 -#define saved_pr r17 14.28 -#define saved_lc r18 14.29 -#define saved_pfs r19 14.30 -#define src1 r20 14.31 -#define src2 r21 14.32 -#define tgt1 r22 14.33 -#define tgt2 r23 14.34 -#define srcf r24 14.35 -#define tgtf r25 14.36 -#define tgt_last r26 14.37 - 14.38 -#define Nrot ((8*PIPE_DEPTH+7)&~7) 14.39 - 14.40 -GLOBAL_ENTRY(copy_page) 14.41 - .prologue 14.42 - .save ar.pfs, saved_pfs 14.43 - alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot 14.44 - 14.45 - .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \ 14.46 - t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH] 14.47 - .rotp p[PIPE_DEPTH] 14.48 - 14.49 - .save ar.lc, saved_lc 14.50 - mov saved_lc=ar.lc 14.51 - mov ar.ec=PIPE_DEPTH 14.52 - 14.53 - mov lcount=PAGE_SIZE/64-1 14.54 - .save pr, saved_pr 14.55 - mov saved_pr=pr 14.56 - mov pr.rot=1<<16 14.57 - 14.58 - .body 14.59 - 14.60 - mov src1=in1 14.61 - adds src2=8,in1 14.62 - mov tgt_last = PAGE_SIZE 14.63 - ;; 14.64 - adds tgt2=8,in0 14.65 - add srcf=512,in1 14.66 - mov ar.lc=lcount 14.67 - mov tgt1=in0 14.68 - add tgtf=512,in0 14.69 - add tgt_last = tgt_last, in0 14.70 - ;; 14.71 -1: 14.72 -(p[0]) ld8 t1[0]=[src1],16 14.73 -(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16 14.74 -(p[0]) ld8 t2[0]=[src2],16 14.75 -(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16 14.76 - cmp.ltu p6,p0 = tgtf, tgt_last 14.77 - ;; 14.78 -(p[0]) ld8 t3[0]=[src1],16 14.79 -(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16 14.80 -(p[0]) ld8 t4[0]=[src2],16 14.81 -(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16 14.82 - ;; 14.83 -(p[0]) ld8 t5[0]=[src1],16 14.84 -(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16 14.85 -(p[0]) ld8 t6[0]=[src2],16 14.86 -(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16 14.87 - ;; 14.88 -(p[0]) ld8 t7[0]=[src1],16 14.89 -(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16 14.90 -(p[0]) ld8 t8[0]=[src2],16 14.91 -(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16 14.92 - 14.93 -(p6) lfetch [srcf], 64 14.94 -(p6) lfetch [tgtf], 64 14.95 - br.ctop.sptk.few 1b 14.96 - ;; 14.97 - mov pr=saved_pr,0xffffffffffff0000 // restore predicates 14.98 - mov ar.pfs=saved_pfs 14.99 - mov ar.lc=saved_lc 14.100 - br.ret.sptk.many rp 14.101 -END(copy_page)
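Stripped of the rotating registers and lfetch hints, the deleted loop is two interleaved 8-byte streams moving 64 bytes per iteration. A hedged C equivalent (the PAGE_SIZE value here is an assumption):

```c
#include <stdint.h>
#define PAGE_SIZE 16384  /* assumption: 16KB ia64 page */

void copy_page_sketch(void *to, void *from)
{
	uint64_t *t1 = to,     *s1 = from;       /* even 8-byte slots */
	uint64_t *t2 = t1 + 1, *s2 = s1 + 1;     /* odd 8-byte slots */
	long i;

	for (i = 0; i < PAGE_SIZE / 64; i++) {   /* 64 bytes per iteration */
		t1[0] = s1[0]; t2[0] = s2[0];
		t1[2] = s1[2]; t2[2] = s2[2];
		t1[4] = s1[4]; t2[4] = s2[4];
		t1[6] = s1[6]; t2[6] = s2[6];
		t1 += 8; s1 += 8; t2 += 8; s2 += 8;
	}
}
```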
15.1 --- a/xen/arch/ia64/linux/lib/copy_user.S Tue Aug 30 17:51:51 2005 -0600 15.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 15.3 @@ -1,610 +0,0 @@ 15.4 -/* 15.5 - * 15.6 - * Optimized version of the copy_user() routine. 15.7 - * It is used to copy date across the kernel/user boundary. 15.8 - * 15.9 - * The source and destination are always on opposite side of 15.10 - * the boundary. When reading from user space we must catch 15.11 - * faults on loads. When writing to user space we must catch 15.12 - * errors on stores. Note that because of the nature of the copy 15.13 - * we don't need to worry about overlapping regions. 15.14 - * 15.15 - * 15.16 - * Inputs: 15.17 - * in0 address of source buffer 15.18 - * in1 address of destination buffer 15.19 - * in2 number of bytes to copy 15.20 - * 15.21 - * Outputs: 15.22 - * ret0 0 in case of success. The number of bytes NOT copied in 15.23 - * case of error. 15.24 - * 15.25 - * Copyright (C) 2000-2001 Hewlett-Packard Co 15.26 - * Stephane Eranian <eranian@hpl.hp.com> 15.27 - * 15.28 - * Fixme: 15.29 - * - handle the case where we have more than 16 bytes and the alignment 15.30 - * are different. 15.31 - * - more benchmarking 15.32 - * - fix extraneous stop bit introduced by the EX() macro. 15.33 - */ 15.34 - 15.35 -#include <asm/asmmacro.h> 15.36 - 15.37 -// 15.38 -// Tuneable parameters 15.39 -// 15.40 -#define COPY_BREAK 16 // we do byte copy below (must be >=16) 15.41 -#define PIPE_DEPTH 21 // pipe depth 15.42 - 15.43 -#define EPI p[PIPE_DEPTH-1] 15.44 - 15.45 -// 15.46 -// arguments 15.47 -// 15.48 -#define dst in0 15.49 -#define src in1 15.50 -#define len in2 15.51 - 15.52 -// 15.53 -// local registers 15.54 -// 15.55 -#define t1 r2 // rshift in bytes 15.56 -#define t2 r3 // lshift in bytes 15.57 -#define rshift r14 // right shift in bits 15.58 -#define lshift r15 // left shift in bits 15.59 -#define word1 r16 15.60 -#define word2 r17 15.61 -#define cnt r18 15.62 -#define len2 r19 15.63 -#define saved_lc r20 15.64 -#define saved_pr r21 15.65 -#define tmp r22 15.66 -#define val r23 15.67 -#define src1 r24 15.68 -#define dst1 r25 15.69 -#define src2 r26 15.70 -#define dst2 r27 15.71 -#define len1 r28 15.72 -#define enddst r29 15.73 -#define endsrc r30 15.74 -#define saved_pfs r31 15.75 - 15.76 -GLOBAL_ENTRY(__copy_user) 15.77 - .prologue 15.78 - .save ar.pfs, saved_pfs 15.79 - alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) 15.80 - 15.81 - .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] 15.82 - .rotp p[PIPE_DEPTH] 15.83 - 15.84 - adds len2=-1,len // br.ctop is repeat/until 15.85 - mov ret0=r0 15.86 - 15.87 - ;; // RAW of cfm when len=0 15.88 - cmp.eq p8,p0=r0,len // check for zero length 15.89 - .save ar.lc, saved_lc 15.90 - mov saved_lc=ar.lc // preserve ar.lc (slow) 15.91 -(p8) br.ret.spnt.many rp // empty mempcy() 15.92 - ;; 15.93 - add enddst=dst,len // first byte after end of source 15.94 - add endsrc=src,len // first byte after end of destination 15.95 - .save pr, saved_pr 15.96 - mov saved_pr=pr // preserve predicates 15.97 - 15.98 - .body 15.99 - 15.100 - mov dst1=dst // copy because of rotation 15.101 - mov ar.ec=PIPE_DEPTH 15.102 - mov pr.rot=1<<16 // p16=true all others are false 15.103 - 15.104 - mov src1=src // copy because of rotation 15.105 - mov ar.lc=len2 // initialize lc for small count 15.106 - cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy 15.107 - 15.108 - xor tmp=src,dst // same alignment test prepare 15.109 -(p10) br.cond.dptk .long_copy_user 15.110 - ;; // RAW pr.rot/p16 ? 
15.111 - // 15.112 - // Now we do the byte by byte loop with software pipeline 15.113 - // 15.114 - // p7 is necessarily false by now 15.115 -1: 15.116 - EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) 15.117 - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) 15.118 - br.ctop.dptk.few 1b 15.119 - ;; 15.120 - mov ar.lc=saved_lc 15.121 - mov pr=saved_pr,0xffffffffffff0000 15.122 - mov ar.pfs=saved_pfs // restore ar.ec 15.123 - br.ret.sptk.many rp // end of short memcpy 15.124 - 15.125 - // 15.126 - // Not 8-byte aligned 15.127 - // 15.128 -.diff_align_copy_user: 15.129 - // At this point we know we have more than 16 bytes to copy 15.130 - // and also that src and dest do _not_ have the same alignment. 15.131 - and src2=0x7,src1 // src offset 15.132 - and dst2=0x7,dst1 // dst offset 15.133 - ;; 15.134 - // The basic idea is that we copy byte-by-byte at the head so 15.135 - // that we can reach 8-byte alignment for both src1 and dst1. 15.136 - // Then copy the body using software pipelined 8-byte copy, 15.137 - // shifting the two back-to-back words right and left, then copy 15.138 - // the tail by copying byte-by-byte. 15.139 - // 15.140 - // Fault handling. If the byte-by-byte at the head fails on the 15.141 - // load, then restart and finish the pipleline by copying zeros 15.142 - // to the dst1. Then copy zeros for the rest of dst1. 15.143 - // If 8-byte software pipeline fails on the load, do the same as 15.144 - // failure_in3 does. If the byte-by-byte at the tail fails, it is 15.145 - // handled simply by failure_in_pipe1. 15.146 - // 15.147 - // The case p14 represents the source has more bytes in the 15.148 - // the first word (by the shifted part), whereas the p15 needs to 15.149 - // copy some bytes from the 2nd word of the source that has the 15.150 - // tail of the 1st of the destination. 15.151 - // 15.152 - 15.153 - // 15.154 - // Optimization. If dst1 is 8-byte aligned (quite common), we don't need 15.155 - // to copy the head to dst1, to start 8-byte copy software pipeline. 15.156 - // We know src1 is not 8-byte aligned in this case. 15.157 - // 15.158 - cmp.eq p14,p15=r0,dst2 15.159 -(p15) br.cond.spnt 1f 15.160 - ;; 15.161 - sub t1=8,src2 15.162 - mov t2=src2 15.163 - ;; 15.164 - shl rshift=t2,3 15.165 - sub len1=len,t1 // set len1 15.166 - ;; 15.167 - sub lshift=64,rshift 15.168 - ;; 15.169 - br.cond.spnt .word_copy_user 15.170 - ;; 15.171 -1: 15.172 - cmp.leu p14,p15=src2,dst2 15.173 - sub t1=dst2,src2 15.174 - ;; 15.175 - .pred.rel "mutex", p14, p15 15.176 -(p14) sub word1=8,src2 // (8 - src offset) 15.177 -(p15) sub t1=r0,t1 // absolute value 15.178 -(p15) sub word1=8,dst2 // (8 - dst offset) 15.179 - ;; 15.180 - // For the case p14, we don't need to copy the shifted part to 15.181 - // the 1st word of destination. 
15.182 - sub t2=8,t1 15.183 -(p14) sub word1=word1,t1 15.184 - ;; 15.185 - sub len1=len,word1 // resulting len 15.186 -(p15) shl rshift=t1,3 // in bits 15.187 -(p14) shl rshift=t2,3 15.188 - ;; 15.189 -(p14) sub len1=len1,t1 15.190 - adds cnt=-1,word1 15.191 - ;; 15.192 - sub lshift=64,rshift 15.193 - mov ar.ec=PIPE_DEPTH 15.194 - mov pr.rot=1<<16 // p16=true all others are false 15.195 - mov ar.lc=cnt 15.196 - ;; 15.197 -2: 15.198 - EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) 15.199 - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) 15.200 - br.ctop.dptk.few 2b 15.201 - ;; 15.202 - clrrrb 15.203 - ;; 15.204 -.word_copy_user: 15.205 - cmp.gtu p9,p0=16,len1 15.206 -(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy 15.207 - ;; 15.208 - shr.u cnt=len1,3 // number of 64-bit words 15.209 - ;; 15.210 - adds cnt=-1,cnt 15.211 - ;; 15.212 - .pred.rel "mutex", p14, p15 15.213 -(p14) sub src1=src1,t2 15.214 -(p15) sub src1=src1,t1 15.215 - // 15.216 - // Now both src1 and dst1 point to an 8-byte aligned address. And 15.217 - // we have more than 8 bytes to copy. 15.218 - // 15.219 - mov ar.lc=cnt 15.220 - mov ar.ec=PIPE_DEPTH 15.221 - mov pr.rot=1<<16 // p16=true all others are false 15.222 - ;; 15.223 -3: 15.224 - // 15.225 - // The pipeline consists of 3 stages: 15.226 - // 1 (p16): Load a word from src1 15.227 - // 2 (EPI_1): Shift right pair, saving to tmp 15.228 - // 3 (EPI): Store tmp to dst1 15.229 - // 15.230 - // To make it simple, use at least 2 (p16) loops to set up val1[n] 15.231 - // because we need 2 back-to-back val1[] to get tmp. 15.232 - // Note that this implies EPI_2 must be p18 or greater. 15.233 - // 15.234 - 15.235 -#define EPI_1 p[PIPE_DEPTH-2] 15.236 -#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift 15.237 -#define CASE(pred, shift) \ 15.238 - (pred) br.cond.spnt .copy_user_bit##shift 15.239 -#define BODY(rshift) \ 15.240 -.copy_user_bit##rshift: \ 15.241 -1: \ 15.242 - EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \ 15.243 -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ 15.244 - EX(3f,(p16) ld8 val1[1]=[src1],8); \ 15.245 -(p16) mov val1[0]=r0; \ 15.246 - br.ctop.dptk 1b; \ 15.247 - ;; \ 15.248 - br.cond.sptk.many .diff_align_do_tail; \ 15.249 -2: \ 15.250 -(EPI) st8 [dst1]=tmp,8; \ 15.251 -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ 15.252 -3: \ 15.253 -(p16) mov val1[1]=r0; \ 15.254 -(p16) mov val1[0]=r0; \ 15.255 - br.ctop.dptk 2b; \ 15.256 - ;; \ 15.257 - br.cond.sptk.many .failure_in2 15.258 - 15.259 - // 15.260 - // Since the instruction 'shrp' requires a fixed (immediate) shift 15.261 - // count, we need to provide 7 cases 15.262 - // below. 15.263 - // 15.264 - SWITCH(p6, 8) 15.265 - SWITCH(p7, 16) 15.266 - SWITCH(p8, 24) 15.267 - SWITCH(p9, 32) 15.268 - SWITCH(p10, 40) 15.269 - SWITCH(p11, 48) 15.270 - SWITCH(p12, 56) 15.271 - ;; 15.272 - CASE(p6, 8) 15.273 - CASE(p7, 16) 15.274 - CASE(p8, 24) 15.275 - CASE(p9, 32) 15.276 - CASE(p10, 40) 15.277 - CASE(p11, 48) 15.278 - CASE(p12, 56) 15.279 - ;; 15.280 - BODY(8) 15.281 - BODY(16) 15.282 - BODY(24) 15.283 - BODY(32) 15.284 - BODY(40) 15.285 - BODY(48) 15.286 - BODY(56) 15.287 - ;; 15.288 -.diff_align_do_tail: 15.289 - .pred.rel "mutex", p14, p15 15.290 -(p14) sub src1=src1,t1 15.291 -(p14) adds dst1=-8,dst1 15.292 -(p15) sub dst1=dst1,t1 15.293 - ;; 15.294 -4: 15.295 - // Tail correction.
15.296 - // 15.297 - // The problem with this pipelined loop is that the last word is not 15.298 - // loaded and thus part of the last word written is not correct. 15.299 - // To fix that, we simply copy the tail byte by byte. 15.300 - 15.301 - sub len1=endsrc,src1,1 15.302 - clrrrb 15.303 - ;; 15.304 - mov ar.ec=PIPE_DEPTH 15.305 - mov pr.rot=1<<16 // p16=true all others are false 15.306 - mov ar.lc=len1 15.307 - ;; 15.308 -5: 15.309 - EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) 15.310 - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) 15.311 - br.ctop.dptk.few 5b 15.312 - ;; 15.313 - mov ar.lc=saved_lc 15.314 - mov pr=saved_pr,0xffffffffffff0000 15.315 - mov ar.pfs=saved_pfs 15.316 - br.ret.sptk.many rp 15.317 - 15.318 - // 15.319 - // Beginning of long memcpy (i.e. > 16 bytes) 15.320 - // 15.321 -.long_copy_user: 15.322 - tbit.nz p6,p7=src1,0 // odd alignment 15.323 - and tmp=7,tmp 15.324 - ;; 15.325 - cmp.eq p10,p8=r0,tmp 15.326 - mov len1=len // copy because of rotation 15.327 -(p8) br.cond.dpnt .diff_align_copy_user 15.328 - ;; 15.329 - // At this point we know we have more than 16 bytes to copy 15.330 - // and also that both src and dest have the same alignment 15.331 - // which may not be the one we want. So for now we must move 15.332 - // forward slowly until we reach 16byte alignment: no need to 15.333 - // worry about reaching the end of the buffer. 15.334 - // 15.335 - EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned 15.336 -(p6) adds len1=-1,len1;; 15.337 - tbit.nz p7,p0=src1,1 15.338 - ;; 15.339 - EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned 15.340 -(p7) adds len1=-2,len1;; 15.341 - tbit.nz p8,p0=src1,2 15.342 - ;; 15.343 - // 15.344 - // Stop bit not required after ld4 because if we fail on ld4 15.345 - // we have never executed the ld1, therefore st1 is not executed. 15.346 - // 15.347 - EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned 15.348 - ;; 15.349 - EX(.failure_out,(p6) st1 [dst1]=val1[0],1) 15.350 - tbit.nz p9,p0=src1,3 15.351 - ;; 15.352 - // 15.353 - // Stop bit not required after ld8 because if we fail on ld8 15.354 - // we have never executed the ld2, therefore st2 is not executed. 15.355 - // 15.356 - EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned 15.357 - EX(.failure_out,(p7) st2 [dst1]=val1[1],2) 15.358 -(p8) adds len1=-4,len1 15.359 - ;; 15.360 - EX(.failure_out, (p8) st4 [dst1]=val2[0],4) 15.361 -(p9) adds len1=-8,len1;; 15.362 - shr.u cnt=len1,4 // number of 128-bit (2x64bit) words 15.363 - ;; 15.364 - EX(.failure_out, (p9) st8 [dst1]=val2[1],8) 15.365 - tbit.nz p6,p0=len1,3 15.366 - cmp.eq p7,p0=r0,cnt 15.367 - adds tmp=-1,cnt // br.ctop is repeat/until 15.368 -(p7) br.cond.dpnt .dotail // we have less than 16 bytes left 15.369 - ;; 15.370 - adds src2=8,src1 15.371 - adds dst2=8,dst1 15.372 - mov ar.lc=tmp 15.373 - ;; 15.374 - // 15.375 - // 16bytes/iteration 15.376 - // 15.377 -2: 15.378 - EX(.failure_in3,(p16) ld8 val1[0]=[src1],16) 15.379 -(p16) ld8 val2[0]=[src2],16 15.380 - 15.381 - EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16) 15.382 -(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 15.383 - br.ctop.dptk 2b 15.384 - ;; // RAW on src1 when fall through from loop 15.385 - // 15.386 - // Tail correction based on len only 15.387 - // 15.388 - // No matter where we come from (loop or test) the src1 pointer 15.389 - // is 16 byte aligned AND we have less than 16 bytes to copy.
15.390 - // 15.391 -.dotail: 15.392 - EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes 15.393 - tbit.nz p7,p0=len1,2 15.394 - ;; 15.395 - EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes 15.396 - tbit.nz p8,p0=len1,1 15.397 - ;; 15.398 - EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes 15.399 - tbit.nz p9,p0=len1,0 15.400 - ;; 15.401 - EX(.failure_out, (p6) st8 [dst1]=val1[0],8) 15.402 - ;; 15.403 - EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left 15.404 - mov ar.lc=saved_lc 15.405 - ;; 15.406 - EX(.failure_out,(p7) st4 [dst1]=val1[1],4) 15.407 - mov pr=saved_pr,0xffffffffffff0000 15.408 - ;; 15.409 - EX(.failure_out, (p8) st2 [dst1]=val2[0],2) 15.410 - mov ar.pfs=saved_pfs 15.411 - ;; 15.412 - EX(.failure_out, (p9) st1 [dst1]=val2[1]) 15.413 - br.ret.sptk.many rp 15.414 - 15.415 - 15.416 - // 15.417 - // Here we handle the case where the byte by byte copy fails 15.418 - // on the load. 15.419 - // Several factors make the zeroing of the rest of the buffer kind of 15.420 - // tricky: 15.421 - // - the pipeline: loads/stores are not in sync (pipeline) 15.422 - // 15.423 - // In the same loop iteration, the dst1 pointer does not directly 15.424 - // reflect where the faulty load was. 15.425 - // 15.426 - // - pipeline effect 15.427 - // When you get a fault on load, you may have valid data from 15.428 - // previous loads, not yet stored, still in transit. Such data must be 15.429 - // stored normally before moving on to zeroing the rest. 15.430 - // 15.431 - // - single/multi dispersal independence. 15.432 - // 15.433 - // solution: 15.434 - // - we don't disrupt the pipeline, i.e. data in transit in 15.435 - // the software pipeline will eventually be moved to memory. 15.436 - // We simply replace the load with a simple mov and keep the 15.437 - // pipeline going. We can't really do this inline because 15.438 - // p16 is always reset to 1 when lc > 0. 15.439 - // 15.440 -.failure_in_pipe1: 15.441 - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 15.442 -1: 15.443 -(p16) mov val1[0]=r0 15.444 -(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 15.445 - br.ctop.dptk 1b 15.446 - ;; 15.447 - mov pr=saved_pr,0xffffffffffff0000 15.448 - mov ar.lc=saved_lc 15.449 - mov ar.pfs=saved_pfs 15.450 - br.ret.sptk.many rp 15.451 - 15.452 - // 15.453 - // This is the case where the byte by byte copy fails on the load 15.454 - // when we copy the head. We need to finish the pipeline and copy 15.455 - // zeros for the rest of the destination. Since this happens 15.456 - // at the top we still need to fill the body and tail. 15.457 -.failure_in_pipe2: 15.458 - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 15.459 -2: 15.460 -(p16) mov val1[0]=r0 15.461 -(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 15.462 - br.ctop.dptk 2b 15.463 - ;; 15.464 - sub len=enddst,dst1,1 // precompute len 15.465 - br.cond.dptk.many .failure_in1bis 15.466 - ;; 15.467 - 15.468 - // 15.469 - // Here we handle the head & tail part when we check for alignment. 15.470 - // The following code handles only the load failures. The 15.471 - // main difficulty comes from the fact that loads/stores are 15.472 - // scheduled. So when you fail on a load, the stores corresponding 15.473 - // to previous successful loads must be executed. 15.474 - // 15.475 - // However some simplifications are possible given the way 15.476 - // things work.
15.477 - // 15.478 - // 1) HEAD 15.479 - // Theory of operation: 15.480 - // 15.481 - // Page A | Page B 15.482 - // ---------|----- 15.483 - // 1|8 x 15.484 - // 1 2|8 x 15.485 - // 4|8 x 15.486 - // 1 4|8 x 15.487 - // 2 4|8 x 15.488 - // 1 2 4|8 x 15.489 - // |1 15.490 - // |2 x 15.491 - // |4 x 15.492 - // 15.493 - // page_size >= 4k (2^12). (x means 4, 2, 1) 15.494 - // Here we suppose Page A exists and Page B does not. 15.495 - // 15.496 - // As we move towards eight byte alignment we may encounter faults. 15.497 - // The numbers on each page show the size of the load (current alignment). 15.498 - // 15.499 - // Key point: 15.500 - // - if you fail on 1, 2, 4 then you have never executed any smaller 15.501 - // size loads, e.g. failing ld4 means no ld1 nor ld2 executed 15.502 - // before. 15.503 - // 15.504 - // This allows us to simplify the cleanup code, because basically you 15.505 - // only have to worry about "pending" stores in the case of a failing 15.506 - // ld8(). Given the way the code is written today, this means only 15.507 - // worry about st2, st4. There we can use the information encapsulated 15.508 - // into the predicates. 15.509 - // 15.510 - // Other key point: 15.511 - // - if you fail on the ld8 in the head, it means you went straight 15.512 - // to it, i.e. 8byte alignment within a nonexistent page. 15.513 - // Again this comes from the fact that if you crossed just for the ld8 then 15.514 - // you are 8byte aligned but also 16byte aligned, therefore you would 15.515 - // either go for the 16byte copy loop OR the ld8 in the tail part. 15.516 - // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible 15.517 - // because it would mean you had 15bytes to copy in which case you 15.518 - // would have defaulted to the byte by byte copy. 15.519 - // 15.520 - // 15.521 - // 2) TAIL 15.522 - // Here we know we have less than 16 bytes AND we are either 8 or 16 byte 15.523 - // aligned. 15.524 - // 15.525 - // Key point: 15.526 - // This means that we either: 15.527 - // - are right on a page boundary 15.528 - // OR 15.529 - // - are at more than 16 bytes from a page boundary with 15.530 - // at most 15 bytes to copy: no chance of crossing. 15.531 - // 15.532 - // This allows us to assume that if we fail on a load we haven't possibly 15.533 - // executed any of the previous (tail) ones, so we don't need to do 15.534 - // any stores. For instance, if we fail on ld2, this means we had 15.535 - // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4. 15.536 - // 15.537 - // This means that we are in a situation similar to a fault in the 15.538 - // head part. That's nice! 15.539 - // 15.540 -.failure_in1: 15.541 - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 15.542 - sub len=endsrc,src1,1 15.543 - // 15.544 - // we know that ret0 can never be zero at this point 15.545 - // because we failed while trying to do a load, i.e. there is still 15.546 - // some work to do. 15.547 - // The failure_in1bis and length problem is taken care of at the 15.548 - // calling side. 15.549 - // 15.550 - ;; 15.551 -.failure_in1bis: // from (.failure_in3) 15.552 - mov ar.lc=len // Continue with a stupid byte store.
15.553 - ;; 15.554 -5: 15.555 - st1 [dst1]=r0,1 15.556 - br.cloop.dptk 5b 15.557 - ;; 15.558 - mov pr=saved_pr,0xffffffffffff0000 15.559 - mov ar.lc=saved_lc 15.560 - mov ar.pfs=saved_pfs 15.561 - br.ret.sptk.many rp 15.562 - 15.563 - // 15.564 - // Here we simply restart the loop but instead 15.565 - // of doing loads we fill the pipeline with zeroes 15.566 - // We can't simply store r0 because we may have valid 15.567 - // data in transit in the pipeline. 15.568 - // ar.lc and ar.ec are setup correctly at this point 15.569 - // 15.570 - // we MUST use src1/endsrc here and not dst1/enddst because 15.571 - // of the pipeline effect. 15.572 - // 15.573 -.failure_in3: 15.574 - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 15.575 - ;; 15.576 -2: 15.577 -(p16) mov val1[0]=r0 15.578 -(p16) mov val2[0]=r0 15.579 -(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16 15.580 -(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 15.581 - br.ctop.dptk 2b 15.582 - ;; 15.583 - cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? 15.584 - sub len=enddst,dst1,1 // precompute len 15.585 -(p6) br.cond.dptk .failure_in1bis 15.586 - ;; 15.587 - mov pr=saved_pr,0xffffffffffff0000 15.588 - mov ar.lc=saved_lc 15.589 - mov ar.pfs=saved_pfs 15.590 - br.ret.sptk.many rp 15.591 - 15.592 -.failure_in2: 15.593 - sub ret0=endsrc,src1 15.594 - cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? 15.595 - sub len=enddst,dst1,1 // precompute len 15.596 -(p6) br.cond.dptk .failure_in1bis 15.597 - ;; 15.598 - mov pr=saved_pr,0xffffffffffff0000 15.599 - mov ar.lc=saved_lc 15.600 - mov ar.pfs=saved_pfs 15.601 - br.ret.sptk.many rp 15.602 - 15.603 - // 15.604 - // handling of failures on stores: that's the easy part 15.605 - // 15.606 -.failure_out: 15.607 - sub ret0=enddst,dst1 15.608 - mov pr=saved_pr,0xffffffffffff0000 15.609 - mov ar.lc=saved_lc 15.610 - 15.611 - mov ar.pfs=saved_pfs 15.612 - br.ret.sptk.many rp 15.613 -END(__copy_user)
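The fault-handling contract spelled out in the comments above is easier to see in C. Below is a minimal model of __copy_user's semantics only (return value and zero-fill behavior), not of its software pipelining; copy_byte_checked() is a hypothetical stand-in for an EX()-protected user-space load, not a kernel API.

    #include <string.h>

    /* Stand-in for a fault-checked 1-byte load: would return -1 on a fault. */
    static int copy_byte_checked(const char *p)
    {
            return (unsigned char)*p;       /* stub: never faults here */
    }

    unsigned long model_copy_user(char *dst, const char *src, unsigned long len)
    {
            unsigned long done;

            for (done = 0; done < len; done++) {
                    int c = copy_byte_checked(src + done);
                    if (c < 0)              /* fault while reading user memory */
                            break;
                    dst[done] = (char)c;
            }
            if (done < len)                 /* zero-fill the unwritten tail of dst */
                    memset(dst + done, 0, len - done);
            return len - done;              /* 0 on success, bytes NOT copied on error */
    }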
16.1 --- a/xen/arch/ia64/linux/lib/csum_partial_copy.c Tue Aug 30 17:51:51 2005 -0600 16.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 16.3 @@ -1,151 +0,0 @@ 16.4 -/* 16.5 - * Network Checksum & Copy routine 16.6 - * 16.7 - * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co 16.8 - * Stephane Eranian <eranian@hpl.hp.com> 16.9 - * 16.10 - * Most of the code has been imported from Linux/Alpha 16.11 - */ 16.12 - 16.13 -#include <linux/module.h> 16.14 -#include <linux/types.h> 16.15 -#include <linux/string.h> 16.16 - 16.17 -#include <asm/uaccess.h> 16.18 - 16.19 -/* 16.20 - * XXX Fixme: those 2 inlines are meant for debugging and will go away 16.21 - */ 16.22 -static inline unsigned 16.23 -short from64to16(unsigned long x) 16.24 -{ 16.25 - /* add up 32-bit words for 33 bits */ 16.26 - x = (x & 0xffffffff) + (x >> 32); 16.27 - /* add up 16-bit and 17-bit words for 17+c bits */ 16.28 - x = (x & 0xffff) + (x >> 16); 16.29 - /* add up 16-bit and 2-bit for 16+c bit */ 16.30 - x = (x & 0xffff) + (x >> 16); 16.31 - /* add up carry.. */ 16.32 - x = (x & 0xffff) + (x >> 16); 16.33 - return x; 16.34 -} 16.35 - 16.36 -static inline 16.37 -unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum) 16.38 -{ 16.39 - int odd, count; 16.40 - unsigned long result = (unsigned long)psum; 16.41 - 16.42 - if (len <= 0) 16.43 - goto out; 16.44 - odd = 1 & (unsigned long) buff; 16.45 - if (odd) { 16.46 - result = *buff << 8; 16.47 - len--; 16.48 - buff++; 16.49 - } 16.50 - count = len >> 1; /* nr of 16-bit words.. */ 16.51 - if (count) { 16.52 - if (2 & (unsigned long) buff) { 16.53 - result += *(unsigned short *) buff; 16.54 - count--; 16.55 - len -= 2; 16.56 - buff += 2; 16.57 - } 16.58 - count >>= 1; /* nr of 32-bit words.. */ 16.59 - if (count) { 16.60 - if (4 & (unsigned long) buff) { 16.61 - result += *(unsigned int *) buff; 16.62 - count--; 16.63 - len -= 4; 16.64 - buff += 4; 16.65 - } 16.66 - count >>= 1; /* nr of 64-bit words.. */ 16.67 - if (count) { 16.68 - unsigned long carry = 0; 16.69 - do { 16.70 - unsigned long w = *(unsigned long *) buff; 16.71 - count--; 16.72 - buff += 8; 16.73 - result += carry; 16.74 - result += w; 16.75 - carry = (w > result); 16.76 - } while (count); 16.77 - result += carry; 16.78 - result = (result & 0xffffffff) + (result >> 32); 16.79 - } 16.80 - if (len & 4) { 16.81 - result += *(unsigned int *) buff; 16.82 - buff += 4; 16.83 - } 16.84 - } 16.85 - if (len & 2) { 16.86 - result += *(unsigned short *) buff; 16.87 - buff += 2; 16.88 - } 16.89 - } 16.90 - if (len & 1) 16.91 - result += *buff; 16.92 - 16.93 - result = from64to16(result); 16.94 - 16.95 - if (odd) 16.96 - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); 16.97 - 16.98 -out: 16.99 - return result; 16.100 -} 16.101 - 16.102 -/* 16.103 - * XXX Fixme 16.104 - * 16.105 - * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. 16.106 - * But it's very tricky to get right even in C. 16.107 - */ 16.108 -extern unsigned long do_csum(const unsigned char *, long); 16.109 - 16.110 -static unsigned int 16.111 -do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, 16.112 - int len, unsigned int psum, int *errp) 16.113 -{ 16.114 - unsigned long result; 16.115 - 16.116 - /* XXX Fixme 16.117 - * for now we separate the copy from checksum for obvious 16.118 - * alignment difficulties. Look at the Alpha code and you'll be 16.119 - * scared. 
16.120 - */ 16.121 - 16.122 - if (__copy_from_user(dst, src, len) != 0 && errp) 16.123 - *errp = -EFAULT; 16.124 - 16.125 - result = do_csum(dst, len); 16.126 - 16.127 - /* add in old sum, and carry.. */ 16.128 - result += psum; 16.129 - /* 32+c bits -> 32 bits */ 16.130 - result = (result & 0xffffffff) + (result >> 32); 16.131 - return result; 16.132 -} 16.133 - 16.134 -unsigned int 16.135 -csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, 16.136 - int len, unsigned int sum, int *errp) 16.137 -{ 16.138 - if (!access_ok(VERIFY_READ, src, len)) { 16.139 - *errp = -EFAULT; 16.140 - memset(dst, 0, len); 16.141 - return sum; 16.142 - } 16.143 - 16.144 - return do_csum_partial_copy_from_user(src, dst, len, sum, errp); 16.145 -} 16.146 - 16.147 -unsigned int 16.148 -csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst, 16.149 - int len, unsigned int sum) 16.150 -{ 16.151 - return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); 16.152 -} 16.153 - 16.154 -EXPORT_SYMBOL(csum_partial_copy_nocheck);
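The from64to16() helper above is the heart of this file: each step adds the upper half of the accumulator into the lower half, so no carry is ever lost. A self-contained copy with a tiny driver, assuming a 64-bit unsigned long:

    #include <stdio.h>

    static unsigned short from64to16(unsigned long x)
    {
            x = (x & 0xffffffff) + (x >> 32);       /* 64 -> 33 bits */
            x = (x & 0xffff) + (x >> 16);           /* 33 -> 17 bits + carry */
            x = (x & 0xffff) + (x >> 16);           /* fold the carry back in */
            x = (x & 0xffff) + (x >> 16);           /* and any final carry */
            return (unsigned short)x;
    }

    int main(void)
    {
            printf("%#x\n", from64to16(0x1ffffUL)); /* 0xffff + 1 folds to 0x1 */
            return 0;
    }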
17.1 --- a/xen/arch/ia64/linux/lib/dec_and_lock.c Tue Aug 30 17:51:51 2005 -0600 17.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 17.3 @@ -1,42 +0,0 @@ 17.4 -/* 17.5 - * Copyright (C) 2003 Jerome Marchand, Bull S.A. 17.6 - * Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com> 17.7 - * 17.8 - * This file is released under the GPLv2, or at your option any later version. 17.9 - * 17.10 - * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction. This 17.11 - * code is an adaptation of the x86 version of "atomic_dec_and_lock()". 17.12 - */ 17.13 - 17.14 -#include <linux/compiler.h> 17.15 -#include <linux/module.h> 17.16 -#include <linux/spinlock.h> 17.17 -#include <asm/atomic.h> 17.18 - 17.19 -/* 17.20 - * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these 17.21 - * operations have to be done atomically, so that the count doesn't drop to zero without 17.22 - * acquiring the spinlock first. 17.23 - */ 17.24 -int 17.25 -_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock) 17.26 -{ 17.27 - int old, new; 17.28 - 17.29 - do { 17.30 - old = atomic_read(refcount); 17.31 - new = old - 1; 17.32 - 17.33 - if (unlikely (old == 1)) { 17.34 - /* oops, we may be decrementing to zero, do it the slow way... */ 17.35 - spin_lock(lock); 17.36 - if (atomic_dec_and_test(refcount)) 17.37 - return 1; 17.38 - spin_unlock(lock); 17.39 - return 0; 17.40 - } 17.41 - } while (cmpxchg(&refcount->counter, old, new) != old); 17.42 - return 0; 17.43 -} 17.44 - 17.45 -EXPORT_SYMBOL(_atomic_dec_and_lock);
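The same decrement-or-lock protocol can be sketched in portable C11, with a pthread mutex standing in for the spinlock. This is an illustrative analogue, not the kernel code: it returns 1 with the lock held iff the count dropped to zero.

    #include <stdatomic.h>
    #include <pthread.h>

    int atomic_dec_and_lock_model(atomic_int *refcount, pthread_mutex_t *lock)
    {
            int old = atomic_load(refcount);

            while (old != 1) {
                    /* fast path: count stays positive, no lock traffic */
                    if (atomic_compare_exchange_weak(refcount, &old, old - 1))
                            return 0;
                    /* a failed CAS reloaded "old"; loop and retry */
            }
            /* slow path: we may be dropping the last reference */
            pthread_mutex_lock(lock);
            if (atomic_fetch_sub(refcount, 1) == 1)
                    return 1;               /* count hit zero; caller unlocks */
            pthread_mutex_unlock(lock);
            return 0;
    }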
18.1 --- a/xen/arch/ia64/linux/lib/do_csum.S Tue Aug 30 17:51:51 2005 -0600 18.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 18.3 @@ -1,323 +0,0 @@ 18.4 -/* 18.5 - * 18.6 - * Optimized version of the standard do_csum() function 18.7 - * 18.8 - * Return: a 64bit quantity containing the 16bit Internet checksum 18.9 - * 18.10 - * Inputs: 18.11 - * in0: address of buffer to checksum (char *) 18.12 - * in1: length of the buffer (int) 18.13 - * 18.14 - * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co 18.15 - * Stephane Eranian <eranian@hpl.hp.com> 18.16 - * 18.17 - * 02/04/22 Ken Chen <kenneth.w.chen@intel.com> 18.18 - * Data locality study on the checksum buffer. 18.19 - * More optimization cleanup - remove excessive stop bits. 18.20 - * 02/04/08 David Mosberger <davidm@hpl.hp.com> 18.21 - * More cleanup and tuning. 18.22 - * 01/04/18 Jun Nakajima <jun.nakajima@intel.com> 18.23 - * Clean up and optimize the software pipeline, loading two 18.24 - * back-to-back 8-byte words per loop. Clean up the initialization 18.25 - * for the loop. Support the cases where load latency = 1 or 2. 18.26 - * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default). 18.27 - */ 18.28 - 18.29 -#include <asm/asmmacro.h> 18.30 - 18.31 -// 18.32 -// Theory of operations: 18.33 -// The goal is to go as quickly as possible to the point where 18.34 -// we can checksum 16 bytes/loop. Before reaching that point we must 18.35 -// take care of incorrect alignment of first byte. 18.36 -// 18.37 -// The code hereafter also takes care of the "tail" part of the buffer 18.38 -// before entering the core loop, if any. The checksum is a sum so it 18.39 -// allows us to commute operations. So we do the "head" and "tail" 18.40 -// first to finish at full speed in the body. Once we get the head and 18.41 -// tail values, we feed them into the pipeline, very handy initialization. 18.42 -// 18.43 -// Of course we deal with the special case where the whole buffer fits 18.44 -// into one 8 byte word. In this case we have only one entry in the pipeline. 18.45 -// 18.46 -// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for 18.47 -// possible load latency and also to accommodate the head and tail. 18.48 -// 18.49 -// The end of the function deals with folding the checksum from 64bits 18.50 -// down to 16bits taking care of the carry. 18.51 -// 18.52 -// This version avoids synchronization in the core loop by also using a 18.53 -// pipeline for the accumulation of the checksum in resultx[] (x=1,2). 18.54 -// 18.55 -// wordx[] (x=1,2) 18.56 -// |---| 18.57 -// | | 0 : new value loaded in pipeline 18.58 -// |---| 18.59 -// | | - : in transit data 18.60 -// |---| 18.61 -// | | LOAD_LATENCY : current value to add to checksum 18.62 -// |---| 18.63 -// | | LOAD_LATENCY+1 : previous value added to checksum 18.64 -// |---| (previous iteration) 18.65 -// 18.66 -// resultx[] (x=1,2) 18.67 -// |---| 18.68 -// | | 0 : initial value 18.69 -// |---| 18.70 -// | | LOAD_LATENCY-1 : new checksum 18.71 -// |---| 18.72 -// | | LOAD_LATENCY : previous value of checksum 18.73 -// |---| 18.74 -// | | LOAD_LATENCY+1 : final checksum when out of the loop 18.75 -// |---| 18.76 -// 18.77 -// 18.78 -// See RFC1071 "Computing the Internet Checksum" for various techniques for 18.79 -// calculating the Internet checksum.
18.80 -// 18.81 -// NOT YET DONE: 18.82 -// - Maybe another algorithm which would take care of the folding at the 18.83 -// end in a different manner 18.84 -// - Work with people more knowledgeable than me on the network stack 18.85 -// to figure out if we could not split the function depending on the 18.86 -// type of packet or alignment we get. Like the ip_fast_csum() routine 18.87 -// where we know we have at least 20bytes worth of data to checksum. 18.88 -// - Do a better job of handling small packets. 18.89 -// - Note on prefetching: it was found that under various loads, i.e. ftp read/write, 18.90 -// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8% 18.91 -// on the data that buffer points to (partly because the checksum is often preceded by 18.92 -// a copy_from_user()). This finding indicates that lfetch will not be beneficial since 18.93 -// the data is already in the cache. 18.94 -// 18.95 - 18.96 -#define saved_pfs r11 18.97 -#define hmask r16 18.98 -#define tmask r17 18.99 -#define first1 r18 18.100 -#define firstval r19 18.101 -#define firstoff r20 18.102 -#define last r21 18.103 -#define lastval r22 18.104 -#define lastoff r23 18.105 -#define saved_lc r24 18.106 -#define saved_pr r25 18.107 -#define tmp1 r26 18.108 -#define tmp2 r27 18.109 -#define tmp3 r28 18.110 -#define carry1 r29 18.111 -#define carry2 r30 18.112 -#define first2 r31 18.113 - 18.114 -#define buf in0 18.115 -#define len in1 18.116 - 18.117 -#define LOAD_LATENCY 2 // XXX fix me 18.118 - 18.119 -#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2) 18.120 -# error "Only 1 or 2 is supported/tested for LOAD_LATENCY." 18.121 -#endif 18.122 - 18.123 -#define PIPE_DEPTH (LOAD_LATENCY+2) 18.124 -#define ELD p[LOAD_LATENCY] // end of load 18.125 -#define ELD_1 p[LOAD_LATENCY+1] // and next stage 18.126 - 18.127 -// unsigned long do_csum(unsigned char *buf,long len) 18.128 - 18.129 -GLOBAL_ENTRY(do_csum) 18.130 - .prologue 18.131 - .save ar.pfs, saved_pfs 18.132 - alloc saved_pfs=ar.pfs,2,16,0,16 18.133 - .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] 18.134 - .rotp p[PIPE_DEPTH], pC1[2], pC2[2] 18.135 - mov ret0=r0 // in case we have zero length 18.136 - cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len) 18.137 - ;; 18.138 - add tmp1=buf,len // last byte's address 18.139 - .save pr, saved_pr 18.140 - mov saved_pr=pr // preserve predicates (rotation) 18.141 -(p6) br.ret.spnt.many rp // return if zero or negative length 18.142 - 18.143 - mov hmask=-1 // initialize head mask 18.144 - tbit.nz p15,p0=buf,0 // is buf an odd address? 18.145 - and first1=-8,buf // 8-byte align down address of first1 element 18.146 - 18.147 - and firstoff=7,buf // how many bytes off for first1 element 18.148 - mov tmask=-1 // initialize tail mask 18.149 - 18.150 - ;; 18.151 - adds tmp2=-1,tmp1 // last-1 18.152 - and lastoff=7,tmp1 // how many bytes off for last element 18.153 - ;; 18.154 - sub tmp1=8,lastoff // complement to lastoff 18.155 - and last=-8,tmp2 // address of word containing last byte 18.156 - ;; 18.157 - sub tmp3=last,first1 // tmp3=distance from first1 to last 18.158 - .save ar.lc, saved_lc 18.159 - mov saved_lc=ar.lc // save lc 18.160 - cmp.eq p8,p9=last,first1 // everything fits in one word ?
18.161 - 18.162 - ld8 firstval=[first1],8 // load, ahead of time, "first1" word 18.163 - and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 18.164 - shl tmp2=firstoff,3 // number of bits 18.165 - ;; 18.166 -(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed 18.167 - shl tmp1=tmp1,3 // number of bits 18.168 -(p9) adds tmp3=-8,tmp3 // effectively loaded 18.169 - ;; 18.170 -(p8) mov lastval=r0 // we don't need lastval if first1==last 18.171 - shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[ 18.172 - shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] 18.173 - ;; 18.174 - .body 18.175 -#define count tmp3 18.176 - 18.177 -(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only 18.178 -(p9) and word2[0]=lastval,tmask // mask lastval as appropriate 18.179 - shr.u count=count,3 // how many 8-byte? 18.180 - ;; 18.181 - // If count is odd, finish this 8-byte word so that we can 18.182 - // load two back-to-back 8-byte words per loop thereafter. 18.183 - and word1[0]=firstval,hmask // and mask it as appropriate 18.184 - tbit.nz p10,p11=count,0 // if (count is odd) 18.185 - ;; 18.186 -(p8) mov result1[0]=word1[0] 18.187 -(p9) add result1[0]=word1[0],word2[0] 18.188 - ;; 18.189 - cmp.ltu p6,p0=result1[0],word1[0] // check the carry 18.190 - cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte 18.191 - ;; 18.192 -(p6) adds result1[0]=1,result1[0] 18.193 -(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word) 18.194 -(p11) br.cond.dptk .do_csum16 // if (count is even) 18.195 - 18.196 - // Here count is odd. 18.197 - ld8 word1[1]=[first1],8 // load an 8-byte word 18.198 - cmp.eq p9,p10=1,count // if (count == 1) 18.199 - adds count=-1,count // loaded an 8-byte word 18.200 - ;; 18.201 - add result1[0]=result1[0],word1[1] 18.202 - ;; 18.203 - cmp.ltu p6,p0=result1[0],word1[1] 18.204 - ;; 18.205 -(p6) adds result1[0]=1,result1[0] 18.206 -(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit 18.207 - // Fall through to calculate the checksum, feeding result1[0] as 18.208 - // the initial value in result1[0]. 18.209 - // 18.210 - // Calculate the checksum loading two 8-byte words per loop. 18.211 - // 18.212 -.do_csum16: 18.213 - add first2=8,first1 18.214 - shr.u count=count,1 // we do 16 bytes per loop 18.215 - ;; 18.216 - adds count=-1,count 18.217 - mov carry1=r0 18.218 - mov carry2=r0 18.219 - brp.loop.imp 1f,2f 18.220 - ;; 18.221 - mov ar.ec=PIPE_DEPTH 18.222 - mov ar.lc=count // set lc 18.223 - mov pr.rot=1<<16 18.224 - // result1[0] must be initialized in advance. 18.225 - mov result2[0]=r0 18.226 - ;; 18.227 - .align 32 18.228 -1: 18.229 -(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] 18.230 -(pC1[1])adds carry1=1,carry1 18.231 -(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] 18.232 -(pC2[1])adds carry2=1,carry2 18.233 -(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] 18.234 -(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY] 18.235 -2: 18.236 -(p[0]) ld8 word1[0]=[first1],16 18.237 -(p[0]) ld8 word2[0]=[first2],16 18.238 - br.ctop.sptk 1b 18.239 - ;; 18.240 - // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
18.241 -(pC1[1])adds carry1=1,carry1 // since we miss the last one 18.242 -(pC2[1])adds carry2=1,carry2 18.243 - ;; 18.244 - add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 18.245 - add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2 18.246 - ;; 18.247 - cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 18.248 - cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2 18.249 - ;; 18.250 -(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1] 18.251 -(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1] 18.252 - ;; 18.253 - add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] 18.254 - ;; 18.255 - cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] 18.256 - ;; 18.257 -(p6) adds result1[0]=1,result1[0] 18.258 - ;; 18.259 -.do_csum_exit: 18.260 - // 18.261 - // now fold 64 into 16 bits taking care of carry 18.262 - // that's not very good because it has lots of sequentiality 18.263 - // 18.264 - mov tmp3=0xffff 18.265 - zxt4 tmp1=result1[0] 18.266 - shr.u tmp2=result1[0],32 18.267 - ;; 18.268 - add result1[0]=tmp1,tmp2 18.269 - ;; 18.270 - and tmp1=result1[0],tmp3 18.271 - shr.u tmp2=result1[0],16 18.272 - ;; 18.273 - add result1[0]=tmp1,tmp2 18.274 - ;; 18.275 - and tmp1=result1[0],tmp3 18.276 - shr.u tmp2=result1[0],16 18.277 - ;; 18.278 - add result1[0]=tmp1,tmp2 18.279 - ;; 18.280 - and tmp1=result1[0],tmp3 18.281 - shr.u tmp2=result1[0],16 18.282 - ;; 18.283 - add ret0=tmp1,tmp2 18.284 - mov pr=saved_pr,0xffffffffffff0000 18.285 - ;; 18.286 - // if buf was odd then swap bytes 18.287 - mov ar.pfs=saved_pfs // restore ar.ec 18.288 -(p15) mux1 ret0=ret0,@rev // reverse word 18.289 - ;; 18.290 - mov ar.lc=saved_lc 18.291 -(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes 18.292 - br.ret.sptk.many rp 18.293 - 18.294 -// I (Jun Nakajima) wrote an equivalent code (see below), but it was 18.295 -// not much better than the original. So keep the original there so that 18.296 -// someone else can challenge. 18.297 -// 18.298 -// shr.u word1[0]=result1[0],32 18.299 -// zxt4 result1[0]=result1[0] 18.300 -// ;; 18.301 -// add result1[0]=result1[0],word1[0] 18.302 -// ;; 18.303 -// zxt2 result2[0]=result1[0] 18.304 -// extr.u word1[0]=result1[0],16,16 18.305 -// shr.u carry1=result1[0],32 18.306 -// ;; 18.307 -// add result2[0]=result2[0],word1[0] 18.308 -// ;; 18.309 -// add result2[0]=result2[0],carry1 18.310 -// ;; 18.311 -// extr.u ret0=result2[0],16,16 18.312 -// ;; 18.313 -// add ret0=ret0,result2[0] 18.314 -// ;; 18.315 -// zxt2 ret0=ret0 18.316 -// mov ar.pfs=saved_pfs // restore ar.ec 18.317 -// mov pr=saved_pr,0xffffffffffff0000 18.318 -// ;; 18.319 -// // if buf was odd then swap bytes 18.320 -// mov ar.lc=saved_lc 18.321 -//(p15) mux1 ret0=ret0,@rev // reverse word 18.322 -// ;; 18.323 -//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes 18.324 -// br.ret.sptk.many rp 18.325 - 18.326 -END(do_csum)
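Before its main loop, do_csum masks off the out-of-range bytes of the first and last aligned words (the hmask/tmask setup above). That setup reduces to two shifts in C; a little-endian sketch:

    #include <stdint.h>

    static void csum_masks(uintptr_t buf, long len,
                           uint64_t *hmask, uint64_t *tmask)
    {
            unsigned firstoff = buf & 7;            /* bytes to drop at the head */
            unsigned lastoff  = (buf + len) & 7;    /* valid bytes in the tail word */

            *hmask = ~(uint64_t)0 << (8 * firstoff);
            *tmask = ~(uint64_t)0 >> (8 * ((8 - lastoff) & 7));
    }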
19.1 --- a/xen/arch/ia64/linux/lib/flush.S Tue Aug 30 17:51:51 2005 -0600 19.2 +++ b/xen/arch/ia64/linux/lib/flush.S Wed Aug 31 14:32:27 2005 -0600 19.3 @@ -1,39 +1,61 @@ 19.4 /* 19.5 * Cache flushing routines. 19.6 * 19.7 - * Copyright (C) 1999-2001 Hewlett-Packard Co 19.8 - * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com> 19.9 + * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co 19.10 + * David Mosberger-Tang <davidm@hpl.hp.com> 19.11 + * 19.12 + * 05/28/05 Zoltan Menyhart Dynamic stride size 19.13 */ 19.14 + 19.15 #include <asm/asmmacro.h> 19.16 -#include <asm/page.h> 19.17 + 19.18 19.19 /* 19.20 * flush_icache_range(start,end) 19.21 - * Must flush range from start to end-1 but nothing else (need to 19.22 + * 19.23 + * Make i-cache(s) coherent with d-caches. 19.24 + * 19.25 + * Must deal with range from start to end-1 but nothing else (need to 19.26 * be careful not to touch addresses that may be unmapped). 19.27 + * 19.28 + * Note: "in0" and "in1" are preserved for debugging purposes. 19.29 */ 19.30 GLOBAL_ENTRY(flush_icache_range) 19.31 + 19.32 .prologue 19.33 - alloc r2=ar.pfs,2,0,0,0 19.34 - sub r8=in1,in0,1 19.35 + alloc r2=ar.pfs,2,0,0,0 19.36 + movl r3=ia64_i_cache_stride_shift 19.37 + mov r21=1 19.38 + ;; 19.39 + ld8 r20=[r3] // r20: stride shift 19.40 + sub r22=in1,r0,1 // last byte address 19.41 ;; 19.42 - shr.u r8=r8,5 // we flush 32 bytes per iteration 19.43 - .save ar.lc, r3 19.44 - mov r3=ar.lc // save ar.lc 19.45 + shr.u r23=in0,r20 // start / (stride size) 19.46 + shr.u r22=r22,r20 // (last byte address) / (stride size) 19.47 + shl r21=r21,r20 // r21: stride size of the i-cache(s) 19.48 + ;; 19.49 + sub r8=r22,r23 // number of strides - 1 19.50 + shl r24=r23,r20 // r24: addresses for "fc.i" = 19.51 + // "start" rounded down to stride boundary 19.52 + .save ar.lc,r3 19.53 + mov r3=ar.lc // save ar.lc 19.54 ;; 19.55 19.56 .body 19.57 - 19.58 - mov ar.lc=r8 19.59 + mov ar.lc=r8 19.60 ;; 19.61 -.Loop: fc in0 // issuable on M0 only 19.62 - add in0=32,in0 19.63 + /* 19.64 + * 32 byte aligned loop, even number of (actually 2) bundles 19.65 + */ 19.66 +.Loop: fc.i r24 // issuable on M0 only 19.67 + add r24=r21,r24 // we flush "stride size" bytes per iteration 19.68 + nop.i 0 19.69 br.cloop.sptk.few .Loop 19.70 ;; 19.71 sync.i 19.72 ;; 19.73 srlz.i 19.74 ;; 19.75 - mov ar.lc=r3 // restore ar.lc 19.76 + mov ar.lc=r3 // restore ar.lc 19.77 br.ret.sptk.many rp 19.78 END(flush_icache_range)
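In effect, the rewritten flush loop computes the following (a C sketch: ia64_i_cache_stride_shift is the variable the new code loads, while ia64_fc_i() is a hypothetical stand-in for the fc.i instruction):

    extern unsigned long ia64_i_cache_stride_shift;
    extern void ia64_fc_i(unsigned long addr);      /* models one fc.i */

    void model_flush_icache_range(unsigned long start, unsigned long end)
    {
            unsigned long stride = 1UL << ia64_i_cache_stride_shift;
            unsigned long addr = start & ~(stride - 1); /* round down to a stride */

            for (; addr < end; addr += stride)
                    ia64_fc_i(addr);        /* one flush per i-cache stride */
            /* the real routine then issues sync.i and srlz.i */
    }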
20.1 --- a/xen/arch/ia64/linux/lib/io.c Tue Aug 30 17:51:51 2005 -0600 20.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 20.3 @@ -1,165 +0,0 @@ 20.4 -#include <linux/config.h> 20.5 -#include <linux/module.h> 20.6 -#include <linux/types.h> 20.7 - 20.8 -#include <asm/io.h> 20.9 - 20.10 -/* 20.11 - * Copy data from IO memory space to "real" memory space. 20.12 - * This needs to be optimized. 20.13 - */ 20.14 -void memcpy_fromio(void *to, const volatile void __iomem *from, long count) 20.15 -{ 20.16 - char *dst = to; 20.17 - 20.18 - while (count) { 20.19 - count--; 20.20 - *dst++ = readb(from++); 20.21 - } 20.22 -} 20.23 -EXPORT_SYMBOL(memcpy_fromio); 20.24 - 20.25 -/* 20.26 - * Copy data from "real" memory space to IO memory space. 20.27 - * This needs to be optimized. 20.28 - */ 20.29 -void memcpy_toio(volatile void __iomem *to, const void *from, long count) 20.30 -{ 20.31 - const char *src = from; 20.32 - 20.33 - while (count) { 20.34 - count--; 20.35 - writeb(*src++, to++); 20.36 - } 20.37 -} 20.38 -EXPORT_SYMBOL(memcpy_toio); 20.39 - 20.40 -/* 20.41 - * "memset" on IO memory space. 20.42 - * This needs to be optimized. 20.43 - */ 20.44 -void memset_io(volatile void __iomem *dst, int c, long count) 20.45 -{ 20.46 - unsigned char ch = (char)(c & 0xff); 20.47 - 20.48 - while (count) { 20.49 - count--; 20.50 - writeb(ch, dst); 20.51 - dst++; 20.52 - } 20.53 -} 20.54 -EXPORT_SYMBOL(memset_io); 20.55 - 20.56 -#ifdef CONFIG_IA64_GENERIC 20.57 - 20.58 -#undef __ia64_inb 20.59 -#undef __ia64_inw 20.60 -#undef __ia64_inl 20.61 -#undef __ia64_outb 20.62 -#undef __ia64_outw 20.63 -#undef __ia64_outl 20.64 -#undef __ia64_readb 20.65 -#undef __ia64_readw 20.66 -#undef __ia64_readl 20.67 -#undef __ia64_readq 20.68 -#undef __ia64_readb_relaxed 20.69 -#undef __ia64_readw_relaxed 20.70 -#undef __ia64_readl_relaxed 20.71 -#undef __ia64_readq_relaxed 20.72 -#undef __ia64_writeb 20.73 -#undef __ia64_writew 20.74 -#undef __ia64_writel 20.75 -#undef __ia64_writeq 20.76 -#undef __ia64_mmiowb 20.77 - 20.78 -unsigned int 20.79 -__ia64_inb (unsigned long port) 20.80 -{ 20.81 - return ___ia64_inb(port); 20.82 -} 20.83 - 20.84 -unsigned int 20.85 -__ia64_inw (unsigned long port) 20.86 -{ 20.87 - return ___ia64_inw(port); 20.88 -} 20.89 - 20.90 -unsigned int 20.91 -__ia64_inl (unsigned long port) 20.92 -{ 20.93 - return ___ia64_inl(port); 20.94 -} 20.95 - 20.96 -void 20.97 -__ia64_outb (unsigned char val, unsigned long port) 20.98 -{ 20.99 - ___ia64_outb(val, port); 20.100 -} 20.101 - 20.102 -void 20.103 -__ia64_outw (unsigned short val, unsigned long port) 20.104 -{ 20.105 - ___ia64_outw(val, port); 20.106 -} 20.107 - 20.108 -void 20.109 -__ia64_outl (unsigned int val, unsigned long port) 20.110 -{ 20.111 - ___ia64_outl(val, port); 20.112 -} 20.113 - 20.114 -unsigned char 20.115 -__ia64_readb (void __iomem *addr) 20.116 -{ 20.117 - return ___ia64_readb (addr); 20.118 -} 20.119 - 20.120 -unsigned short 20.121 -__ia64_readw (void __iomem *addr) 20.122 -{ 20.123 - return ___ia64_readw (addr); 20.124 -} 20.125 - 20.126 -unsigned int 20.127 -__ia64_readl (void __iomem *addr) 20.128 -{ 20.129 - return ___ia64_readl (addr); 20.130 -} 20.131 - 20.132 -unsigned long 20.133 -__ia64_readq (void __iomem *addr) 20.134 -{ 20.135 - return ___ia64_readq (addr); 20.136 -} 20.137 - 20.138 -unsigned char 20.139 -__ia64_readb_relaxed (void __iomem *addr) 20.140 -{ 20.141 - return ___ia64_readb (addr); 20.142 -} 20.143 - 20.144 -unsigned short 20.145 -__ia64_readw_relaxed (void __iomem *addr) 20.146 -{ 20.147 - return ___ia64_readw 
(addr); 20.148 -} 20.149 - 20.150 -unsigned int 20.151 -__ia64_readl_relaxed (void __iomem *addr) 20.152 -{ 20.153 - return ___ia64_readl (addr); 20.154 -} 20.155 - 20.156 -unsigned long 20.157 -__ia64_readq_relaxed (void __iomem *addr) 20.158 -{ 20.159 - return ___ia64_readq (addr); 20.160 -} 20.161 - 20.162 -void 20.163 -__ia64_mmiowb(void) 20.164 -{ 20.165 - ___ia64_mmiowb(); 20.166 -} 20.167 - 20.168 -#endif /* CONFIG_IA64_GENERIC */
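The "needs to be optimized" remarks above refer to the byte-at-a-time loops. A kernel-style sketch of the obvious widening, assuming (and this is a real constraint for MMIO) that the device tolerates 8-byte reads and that both pointers are 8-byte aligned:

    void memcpy_fromio_wide(void *to, const volatile void __iomem *from, long count)
    {
            u64 *dst = to;

            while (count >= 8) {
                    *dst++ = readq(from);           /* one 8-byte MMIO read */
                    from += 8;
                    count -= 8;
            }
            memcpy_fromio(dst, from, count);        /* byte loop for the tail */
    }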
21.1 --- a/xen/arch/ia64/linux/lib/ip_fast_csum.S Tue Aug 30 17:51:51 2005 -0600 21.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 21.3 @@ -1,90 +0,0 @@ 21.4 -/* 21.5 - * Optimized version of the ip_fast_csum() function 21.6 - * Used for calculating IP header checksum 21.7 - * 21.8 - * Return: 16bit checksum, complemented 21.9 - * 21.10 - * Inputs: 21.11 - * in0: address of buffer to checksum (char *) 21.12 - * in1: length of the buffer (int) 21.13 - * 21.14 - * Copyright (C) 2002 Intel Corp. 21.15 - * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> 21.16 - */ 21.17 - 21.18 -#include <asm/asmmacro.h> 21.19 - 21.20 -/* 21.21 - * Since we know that most likely this function is called with buf aligned 21.22 - * on 4-byte boundary and 20 bytes in length, we can execute rather quickly 21.23 - * versus calling the generic version of do_csum, which has lots of overhead in 21.24 - * handling various alignments and sizes. However, due to the lack of constraints 21.25 - * put on the function input arguments, cases with alignment not on 4-byte or 21.26 - * size not equal to 20 bytes will be handled by the generic do_csum function. 21.27 - */ 21.28 - 21.29 -#define in0 r32 21.30 -#define in1 r33 21.31 -#define ret0 r8 21.32 - 21.33 -GLOBAL_ENTRY(ip_fast_csum) 21.34 - .prologue 21.35 - .body 21.36 - cmp.ne p6,p7=5,in1 // size other than 20 byte? 21.37 - and r14=3,in0 // is it aligned on 4-byte? 21.38 - add r15=4,in0 // second source pointer 21.39 - ;; 21.40 - cmp.ne.or.andcm p6,p7=r14,r0 21.41 - ;; 21.42 -(p7) ld4 r20=[in0],8 21.43 -(p7) ld4 r21=[r15],8 21.44 -(p6) br.spnt .generic 21.45 - ;; 21.46 - ld4 r22=[in0],8 21.47 - ld4 r23=[r15],8 21.48 - ;; 21.49 - ld4 r24=[in0] 21.50 - add r20=r20,r21 21.51 - add r22=r22,r23 21.52 - ;; 21.53 - add r20=r20,r22 21.54 - ;; 21.55 - add r20=r20,r24 21.56 - ;; 21.57 - shr.u ret0=r20,16 // now need to add the carry 21.58 - zxt2 r20=r20 21.59 - ;; 21.60 - add r20=ret0,r20 21.61 - ;; 21.62 - shr.u ret0=r20,16 // add carry again 21.63 - zxt2 r20=r20 21.64 - ;; 21.65 - add r20=ret0,r20 21.66 - ;; 21.67 - shr.u ret0=r20,16 21.68 - zxt2 r20=r20 21.69 - ;; 21.70 - add r20=ret0,r20 21.71 - ;; 21.72 - andcm ret0=-1,r20 21.73 - .restore sp // reset frame state 21.74 - br.ret.sptk.many b0 21.75 - ;; 21.76 - 21.77 -.generic: 21.78 - .prologue 21.79 - .save ar.pfs, r35 21.80 - alloc r35=ar.pfs,2,2,2,0 21.81 - .save rp, r34 21.82 - mov r34=b0 21.83 - .body 21.84 - dep.z out1=in1,2,30 21.85 - mov out0=in0 21.86 - ;; 21.87 - br.call.sptk.many b0=do_csum 21.88 - ;; 21.89 - andcm ret0=-1,ret0 21.90 - mov ar.pfs=r35 21.91 - mov b0=r34 21.92 - br.ret.sptk.many b0 21.93 -END(ip_fast_csum)
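The fast path above simply sums the five aligned 32-bit words of a 20-byte IP header and folds the carries. The same computation in C (assuming ihl == 5 and 4-byte alignment, which is exactly what the assembly's guard checks):

    #include <stdint.h>

    static uint16_t ip_fast_csum_model(const void *iph)
    {
            const uint32_t *w = iph;
            uint64_t sum = (uint64_t)w[0] + w[1] + w[2] + w[3] + w[4];

            sum = (sum & 0xffff) + (sum >> 16);     /* fold, as the shr.u/zxt2 pairs do */
            sum = (sum & 0xffff) + (sum >> 16);
            sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;                  /* complement, like andcm -1 */
    }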
22.1 --- a/xen/arch/ia64/linux/lib/memcpy.S Tue Aug 30 17:51:51 2005 -0600 22.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 22.3 @@ -1,301 +0,0 @@ 22.4 -/* 22.5 - * 22.6 - * Optimized version of the standard memcpy() function 22.7 - * 22.8 - * Inputs: 22.9 - * in0: destination address 22.10 - * in1: source address 22.11 - * in2: number of bytes to copy 22.12 - * Output: 22.13 - * no return value 22.14 - * 22.15 - * Copyright (C) 2000-2001 Hewlett-Packard Co 22.16 - * Stephane Eranian <eranian@hpl.hp.com> 22.17 - * David Mosberger-Tang <davidm@hpl.hp.com> 22.18 - */ 22.19 -#include <asm/asmmacro.h> 22.20 - 22.21 -GLOBAL_ENTRY(memcpy) 22.22 - 22.23 -# define MEM_LAT 21 /* latency to memory */ 22.24 - 22.25 -# define dst r2 22.26 -# define src r3 22.27 -# define retval r8 22.28 -# define saved_pfs r9 22.29 -# define saved_lc r10 22.30 -# define saved_pr r11 22.31 -# define cnt r16 22.32 -# define src2 r17 22.33 -# define t0 r18 22.34 -# define t1 r19 22.35 -# define t2 r20 22.36 -# define t3 r21 22.37 -# define t4 r22 22.38 -# define src_end r23 22.39 - 22.40 -# define N (MEM_LAT + 4) 22.41 -# define Nrot ((N + 7) & ~7) 22.42 - 22.43 - /* 22.44 - * First, check if everything (src, dst, len) is a multiple of eight. If 22.45 - * so, we handle everything with no taken branches (other than the loop 22.46 - * itself) and a small icache footprint. Otherwise, we jump off to 22.47 - * the more general copy routine handling arbitrary 22.48 - * sizes/alignment etc. 22.49 - */ 22.50 - .prologue 22.51 - .save ar.pfs, saved_pfs 22.52 - alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot 22.53 - .save ar.lc, saved_lc 22.54 - mov saved_lc=ar.lc 22.55 - or t0=in0,in1 22.56 - ;; 22.57 - 22.58 - or t0=t0,in2 22.59 - .save pr, saved_pr 22.60 - mov saved_pr=pr 22.61 - 22.62 - .body 22.63 - 22.64 - cmp.eq p6,p0=in2,r0 // zero length? 22.65 - mov retval=in0 // return dst 22.66 -(p6) br.ret.spnt.many rp // zero length, return immediately 22.67 - ;; 22.68 - 22.69 - mov dst=in0 // copy because of rotation 22.70 - shr.u cnt=in2,3 // number of 8-byte words to copy 22.71 - mov pr.rot=1<<16 22.72 - ;; 22.73 - 22.74 - adds cnt=-1,cnt // br.ctop is repeat/until 22.75 - cmp.gtu p7,p0=16,in2 // copying less than 16 bytes? 22.76 - mov ar.ec=N 22.77 - ;; 22.78 - 22.79 - and t0=0x7,t0 22.80 - mov ar.lc=cnt 22.81 - ;; 22.82 - cmp.ne p6,p0=t0,r0 22.83 - 22.84 - mov src=in1 // copy because of rotation 22.85 -(p7) br.cond.spnt.few .memcpy_short 22.86 -(p6) br.cond.spnt.few .memcpy_long 22.87 - ;; 22.88 - nop.m 0 22.89 - ;; 22.90 - nop.m 0 22.91 - nop.i 0 22.92 - ;; 22.93 - nop.m 0 22.94 - ;; 22.95 - .rotr val[N] 22.96 - .rotp p[N] 22.97 - .align 32 22.98 -1: { .mib 22.99 -(p[0]) ld8 val[0]=[src],8 22.100 - nop.i 0 22.101 - brp.loop.imp 1b, 2f 22.102 -} 22.103 -2: { .mfb 22.104 -(p[N-1])st8 [dst]=val[N-1],8 22.105 - nop.f 0 22.106 - br.ctop.dptk.few 1b 22.107 -} 22.108 - ;; 22.109 - mov ar.lc=saved_lc 22.110 - mov pr=saved_pr,-1 22.111 - mov ar.pfs=saved_pfs 22.112 - br.ret.sptk.many rp 22.113 - 22.114 - /* 22.115 - * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time 22.116 - * copy loop. This performs relatively poorly on Itanium, but it doesn't 22.117 - * get used very often (gcc inlines small copies) and due to atomicity 22.118 - * issues, we want to avoid read-modify-write of entire words. 
22.119 - */ 22.120 - .align 32 22.121 -.memcpy_short: 22.122 - adds cnt=-1,in2 // br.ctop is repeat/until 22.123 - mov ar.ec=MEM_LAT 22.124 - brp.loop.imp 1f, 2f 22.125 - ;; 22.126 - mov ar.lc=cnt 22.127 - ;; 22.128 - nop.m 0 22.129 - ;; 22.130 - nop.m 0 22.131 - nop.i 0 22.132 - ;; 22.133 - nop.m 0 22.134 - ;; 22.135 - nop.m 0 22.136 - ;; 22.137 - /* 22.138 - * It is faster to put a stop bit in the loop here because it makes 22.139 - * the pipeline shorter (and latency is what matters on short copies). 22.140 - */ 22.141 - .align 32 22.142 -1: { .mib 22.143 -(p[0]) ld1 val[0]=[src],1 22.144 - nop.i 0 22.145 - brp.loop.imp 1b, 2f 22.146 -} ;; 22.147 -2: { .mfb 22.148 -(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1 22.149 - nop.f 0 22.150 - br.ctop.dptk.few 1b 22.151 -} ;; 22.152 - mov ar.lc=saved_lc 22.153 - mov pr=saved_pr,-1 22.154 - mov ar.pfs=saved_pfs 22.155 - br.ret.sptk.many rp 22.156 - 22.157 - /* 22.158 - * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't 22.159 - * an overriding concern here, but throughput is. We first do 22.160 - * sub-word copying until the destination is aligned, then we check 22.161 - * if the source is also aligned. If so, we do a simple load/store-loop 22.162 - * until there are less than 8 bytes left over and then we do the tail, 22.163 - * by storing the last few bytes using sub-word copying. If the source 22.164 - * is not aligned, we branch off to the non-congruent loop. 22.165 - * 22.166 - * stage: op: 22.167 - * 0 ld 22.168 - * : 22.169 - * MEM_LAT+3 shrp 22.170 - * MEM_LAT+4 st 22.171 - * 22.172 - * On Itanium, the pipeline itself runs without stalls. However, br.ctop 22.173 - * seems to introduce an unavoidable bubble in the pipeline so the overall 22.174 - * latency is 2 cycles/iteration. This gives us a _copy_ throughput 22.175 - * of 4 byte/cycle. Still not bad. 
22.176 - */ 22.177 -# undef N 22.178 -# undef Nrot 22.179 -# define N (MEM_LAT + 5) /* number of stages */ 22.180 -# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */ 22.181 - 22.182 -#define LOG_LOOP_SIZE 6 22.183 - 22.184 -.memcpy_long: 22.185 - alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame 22.186 - and t0=-8,src // t0 = src & ~7 22.187 - and t2=7,src // t2 = src & 7 22.188 - ;; 22.189 - ld8 t0=[t0] // t0 = 1st source word 22.190 - adds src2=7,src // src2 = (src + 7) 22.191 - sub t4=r0,dst // t4 = -dst 22.192 - ;; 22.193 - and src2=-8,src2 // src2 = (src + 7) & ~7 22.194 - shl t2=t2,3 // t2 = 8*(src & 7) 22.195 - shl t4=t4,3 // t4 = 8*(dst & 7) 22.196 - ;; 22.197 - ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise 22.198 - sub t3=64,t2 // t3 = 64-8*(src & 7) 22.199 - shr.u t0=t0,t2 22.200 - ;; 22.201 - add src_end=src,in2 22.202 - shl t1=t1,t3 22.203 - mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7) 22.204 - ;; 22.205 - or t0=t0,t1 22.206 - mov cnt=r0 22.207 - adds src_end=-1,src_end 22.208 - ;; 22.209 -(p3) st1 [dst]=t0,1 22.210 -(p3) shr.u t0=t0,8 22.211 -(p3) adds cnt=1,cnt 22.212 - ;; 22.213 -(p4) st2 [dst]=t0,2 22.214 -(p4) shr.u t0=t0,16 22.215 -(p4) adds cnt=2,cnt 22.216 - ;; 22.217 -(p5) st4 [dst]=t0,4 22.218 -(p5) adds cnt=4,cnt 22.219 - and src_end=-8,src_end // src_end = last word of source buffer 22.220 - ;; 22.221 - 22.222 - // At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy: 22.223 - 22.224 -1:{ add src=cnt,src // make src point to remainder of source buffer 22.225 - sub cnt=in2,cnt // cnt = number of bytes left to copy 22.226 - mov t4=ip 22.227 - } ;; 22.228 - and src2=-8,src // align source pointer 22.229 - adds t4=.memcpy_loops-1b,t4 22.230 - mov ar.ec=N 22.231 - 22.232 - and t0=7,src // t0 = src & 7 22.233 - shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy 22.234 - shl cnt=cnt,3 // move bits 0-2 to 3-5 22.235 - ;; 22.236 - 22.237 - .rotr val[N+1], w[2] 22.238 - .rotp p[N] 22.239 - 22.240 - cmp.ne p6,p0=t0,r0 // is src aligned, too? 22.241 - shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7) 22.242 - adds t2=-1,t2 // br.ctop is repeat/until 22.243 - ;; 22.244 - add t4=t0,t4 22.245 - mov pr=cnt,0x38 // set (p5,p4,p3) to # of bytes last-word bytes to copy 22.246 - mov ar.lc=t2 22.247 - ;; 22.248 - nop.m 0 22.249 - ;; 22.250 - nop.m 0 22.251 - nop.i 0 22.252 - ;; 22.253 - nop.m 0 22.254 - ;; 22.255 -(p6) ld8 val[1]=[src2],8 // prime the pump... 
22.256 - mov b6=t4 22.257 - br.sptk.few b6 22.258 - ;; 22.259 - 22.260 -.memcpy_tail: 22.261 - // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is 22.262 - // less than 8) and t0 contains the last few bytes of the src buffer: 22.263 -(p5) st4 [dst]=t0,4 22.264 -(p5) shr.u t0=t0,32 22.265 - mov ar.lc=saved_lc 22.266 - ;; 22.267 -(p4) st2 [dst]=t0,2 22.268 -(p4) shr.u t0=t0,16 22.269 - mov ar.pfs=saved_pfs 22.270 - ;; 22.271 -(p3) st1 [dst]=t0 22.272 - mov pr=saved_pr,-1 22.273 - br.ret.sptk.many rp 22.274 - 22.275 -/////////////////////////////////////////////////////// 22.276 - .align 64 22.277 - 22.278 -#define COPY(shift,index) \ 22.279 - 1: { .mib \ 22.280 - (p[0]) ld8 val[0]=[src2],8; \ 22.281 - (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \ 22.282 - brp.loop.imp 1b, 2f \ 22.283 - }; \ 22.284 - 2: { .mfb \ 22.285 - (p[MEM_LAT+4]) st8 [dst]=w[1],8; \ 22.286 - nop.f 0; \ 22.287 - br.ctop.dptk.few 1b; \ 22.288 - }; \ 22.289 - ;; \ 22.290 - ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \ 22.291 - ;; \ 22.292 - shrp t0=val[N-1],val[N-index],shift; \ 22.293 - br .memcpy_tail 22.294 -.memcpy_loops: 22.295 - COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */ 22.296 - COPY(8, 0) 22.297 - COPY(16, 0) 22.298 - COPY(24, 0) 22.299 - COPY(32, 0) 22.300 - COPY(40, 0) 22.301 - COPY(48, 0) 22.302 - COPY(56, 0) 22.303 - 22.304 -END(memcpy)
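The shrp-based loop above splices each output word from two adjacent aligned source words when src and dst are misaligned relative to each other. The idea in C (little-endian sketch; assumes 0 < shift_bits < 64, since the zero-shift case is dispatched separately, as in the assembly):

    #include <stdint.h>

    void misaligned_word_copy(uint64_t *dst, const uint64_t *src_aligned,
                              unsigned shift_bits, long nwords)
    {
            uint64_t lo = *src_aligned++;   /* word holding the first source bytes */

            while (nwords--) {
                    uint64_t hi = *src_aligned++;
                    /* shrp: low bits come from lo's top, high bits from hi's bottom */
                    *dst++ = (lo >> shift_bits) | (hi << (64 - shift_bits));
                    lo = hi;                /* slide the two-word window */
            }
    }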
23.1 --- a/xen/arch/ia64/linux/lib/memcpy_mck.S Tue Aug 30 17:51:51 2005 -0600 23.2 +++ b/xen/arch/ia64/linux/lib/memcpy_mck.S Wed Aug 31 14:32:27 2005 -0600 23.3 @@ -75,6 +75,7 @@ GLOBAL_ENTRY(memcpy) 23.4 mov f6=f0 23.5 br.cond.sptk .common_code 23.6 ;; 23.7 +END(memcpy) 23.8 GLOBAL_ENTRY(__copy_user) 23.9 .prologue 23.10 // check dest alignment 23.11 @@ -300,7 +301,7 @@ EK(.ex_handler, (p[D]) st8 [dst1] = t15, 23.12 add src_pre_mem=0,src0 // prefetch src pointer 23.13 add dst_pre_mem=0,dst0 // prefetch dest pointer 23.14 and src0=-8,src0 // 1st src pointer 23.15 -(p7) mov ar.lc = r21 23.16 +(p7) mov ar.lc = cnt 23.17 (p8) mov ar.lc = r0 23.18 ;; 23.19 TEXT_ALIGN(32) 23.20 @@ -524,7 +525,6 @@ EK(.ex_handler, (p17) st8 [dst1]=r39,8) 23.21 #undef B 23.22 #undef C 23.23 #undef D 23.24 -END(memcpy) 23.25 23.26 /* 23.27 * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
24.1 --- a/xen/arch/ia64/linux/lib/memset.S Tue Aug 30 17:51:51 2005 -0600 24.2 +++ b/xen/arch/ia64/linux/lib/memset.S Wed Aug 31 14:32:27 2005 -0600 24.3 @@ -57,10 +57,10 @@ GLOBAL_ENTRY(memset) 24.4 { .mmi 24.5 .prologue 24.6 alloc tmp = ar.pfs, 3, 0, 0, 0 24.7 - .body 24.8 lfetch.nt1 [dest] // 24.9 .save ar.lc, save_lc 24.10 mov.i save_lc = ar.lc 24.11 + .body 24.12 } { .mmi 24.13 mov ret0 = dest // return value 24.14 cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
25.1 --- a/xen/arch/ia64/linux/lib/strlen_user.S Tue Aug 30 17:51:51 2005 -0600 25.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 25.3 @@ -1,198 +0,0 @@ 25.4 -/* 25.5 - * Optimized version of the strlen_user() function 25.6 - * 25.7 - * Inputs: 25.8 - * in0 address of buffer 25.9 - * 25.10 - * Outputs: 25.11 - * ret0 0 in case of fault, strlen(buffer)+1 otherwise 25.12 - * 25.13 - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co 25.14 - * David Mosberger-Tang <davidm@hpl.hp.com> 25.15 - * Stephane Eranian <eranian@hpl.hp.com> 25.16 - * 25.17 - * 01/19/99 S.Eranian heavily enhanced version (see details below) 25.18 - * 09/24/99 S.Eranian added speculation recovery code 25.19 - */ 25.20 - 25.21 -#include <asm/asmmacro.h> 25.22 - 25.23 -// 25.24 -// int strlen_user(char *) 25.25 -// ------------------------ 25.26 -// Returns: 25.27 -// - length of string + 1 25.28 -// - 0 in case an exception is raised 25.29 -// 25.30 -// This is an enhanced version of the basic strlen_user. it includes a 25.31 -// combination of compute zero index (czx), parallel comparisons, speculative 25.32 -// loads and loop unroll using rotating registers. 25.33 -// 25.34 -// General Ideas about the algorithm: 25.35 -// The goal is to look at the string in chunks of 8 bytes. 25.36 -// so we need to do a few extra checks at the beginning because the 25.37 -// string may not be 8-byte aligned. In this case we load the 8byte 25.38 -// quantity which includes the start of the string and mask the unused 25.39 -// bytes with 0xff to avoid confusing czx. 25.40 -// We use speculative loads and software pipelining to hide memory 25.41 -// latency and do read ahead safely. This way we defer any exception. 25.42 -// 25.43 -// Because we don't want the kernel to be relying on particular 25.44 -// settings of the DCR register, we provide recovery code in case 25.45 -// speculation fails. The recovery code is going to "redo" the work using 25.46 -// only normal loads. If we still get a fault then we return an 25.47 -// error (ret0=0). Otherwise we return the strlen+1 as usual. 25.48 -// The fact that speculation may fail can be caused, for instance, by 25.49 -// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., 25.50 -// a NaT bit will be set if the translation is not present. The normal 25.51 -// load, on the other hand, will cause the translation to be inserted 25.52 -// if the mapping exists. 25.53 -// 25.54 -// It should be noted that we execute recovery code only when we need 25.55 -// to use the data that has been speculatively loaded: we don't execute 25.56 -// recovery code on pure read ahead data. 25.57 -// 25.58 -// Remarks: 25.59 -// - the cmp r0,r0 is used as a fast way to initialize a predicate 25.60 -// register to 1. This is required to make sure that we get the parallel 25.61 -// compare correct. 25.62 -// 25.63 -// - we don't use the epilogue counter to exit the loop but we need to set 25.64 -// it to zero beforehand. 25.65 -// 25.66 -// - after the loop we must test for Nat values because neither the 25.67 -// czx nor cmp instruction raise a NaT consumption fault. We must be 25.68 -// careful not to look too far for a Nat for which we don't care. 25.69 -// For instance we don't need to look at a NaT in val2 if the zero byte 25.70 -// was in val1. 25.71 -// 25.72 -// - Clearly performance tuning is required. 
25.73 -// 25.74 - 25.75 -#define saved_pfs r11 25.76 -#define tmp r10 25.77 -#define base r16 25.78 -#define orig r17 25.79 -#define saved_pr r18 25.80 -#define src r19 25.81 -#define mask r20 25.82 -#define val r21 25.83 -#define val1 r22 25.84 -#define val2 r23 25.85 - 25.86 -GLOBAL_ENTRY(__strlen_user) 25.87 - .prologue 25.88 - .save ar.pfs, saved_pfs 25.89 - alloc saved_pfs=ar.pfs,11,0,0,8 25.90 - 25.91 - .rotr v[2], w[2] // declares our 4 aliases 25.92 - 25.93 - extr.u tmp=in0,0,3 // tmp=least significant 3 bits 25.94 - mov orig=in0 // keep trackof initial byte address 25.95 - dep src=0,in0,0,3 // src=8byte-aligned in0 address 25.96 - .save pr, saved_pr 25.97 - mov saved_pr=pr // preserve predicates (rotation) 25.98 - ;; 25.99 - 25.100 - .body 25.101 - 25.102 - ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate) 25.103 - shl tmp=tmp,3 // multiply by 8bits/byte 25.104 - mov mask=-1 // our mask 25.105 - ;; 25.106 - ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline 25.107 - cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) 25.108 - sub tmp=64,tmp // how many bits to shift our mask on the right 25.109 - ;; 25.110 - shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part 25.111 - mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) 25.112 - ;; 25.113 - add base=-16,src // keep track of aligned base 25.114 - chk.s v[1], .recover // if already NaT, then directly skip to recover 25.115 - or v[1]=v[1],mask // now we have a safe initial byte pattern 25.116 - ;; 25.117 -1: 25.118 - ld8.s v[0]=[src],8 // speculatively load next 25.119 - czx1.r val1=v[1] // search 0 byte from right 25.120 - czx1.r val2=w[1] // search 0 byte from right following 8bytes 25.121 - ;; 25.122 - ld8.s w[0]=[src],8 // speculatively load next to next 25.123 - cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 25.124 - cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 25.125 -(p6) br.wtop.dptk.few 1b // loop until p6 == 0 25.126 - ;; 25.127 - // 25.128 - // We must return try the recovery code iff 25.129 - // val1_is_nat || (val1==8 && val2_is_nat) 25.130 - // 25.131 - // XXX Fixme 25.132 - // - there must be a better way of doing the test 25.133 - // 25.134 - cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) 25.135 - tnat.nz p6,p7=val1 // test NaT on val1 25.136 -(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT 25.137 - ;; 25.138 - // 25.139 - // if we come here p7 is true, i.e., initialized for // cmp 25.140 - // 25.141 - cmp.eq.and p7,p0=8,val1// val1==8? 25.142 - tnat.nz.and p7,p0=val2 // test NaT if val2 25.143 -(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT 25.144 - ;; 25.145 -(p8) mov val1=val2 // val2 contains the value 25.146 -(p8) adds src=-16,src // correct position when 3 ahead 25.147 -(p9) adds src=-24,src // correct position when 4 ahead 25.148 - ;; 25.149 - sub ret0=src,orig // distance from origin 25.150 - sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 25.151 - mov pr=saved_pr,0xffffffffffff0000 25.152 - ;; 25.153 - sub ret0=ret0,tmp // length=now - back -1 25.154 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what 25.155 - br.ret.sptk.many rp // end of normal execution 25.156 - 25.157 - // 25.158 - // Outlined recovery code when speculation failed 25.159 - // 25.160 - // This time we don't use speculation and rely on the normal exception 25.161 - // mechanism. 
that's why the loop is not as good as the previous one 25.162 - // because read ahead is not possible 25.163 - // 25.164 - // XXX Fixme 25.165 - // - today we restart from the beginning of the string instead 25.166 - // of trying to continue where we left off. 25.167 - // 25.168 -.recover: 25.169 - EX(.Lexit1, ld8 val=[base],8) // load the initial bytes 25.170 - ;; 25.171 - or val=val,mask // remask first bytes 25.172 - cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop 25.173 - ;; 25.174 - // 25.175 - // ar.ec is still zero here 25.176 - // 25.177 -2: 25.178 - EX(.Lexit1, (p6) ld8 val=[base],8) 25.179 - ;; 25.180 - czx1.r val1=val // search 0 byte from right 25.181 - ;; 25.182 - cmp.eq p6,p0=8,val1 // val1==8 ? 25.183 -(p6) br.wtop.dptk.few 2b // loop until p6 == 0 25.184 - ;; 25.185 - sub ret0=base,orig // distance from base 25.186 - sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 25.187 - mov pr=saved_pr,0xffffffffffff0000 25.188 - ;; 25.189 - sub ret0=ret0,tmp // length=now - back -1 25.190 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what 25.191 - br.ret.sptk.many rp // end of successful recovery code 25.192 - 25.193 - // 25.194 - // We failed even on the normal load (called from exception handler) 25.195 - // 25.196 -.Lexit1: 25.197 - mov ret0=0 25.198 - mov pr=saved_pr,0xffffffffffff0000 25.199 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what 25.200 - br.ret.sptk.many rp 25.201 -END(__strlen_user)
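The deleted __strlen_user() above scanned the string in 8-byte chunks using czx, parallel compares, and speculative loads, with outlined recovery code for failed speculation. As a rough reference only, here is a byte-wise C sketch of the contract it implemented; strlen_user_sketch and user_byte_readable are hypothetical names, and the fault check is abstracted into the helper rather than handled via speculation and exception tables as the assembly did.

#include <stddef.h>

/* Assumed helper: nonzero if the byte at p can be read without faulting. */
extern int user_byte_readable(const char *p);

size_t strlen_user_sketch(const char *buf)    /* hypothetical name */
{
        size_t n = 0;

        for (;;) {
                if (!user_byte_readable(buf + n))
                        return 0;             /* fault: ret0 = 0 */
                if (buf[n] == '\0')
                        return n + 1;         /* strlen(buffer) + 1 */
                n++;
        }
}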
26.1 --- a/xen/arch/ia64/linux/lib/strncpy_from_user.S Tue Aug 30 17:51:51 2005 -0600 26.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 26.3 @@ -1,44 +0,0 @@ 26.4 -/* 26.5 - * Just like strncpy() except that if a fault occurs during copying, 26.6 - * -EFAULT is returned. 26.7 - * 26.8 - * Inputs: 26.9 - * in0: address of destination buffer 26.10 - * in1: address of string to be copied 26.11 - * in2: length of buffer in bytes 26.12 - * Outputs: 26.13 - * r8: -EFAULT in case of fault or number of bytes copied if no fault 26.14 - * 26.15 - * Copyright (C) 1998-2001 Hewlett-Packard Co 26.16 - * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com> 26.17 - * 26.18 - * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by 26.19 - * by Andreas Schwab <schwab@suse.de>). 26.20 - */ 26.21 - 26.22 -#include <asm/asmmacro.h> 26.23 - 26.24 -GLOBAL_ENTRY(__strncpy_from_user) 26.25 - alloc r2=ar.pfs,3,0,0,0 26.26 - mov r8=0 26.27 - mov r9=in1 26.28 - ;; 26.29 - add r10=in1,in2 26.30 - cmp.eq p6,p0=r0,in2 26.31 -(p6) br.ret.spnt.many rp 26.32 - 26.33 - // XXX braindead copy loop---this needs to be optimized 26.34 -.Loop1: 26.35 - EX(.Lexit, ld1 r8=[in1],1) 26.36 - ;; 26.37 - EX(.Lexit, st1 [in0]=r8,1) 26.38 - cmp.ne p6,p7=r8,r0 26.39 - ;; 26.40 -(p6) cmp.ne.unc p8,p0=in1,r10 26.41 -(p8) br.cond.dpnt.few .Loop1 26.42 - ;; 26.43 -(p6) mov r8=in2 // buffer filled up---return buffer length 26.44 -(p7) sub r8=in1,r9,1 // return string length (excluding NUL character) 26.45 -[.Lexit:] 26.46 - br.ret.sptk.many rp 26.47 -END(__strncpy_from_user)
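Likewise, a minimal C model of the __strncpy_from_user() contract just removed: copy at most len bytes up to and including the NUL, returning the string length excluding the NUL, len if the buffer filled up, or -EFAULT on a fault. The function and helper names are illustrative assumptions, not tree code.

#include <stddef.h>

#define EFAULT 14                             /* Linux errno value, for illustration */

extern int user_byte_readable(const char *p); /* assumed helper, as above */

long strncpy_from_user_sketch(char *dst, const char *src, size_t len)
{
        size_t i;

        if (len == 0)
                return 0;                     /* the assembly returns r8 = 0 early */
        for (i = 0; i < len; i++) {
                if (!user_byte_readable(src + i))
                        return -EFAULT;
                dst[i] = src[i];
                if (dst[i] == '\0')
                        return (long)i;       /* length excluding the NUL */
        }
        return (long)len;                     /* buffer filled up */
}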
27.1 --- a/xen/arch/ia64/linux/lib/strnlen_user.S Tue Aug 30 17:51:51 2005 -0600 27.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 27.3 @@ -1,45 +0,0 @@ 27.4 -/* 27.5 - * Returns 0 if exception before NUL or reaching the supplied limit (N), 27.6 - * a value greater than N if the string is longer than the limit, else 27.7 - * strlen. 27.8 - * 27.9 - * Inputs: 27.10 - * in0: address of buffer 27.11 - * in1: string length limit N 27.12 - * Outputs: 27.13 - * r8: 0 in case of fault, strlen(buffer)+1 otherwise 27.14 - * 27.15 - * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com> 27.16 - */ 27.17 - 27.18 -#include <asm/asmmacro.h> 27.19 - 27.20 -GLOBAL_ENTRY(__strnlen_user) 27.21 - .prologue 27.22 - alloc r2=ar.pfs,2,0,0,0 27.23 - .save ar.lc, r16 27.24 - mov r16=ar.lc // preserve ar.lc 27.25 - 27.26 - .body 27.27 - 27.28 - add r3=-1,in1 27.29 - ;; 27.30 - mov ar.lc=r3 27.31 - mov r9=0 27.32 - ;; 27.33 - // XXX braindead strlen loop---this needs to be optimized 27.34 -.Loop1: 27.35 - EXCLR(.Lexit, ld1 r8=[in0],1) 27.36 - add r9=1,r9 27.37 - ;; 27.38 - cmp.eq p6,p0=r8,r0 27.39 -(p6) br.cond.dpnt .Lexit 27.40 - br.cloop.dptk.few .Loop1 27.41 - 27.42 - add r9=1,in1 // NUL not found---return N+1 27.43 - ;; 27.44 -.Lexit: 27.45 - mov r8=r9 27.46 - mov ar.lc=r16 // restore ar.lc 27.47 - br.ret.sptk.many rp 27.48 -END(__strnlen_user)
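And a sketch of the __strnlen_user() contract (0 on a fault, strlen + 1 if a NUL occurs within the limit, limit + 1 otherwise); again the names are hypothetical and fault handling is abstracted.

#include <stddef.h>

extern int user_byte_readable(const char *p); /* assumed helper, as above */

size_t strnlen_user_sketch(const char *buf, size_t limit)
{
        size_t i;

        for (i = 0; i < limit; i++) {
                if (!user_byte_readable(buf + i))
                        return 0;             /* fault */
                if (buf[i] == '\0')
                        return i + 1;         /* strlen + 1 */
        }
        return limit + 1;                     /* NUL not found: return N + 1 */
}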
28.1 --- a/xen/arch/ia64/linux/lib/xor.S Tue Aug 30 17:51:51 2005 -0600 28.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 28.3 @@ -1,184 +0,0 @@ 28.4 -/* 28.5 - * arch/ia64/lib/xor.S 28.6 - * 28.7 - * Optimized RAID-5 checksumming functions for IA-64. 28.8 - * 28.9 - * This program is free software; you can redistribute it and/or modify 28.10 - * it under the terms of the GNU General Public License as published by 28.11 - * the Free Software Foundation; either version 2, or (at your option) 28.12 - * any later version. 28.13 - * 28.14 - * You should have received a copy of the GNU General Public License 28.15 - * (for example /usr/src/linux/COPYING); if not, write to the Free 28.16 - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 28.17 - */ 28.18 - 28.19 -#include <asm/asmmacro.h> 28.20 - 28.21 -GLOBAL_ENTRY(xor_ia64_2) 28.22 - .prologue 28.23 - .fframe 0 28.24 - .save ar.pfs, r31 28.25 - alloc r31 = ar.pfs, 3, 0, 13, 16 28.26 - .save ar.lc, r30 28.27 - mov r30 = ar.lc 28.28 - .save pr, r29 28.29 - mov r29 = pr 28.30 - ;; 28.31 - .body 28.32 - mov r8 = in1 28.33 - mov ar.ec = 6 + 2 28.34 - shr in0 = in0, 3 28.35 - ;; 28.36 - adds in0 = -1, in0 28.37 - mov r16 = in1 28.38 - mov r17 = in2 28.39 - ;; 28.40 - mov ar.lc = in0 28.41 - mov pr.rot = 1 << 16 28.42 - ;; 28.43 - .rotr s1[6+1], s2[6+1], d[2] 28.44 - .rotp p[6+2] 28.45 -0: 28.46 -(p[0]) ld8.nta s1[0] = [r16], 8 28.47 -(p[0]) ld8.nta s2[0] = [r17], 8 28.48 -(p[6]) xor d[0] = s1[6], s2[6] 28.49 -(p[6+1])st8.nta [r8] = d[1], 8 28.50 - nop.f 0 28.51 - br.ctop.dptk.few 0b 28.52 - ;; 28.53 - mov ar.lc = r30 28.54 - mov pr = r29, -1 28.55 - br.ret.sptk.few rp 28.56 -END(xor_ia64_2) 28.57 - 28.58 -GLOBAL_ENTRY(xor_ia64_3) 28.59 - .prologue 28.60 - .fframe 0 28.61 - .save ar.pfs, r31 28.62 - alloc r31 = ar.pfs, 4, 0, 20, 24 28.63 - .save ar.lc, r30 28.64 - mov r30 = ar.lc 28.65 - .save pr, r29 28.66 - mov r29 = pr 28.67 - ;; 28.68 - .body 28.69 - mov r8 = in1 28.70 - mov ar.ec = 6 + 2 28.71 - shr in0 = in0, 3 28.72 - ;; 28.73 - adds in0 = -1, in0 28.74 - mov r16 = in1 28.75 - mov r17 = in2 28.76 - ;; 28.77 - mov r18 = in3 28.78 - mov ar.lc = in0 28.79 - mov pr.rot = 1 << 16 28.80 - ;; 28.81 - .rotr s1[6+1], s2[6+1], s3[6+1], d[2] 28.82 - .rotp p[6+2] 28.83 -0: 28.84 -(p[0]) ld8.nta s1[0] = [r16], 8 28.85 -(p[0]) ld8.nta s2[0] = [r17], 8 28.86 -(p[6]) xor d[0] = s1[6], s2[6] 28.87 - ;; 28.88 -(p[0]) ld8.nta s3[0] = [r18], 8 28.89 -(p[6+1])st8.nta [r8] = d[1], 8 28.90 -(p[6]) xor d[0] = d[0], s3[6] 28.91 - br.ctop.dptk.few 0b 28.92 - ;; 28.93 - mov ar.lc = r30 28.94 - mov pr = r29, -1 28.95 - br.ret.sptk.few rp 28.96 -END(xor_ia64_3) 28.97 - 28.98 -GLOBAL_ENTRY(xor_ia64_4) 28.99 - .prologue 28.100 - .fframe 0 28.101 - .save ar.pfs, r31 28.102 - alloc r31 = ar.pfs, 5, 0, 27, 32 28.103 - .save ar.lc, r30 28.104 - mov r30 = ar.lc 28.105 - .save pr, r29 28.106 - mov r29 = pr 28.107 - ;; 28.108 - .body 28.109 - mov r8 = in1 28.110 - mov ar.ec = 6 + 2 28.111 - shr in0 = in0, 3 28.112 - ;; 28.113 - adds in0 = -1, in0 28.114 - mov r16 = in1 28.115 - mov r17 = in2 28.116 - ;; 28.117 - mov r18 = in3 28.118 - mov ar.lc = in0 28.119 - mov pr.rot = 1 << 16 28.120 - mov r19 = in4 28.121 - ;; 28.122 - .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2] 28.123 - .rotp p[6+2] 28.124 -0: 28.125 -(p[0]) ld8.nta s1[0] = [r16], 8 28.126 -(p[0]) ld8.nta s2[0] = [r17], 8 28.127 -(p[6]) xor d[0] = s1[6], s2[6] 28.128 -(p[0]) ld8.nta s3[0] = [r18], 8 28.129 -(p[0]) ld8.nta s4[0] = [r19], 8 28.130 -(p[6]) xor r20 = s3[6], s4[6] 28.131 - ;; 28.132 
-(p[6+1])st8.nta [r8] = d[1], 8 28.133 -(p[6]) xor d[0] = d[0], r20 28.134 - br.ctop.dptk.few 0b 28.135 - ;; 28.136 - mov ar.lc = r30 28.137 - mov pr = r29, -1 28.138 - br.ret.sptk.few rp 28.139 -END(xor_ia64_4) 28.140 - 28.141 -GLOBAL_ENTRY(xor_ia64_5) 28.142 - .prologue 28.143 - .fframe 0 28.144 - .save ar.pfs, r31 28.145 - alloc r31 = ar.pfs, 6, 0, 34, 40 28.146 - .save ar.lc, r30 28.147 - mov r30 = ar.lc 28.148 - .save pr, r29 28.149 - mov r29 = pr 28.150 - ;; 28.151 - .body 28.152 - mov r8 = in1 28.153 - mov ar.ec = 6 + 2 28.154 - shr in0 = in0, 3 28.155 - ;; 28.156 - adds in0 = -1, in0 28.157 - mov r16 = in1 28.158 - mov r17 = in2 28.159 - ;; 28.160 - mov r18 = in3 28.161 - mov ar.lc = in0 28.162 - mov pr.rot = 1 << 16 28.163 - mov r19 = in4 28.164 - mov r20 = in5 28.165 - ;; 28.166 - .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2] 28.167 - .rotp p[6+2] 28.168 -0: 28.169 -(p[0]) ld8.nta s1[0] = [r16], 8 28.170 -(p[0]) ld8.nta s2[0] = [r17], 8 28.171 -(p[6]) xor d[0] = s1[6], s2[6] 28.172 -(p[0]) ld8.nta s3[0] = [r18], 8 28.173 -(p[0]) ld8.nta s4[0] = [r19], 8 28.174 -(p[6]) xor r21 = s3[6], s4[6] 28.175 - ;; 28.176 -(p[0]) ld8.nta s5[0] = [r20], 8 28.177 -(p[6+1])st8.nta [r8] = d[1], 8 28.178 -(p[6]) xor d[0] = d[0], r21 28.179 - ;; 28.180 -(p[6]) xor d[0] = d[0], s5[6] 28.181 - nop.f 0 28.182 - br.ctop.dptk.few 0b 28.183 - ;; 28.184 - mov ar.lc = r30 28.185 - mov pr = r29, -1 28.186 - br.ret.sptk.few rp 28.187 -END(xor_ia64_5)
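The deleted xor.S provided the RAID-5 XOR primitives for two to five source blocks, software-pipelined with rotating registers (ar.ec, br.ctop) and non-temporal loads. A plain C sketch of what the two- and three-block variants compute, under the assumption that bytes is a multiple of 8 as the shr-by-3 word count in the assembly implies; the sketch names are made up.

/* Hypothetical names; 'bytes' assumed to be a multiple of 8. */
void xor_sketch_2(unsigned long bytes, unsigned long *p1,
                  const unsigned long *p2)
{
        unsigned long i, words = bytes / 8;

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i];               /* xor d = s1, s2; store to dest */
}

void xor_sketch_3(unsigned long bytes, unsigned long *p1,
                  const unsigned long *p2, const unsigned long *p3)
{
        unsigned long i, words = bytes / 8;

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i] ^ p3[i];
}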
29.1 --- a/xen/arch/ia64/linux/minstate.h Tue Aug 30 17:51:51 2005 -0600 29.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 29.3 @@ -1,254 +0,0 @@ 29.4 -#include <linux/config.h> 29.5 - 29.6 -#include <asm/cache.h> 29.7 - 29.8 -#include "entry.h" 29.9 - 29.10 -/* 29.11 - * For ivt.s we want to access the stack virtually so we don't have to disable translation 29.12 - * on interrupts. 29.13 - * 29.14 - * On entry: 29.15 - * r1: pointer to current task (ar.k6) 29.16 - */ 29.17 -#define MINSTATE_START_SAVE_MIN_VIRT \ 29.18 -(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ 29.19 - ;; \ 29.20 -(pUStk) mov.m r24=ar.rnat; \ 29.21 -(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ 29.22 -(pKStk) mov r1=sp; /* get sp */ \ 29.23 - ;; \ 29.24 -(pUStk) lfetch.fault.excl.nt1 [r22]; \ 29.25 -(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ 29.26 -(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ 29.27 - ;; \ 29.28 -(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ 29.29 -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ 29.30 - ;; \ 29.31 -(pUStk) mov r18=ar.bsp; \ 29.32 -(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ 29.33 - 29.34 -#define MINSTATE_END_SAVE_MIN_VIRT \ 29.35 - bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ 29.36 - ;; 29.37 - 29.38 -/* 29.39 - * For mca_asm.S we want to access the stack physically since the state is saved before we 29.40 - * go virtual and don't want to destroy the iip or ipsr. 29.41 - */ 29.42 -#define MINSTATE_START_SAVE_MIN_PHYS \ 29.43 -(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ 29.44 -(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ 29.45 -(pKStk) ld8 r3 = [r3];; \ 29.46 -(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ 29.47 -(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ 29.48 -(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ 29.49 -(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ 29.50 - ;; \ 29.51 -(pUStk) mov r24=ar.rnat; \ 29.52 -(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ 29.53 -(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ 29.54 -(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ 29.55 - ;; \ 29.56 -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ 29.57 -(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ 29.58 - ;; \ 29.59 -(pUStk) mov r18=ar.bsp; \ 29.60 -(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ 29.61 - 29.62 -#define MINSTATE_END_SAVE_MIN_PHYS \ 29.63 - dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ 29.64 - ;; 29.65 - 29.66 -#ifdef MINSTATE_VIRT 29.67 -# define MINSTATE_GET_CURRENT(reg) \ 29.68 - movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\ 29.69 - ld8 reg=[reg] 29.70 -# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT 29.71 -# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT 29.72 -#endif 29.73 - 29.74 -#ifdef MINSTATE_PHYS 29.75 -# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg 29.76 -# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS 29.77 -# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS 29.78 -#endif 29.79 - 29.80 -/* 29.81 - * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves 29.82 - * the minimum state necessary that allows us to turn psr.ic 
back 29.83 - * on. 29.84 - * 29.85 - * Assumed state upon entry: 29.86 - * psr.ic: off 29.87 - * r31: contains saved predicates (pr) 29.88 - * 29.89 - * Upon exit, the state is as follows: 29.90 - * psr.ic: off 29.91 - * r2 = points to &pt_regs.r16 29.92 - * r8 = contents of ar.ccv 29.93 - * r9 = contents of ar.csd 29.94 - * r10 = contents of ar.ssd 29.95 - * r11 = FPSR_DEFAULT 29.96 - * r12 = kernel sp (kernel virtual address) 29.97 - * r13 = points to current task_struct (kernel virtual address) 29.98 - * p15 = TRUE if psr.i is set in cr.ipsr 29.99 - * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15: 29.100 - * preserved 29.101 - * 29.102 - * Note that psr.ic is NOT turned on by this macro. This is so that 29.103 - * we can pass interruption state as arguments to a handler. 29.104 - */ 29.105 -#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ 29.106 - MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ 29.107 - mov r27=ar.rsc; /* M */ \ 29.108 - mov r20=r1; /* A */ \ 29.109 - mov r25=ar.unat; /* M */ \ 29.110 - mov r29=cr.ipsr; /* M */ \ 29.111 - mov r26=ar.pfs; /* I */ \ 29.112 - mov r28=cr.iip; /* M */ \ 29.113 - mov r21=ar.fpsr; /* M */ \ 29.114 - COVER; /* B;; (or nothing) */ \ 29.115 - ;; \ 29.116 - adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ 29.117 - ;; \ 29.118 - ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \ 29.119 - st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \ 29.120 - adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \ 29.121 - /* switch from user to kernel RBS: */ \ 29.122 - ;; \ 29.123 - invala; /* M */ \ 29.124 - SAVE_IFS; \ 29.125 - cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \ 29.126 - ;; \ 29.127 - MINSTATE_START_SAVE_MIN \ 29.128 - adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ 29.129 - adds r16=PT(CR_IPSR),r1; \ 29.130 - ;; \ 29.131 - lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \ 29.132 - st8 [r16]=r29; /* save cr.ipsr */ \ 29.133 - ;; \ 29.134 - lfetch.fault.excl.nt1 [r17]; \ 29.135 - tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ 29.136 - mov r29=b0 \ 29.137 - ;; \ 29.138 - adds r16=PT(R8),r1; /* initialize first base pointer */ \ 29.139 - adds r17=PT(R9),r1; /* initialize second base pointer */ \ 29.140 -(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ 29.141 - ;; \ 29.142 -.mem.offset 0,0; st8.spill [r16]=r8,16; \ 29.143 -.mem.offset 8,0; st8.spill [r17]=r9,16; \ 29.144 - ;; \ 29.145 -.mem.offset 0,0; st8.spill [r16]=r10,24; \ 29.146 -.mem.offset 8,0; st8.spill [r17]=r11,24; \ 29.147 - ;; \ 29.148 - st8 [r16]=r28,16; /* save cr.iip */ \ 29.149 - st8 [r17]=r30,16; /* save cr.ifs */ \ 29.150 -(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \ 29.151 - mov r8=ar.ccv; \ 29.152 - mov r9=ar.csd; \ 29.153 - mov r10=ar.ssd; \ 29.154 - movl r11=FPSR_DEFAULT; /* L-unit */ \ 29.155 - ;; \ 29.156 - st8 [r16]=r25,16; /* save ar.unat */ \ 29.157 - st8 [r17]=r26,16; /* save ar.pfs */ \ 29.158 - shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \ 29.159 - ;; \ 29.160 - st8 [r16]=r27,16; /* save ar.rsc */ \ 29.161 -(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \ 29.162 -(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \ 29.163 - ;; /* avoid RAW on r16 & r17 */ \ 29.164 -(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \ 29.165 - st8 [r17]=r31,16; /* save predicates */ \ 29.166 -(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \ 29.167 - ;; \ 29.168 - st8 [r16]=r29,16; /* save b0 */ \ 29.169 - st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \ 29.170 - cmp.eq pNonSys,pSys=r0,r0 /* 
initialize pSys=0, pNonSys=1 */ \ 29.171 - ;; \ 29.172 -.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \ 29.173 -.mem.offset 8,0; st8.spill [r17]=r12,16; \ 29.174 - adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \ 29.175 - ;; \ 29.176 -.mem.offset 0,0; st8.spill [r16]=r13,16; \ 29.177 -.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ 29.178 - movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; \ 29.179 - ld8 r13=[r13]; /* establish 'current' */ \ 29.180 - ;; \ 29.181 -.mem.offset 0,0; st8.spill [r16]=r15,16; \ 29.182 -.mem.offset 8,0; st8.spill [r17]=r14,16; \ 29.183 - ;; \ 29.184 -.mem.offset 0,0; st8.spill [r16]=r2,16; \ 29.185 -.mem.offset 8,0; st8.spill [r17]=r3,16; \ 29.186 - adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ 29.187 - ;; \ 29.188 - EXTRA; \ 29.189 - movl r1=__gp; /* establish kernel global pointer */ \ 29.190 - ;; \ 29.191 - MINSTATE_END_SAVE_MIN 29.192 - 29.193 -/* 29.194 - * SAVE_REST saves the remainder of pt_regs (with psr.ic on). 29.195 - * 29.196 - * Assumed state upon entry: 29.197 - * psr.ic: on 29.198 - * r2: points to &pt_regs.r16 29.199 - * r3: points to &pt_regs.r17 29.200 - * r8: contents of ar.ccv 29.201 - * r9: contents of ar.csd 29.202 - * r10: contents of ar.ssd 29.203 - * r11: FPSR_DEFAULT 29.204 - * 29.205 - * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST. 29.206 - */ 29.207 -#define SAVE_REST \ 29.208 -.mem.offset 0,0; st8.spill [r2]=r16,16; \ 29.209 -.mem.offset 8,0; st8.spill [r3]=r17,16; \ 29.210 - ;; \ 29.211 -.mem.offset 0,0; st8.spill [r2]=r18,16; \ 29.212 -.mem.offset 8,0; st8.spill [r3]=r19,16; \ 29.213 - ;; \ 29.214 -.mem.offset 0,0; st8.spill [r2]=r20,16; \ 29.215 -.mem.offset 8,0; st8.spill [r3]=r21,16; \ 29.216 - mov r18=b6; \ 29.217 - ;; \ 29.218 -.mem.offset 0,0; st8.spill [r2]=r22,16; \ 29.219 -.mem.offset 8,0; st8.spill [r3]=r23,16; \ 29.220 - mov r19=b7; \ 29.221 - ;; \ 29.222 -.mem.offset 0,0; st8.spill [r2]=r24,16; \ 29.223 -.mem.offset 8,0; st8.spill [r3]=r25,16; \ 29.224 - ;; \ 29.225 -.mem.offset 0,0; st8.spill [r2]=r26,16; \ 29.226 -.mem.offset 8,0; st8.spill [r3]=r27,16; \ 29.227 - ;; \ 29.228 -.mem.offset 0,0; st8.spill [r2]=r28,16; \ 29.229 -.mem.offset 8,0; st8.spill [r3]=r29,16; \ 29.230 - ;; \ 29.231 -.mem.offset 0,0; st8.spill [r2]=r30,16; \ 29.232 -.mem.offset 8,0; st8.spill [r3]=r31,32; \ 29.233 - ;; \ 29.234 - mov ar.fpsr=r11; /* M-unit */ \ 29.235 - st8 [r2]=r8,8; /* ar.ccv */ \ 29.236 - adds r24=PT(B6)-PT(F7),r3; \ 29.237 - ;; \ 29.238 - stf.spill [r2]=f6,32; \ 29.239 - stf.spill [r3]=f7,32; \ 29.240 - ;; \ 29.241 - stf.spill [r2]=f8,32; \ 29.242 - stf.spill [r3]=f9,32; \ 29.243 - ;; \ 29.244 - stf.spill [r2]=f10; \ 29.245 - stf.spill [r3]=f11; \ 29.246 - adds r25=PT(B7)-PT(F11),r3; \ 29.247 - ;; \ 29.248 - st8 [r24]=r18,16; /* b6 */ \ 29.249 - st8 [r25]=r19,16; /* b7 */ \ 29.250 - ;; \ 29.251 - st8 [r24]=r9; /* ar.csd */ \ 29.252 - st8 [r25]=r10; /* ar.ssd */ \ 29.253 - ;; 29.254 - 29.255 -#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) 29.256 -#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) 29.257 -#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
30.1 --- a/xen/arch/ia64/linux/pcdp.h Tue Aug 30 17:51:51 2005 -0600 30.2 +++ b/xen/arch/ia64/linux/pcdp.h Wed Aug 31 14:32:27 2005 -0600 30.3 @@ -2,7 +2,7 @@ 30.4 * Definitions for PCDP-defined console devices 30.5 * 30.6 * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf 30.7 - * v2.0: http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf 30.8 + * v2.0: http://www.dig64.org/specifications/DIG64_PCDPv20.pdf 30.9 * 30.10 * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P. 30.11 * Khalid Aziz <khalid.aziz@hp.com> 30.12 @@ -52,11 +52,36 @@ struct pcdp_uart { 30.13 u32 clock_rate; 30.14 u8 pci_prog_intfc; 30.15 u8 flags; 30.16 -}; 30.17 + u16 conout_index; 30.18 + u32 reserved; 30.19 +} __attribute__((packed)); 30.20 + 30.21 +#define PCDP_IF_PCI 1 30.22 + 30.23 +/* pcdp_if_pci.trans */ 30.24 +#define PCDP_PCI_TRANS_IOPORT 0x02 30.25 +#define PCDP_PCI_TRANS_MMIO 0x01 30.26 + 30.27 +struct pcdp_if_pci { 30.28 + u8 interconnect; 30.29 + u8 reserved; 30.30 + u16 length; 30.31 + u8 segment; 30.32 + u8 bus; 30.33 + u8 dev; 30.34 + u8 fun; 30.35 + u16 dev_id; 30.36 + u16 vendor_id; 30.37 + u32 acpi_interrupt; 30.38 + u64 mmio_tra; 30.39 + u64 ioport_tra; 30.40 + u8 flags; 30.41 + u8 trans; 30.42 +} __attribute__((packed)); 30.43 30.44 struct pcdp_vga { 30.45 u8 count; /* address space descriptors */ 30.46 -}; 30.47 +} __attribute__((packed)); 30.48 30.49 /* pcdp_device.flags */ 30.50 #define PCDP_PRIMARY_CONSOLE 1 30.51 @@ -66,7 +91,9 @@ struct pcdp_device { 30.52 u8 flags; 30.53 u16 length; 30.54 u16 efi_index; 30.55 -}; 30.56 + /* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */ 30.57 + /* next data is device specific type (currently only pcdp_vga) */ 30.58 +} __attribute__((packed)); 30.59 30.60 struct pcdp { 30.61 u8 signature[4]; 30.62 @@ -81,4 +108,4 @@ struct pcdp { 30.63 u32 num_uarts; 30.64 struct pcdp_uart uart[0]; /* actual size is num_uarts */ 30.65 /* remainder of table is pcdp_device structures */ 30.66 -}; 30.67 +} __attribute__((packed));
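The pcdp.h hunk marks every table structure __attribute__((packed)) because the PCDP table is laid out by firmware with no padding between fields; without the attribute the compiler may insert alignment padding, and overlaying the struct on the table would read the wrong bytes. A standalone illustration (not tree code; the example names are made up):

#include <stdio.h>

typedef unsigned char  u8;
typedef unsigned int   u32;

struct unpacked_example { u8 type; u32 length; };                         /* padded */
struct packed_example   { u8 type; u32 length; } __attribute__((packed)); /* 5 bytes */

int main(void)
{
        /* Typically prints "8 5": the unpacked struct gets 3 bytes of
         * padding after 'type' so that 'length' is 4-byte aligned. */
        printf("%zu %zu\n", sizeof(struct unpacked_example),
               sizeof(struct packed_example));
        return 0;
}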
31.1 --- a/xen/arch/ia64/pdb-stub.c Tue Aug 30 17:51:51 2005 -0600 31.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 31.3 @@ -1,59 +0,0 @@ 31.4 - 31.5 -/* 31.6 - * pervasive debugger 31.7 - * www.cl.cam.ac.uk/netos/pdb 31.8 - * 31.9 - * alex ho 31.10 - * 2004 31.11 - * university of cambridge computer laboratory 31.12 - * 31.13 - * code adapted originally from kgdb, nemesis, & gdbserver 31.14 - */ 31.15 - 31.16 -#include <xen/lib.h> 31.17 -#include <xen/sched.h> 31.18 -#include <asm/ptrace.h> 31.19 -#include <xen/keyhandler.h> 31.20 -#include <asm/processor.h> 31.21 -#include <asm/pdb.h> 31.22 -#include <xen/list.h> 31.23 -#include <xen/serial.h> 31.24 - 31.25 -#define __PDB_GET_VAL 1 31.26 -#define __PDB_SET_VAL 2 31.27 - 31.28 -/* 31.29 - * Read or write memory in an address space 31.30 - */ 31.31 -int pdb_change_values(u_char *buffer, int length, 31.32 - unsigned long cr3, unsigned long addr, int rw) 31.33 -{ 31.34 - dummy(); 31.35 - return 0; 31.36 -} 31.37 - 31.38 -/* 31.39 - * Set memory in a domain's address space 31.40 - * Set "length" bytes at "address" from "domain" to the values in "buffer". 31.41 - * Return the number of bytes set, 0 if there was a problem. 31.42 - */ 31.43 - 31.44 -int pdb_set_values(u_char *buffer, int length, 31.45 - unsigned long cr3, unsigned long addr) 31.46 -{ 31.47 - int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL); 31.48 - return count; 31.49 -} 31.50 - 31.51 -/* 31.52 - * Read memory from a domain's address space. 31.53 - * Fetch "length" bytes at "address" from "domain" into "buffer". 31.54 - * Return the number of bytes read, 0 if there was a problem. 31.55 - */ 31.56 - 31.57 -int pdb_get_values(u_char *buffer, int length, 31.58 - unsigned long cr3, unsigned long addr) 31.59 -{ 31.60 - return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL); 31.61 -} 31.62 -
32.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 32.2 +++ b/xen/include/asm-ia64/linux/sort.h Wed Aug 31 14:32:27 2005 -0600 32.3 @@ -0,0 +1,10 @@ 32.4 +#ifndef _LINUX_SORT_H 32.5 +#define _LINUX_SORT_H 32.6 + 32.7 +#include <linux/types.h> 32.8 + 32.9 +void sort(void *base, size_t num, size_t size, 32.10 + int (*cmp)(const void *, const void *), 32.11 + void (*swap)(void *, void *, int)); 32.12 + 32.13 +#endif
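The new sort.h declares the generic sort() helper borrowed from Linux. A hypothetical usage sketch follows; in the Linux lib/sort.c implementation this header fronts, passing NULL for the swap callback selects a built-in byte-wise swap, though that is the implementation's behavior rather than a guarantee of the header itself.

#include <stddef.h>

/* Prototype as declared in sort.h above. */
void sort(void *base, size_t num, size_t size,
          int (*cmp)(const void *, const void *),
          void (*swap)(void *, void *, int));

static int cmp_int(const void *a, const void *b)
{
        const int *x = a, *y = b;

        return (*x > *y) - (*x < *y);         /* avoids overflow of x - y */
}

void sort_example(void)                       /* hypothetical caller */
{
        int v[] = { 3, 1, 2 };

        sort(v, 3, sizeof(int), cmp_int, NULL); /* NULL: use the generic swap */
}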