direct-io.hg

changeset 6428:e2127f19861b

We no longer need Linux sources to build Xen.
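
The ia64 tree now carries its own copies of the Linux sources and headers it
depends on (xen/arch/ia64/linux and xen/include/asm-ia64/linux), and the build
reaches them through VPATH plus the extra -I paths added in Rules.mk, instead of
through an external Linux source tree. A minimal sketch of the mechanism (the
VPATH value and directory layout match this changeset; the explicit rule below is
illustrative only, not copied from the Makefile):

    # Illustrative sketch: VPATH makes make search linux/ for prerequisites
    # that are not present in the current directory.
    VPATH = linux

    # cmdline.c exists only as linux/cmdline.c (copied from the Linux tree),
    # so make resolves the prerequisite there and $< expands to that path.
    cmdline.o: cmdline.c
    	$(CC) $(CFLAGS) -c -o $@ $<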

Signed-off-by: Arun Sharma <arun.sharma@intel.com>
author adsharma@xuni-t01.sc.intel.com
date Tue Aug 02 15:59:09 2005 -0800 (2005-08-02)
parents e173a853dc46
children f242de2e5a3c
files xen/arch/ia64/Makefile xen/arch/ia64/Rules.mk xen/arch/ia64/efi.c xen/arch/ia64/entry.S xen/arch/ia64/entry.h xen/arch/ia64/head.S xen/arch/ia64/irq_ia64.c xen/arch/ia64/linux/cmdline.c xen/arch/ia64/linux/efi_stub.S xen/arch/ia64/linux/extable.c xen/arch/ia64/linux/hpsim.S xen/arch/ia64/linux/ia64_ksyms.c xen/arch/ia64/linux/irq_lsapic.c xen/arch/ia64/linux/lib/Makefile xen/arch/ia64/linux/lib/bitop.c xen/arch/ia64/linux/lib/carta_random.S xen/arch/ia64/linux/lib/checksum.c xen/arch/ia64/linux/lib/clear_page.S xen/arch/ia64/linux/lib/clear_user.S xen/arch/ia64/linux/lib/copy_page.S xen/arch/ia64/linux/lib/copy_page_mck.S xen/arch/ia64/linux/lib/copy_user.S xen/arch/ia64/linux/lib/csum_partial_copy.c xen/arch/ia64/linux/lib/dec_and_lock.c xen/arch/ia64/linux/lib/do_csum.S xen/arch/ia64/linux/lib/flush.S xen/arch/ia64/linux/lib/idiv32.S xen/arch/ia64/linux/lib/idiv64.S xen/arch/ia64/linux/lib/io.c xen/arch/ia64/linux/lib/ip_fast_csum.S xen/arch/ia64/linux/lib/memcpy.S xen/arch/ia64/linux/lib/memcpy_mck.S xen/arch/ia64/linux/lib/memset.S xen/arch/ia64/linux/lib/strlen.S xen/arch/ia64/linux/lib/strlen_user.S xen/arch/ia64/linux/lib/strncpy_from_user.S xen/arch/ia64/linux/lib/strnlen_user.S xen/arch/ia64/linux/lib/xor.S xen/arch/ia64/linux/linuxextable.c xen/arch/ia64/linux/machvec.c xen/arch/ia64/linux/minstate.h xen/arch/ia64/linux/patch.c xen/arch/ia64/linux/pcdp.h xen/arch/ia64/linux/sal.c xen/arch/ia64/mm_contig.c xen/arch/ia64/pal.S xen/arch/ia64/setup.c xen/arch/ia64/time.c xen/arch/ia64/tlb.c xen/arch/ia64/tools/mkbuildtree xen/arch/ia64/unaligned.c xen/arch/ia64/xen.lds.S xen/include/asm-ia64/gcc_intrin.h xen/include/asm-ia64/hpsim_ssc.h xen/include/asm-ia64/ia64regs.h xen/include/asm-ia64/io.h xen/include/asm-ia64/kregs.h xen/include/asm-ia64/linux/asm-generic/bug.h xen/include/asm-ia64/linux/asm-generic/div64.h xen/include/asm-ia64/linux/asm-generic/errno-base.h xen/include/asm-ia64/linux/asm-generic/errno.h xen/include/asm-ia64/linux/asm-generic/ide_iops.h xen/include/asm-ia64/linux/asm-generic/iomap.h xen/include/asm-ia64/linux/asm-generic/pci-dma-compat.h xen/include/asm-ia64/linux/asm-generic/pci.h xen/include/asm-ia64/linux/asm-generic/pgtable-nopud.h xen/include/asm-ia64/linux/asm-generic/pgtable.h xen/include/asm-ia64/linux/asm-generic/sections.h xen/include/asm-ia64/linux/asm-generic/topology.h xen/include/asm-ia64/linux/asm-generic/vmlinux.lds.h xen/include/asm-ia64/linux/asm/acpi.h xen/include/asm-ia64/linux/asm/asmmacro.h xen/include/asm-ia64/linux/asm/atomic.h xen/include/asm-ia64/linux/asm/bitops.h xen/include/asm-ia64/linux/asm/break.h xen/include/asm-ia64/linux/asm/bug.h xen/include/asm-ia64/linux/asm/byteorder.h xen/include/asm-ia64/linux/asm/cache.h xen/include/asm-ia64/linux/asm/cacheflush.h xen/include/asm-ia64/linux/asm/checksum.h xen/include/asm-ia64/linux/asm/current.h xen/include/asm-ia64/linux/asm/delay.h xen/include/asm-ia64/linux/asm/desc.h xen/include/asm-ia64/linux/asm/div64.h xen/include/asm-ia64/linux/asm/dma-mapping.h xen/include/asm-ia64/linux/asm/dma.h xen/include/asm-ia64/linux/asm/errno.h xen/include/asm-ia64/linux/asm/fpu.h xen/include/asm-ia64/linux/asm/hardirq.h xen/include/asm-ia64/linux/asm/hdreg.h xen/include/asm-ia64/linux/asm/hw_irq.h xen/include/asm-ia64/linux/asm/ia32.h xen/include/asm-ia64/linux/asm/intrinsics.h xen/include/asm-ia64/linux/asm/ioctl.h xen/include/asm-ia64/linux/asm/irq.h xen/include/asm-ia64/linux/asm/linkage.h xen/include/asm-ia64/linux/asm/machvec.h xen/include/asm-ia64/linux/asm/machvec_hpsim.h 
xen/include/asm-ia64/linux/asm/mca.h xen/include/asm-ia64/linux/asm/meminit.h xen/include/asm-ia64/linux/asm/mman.h xen/include/asm-ia64/linux/asm/module.h xen/include/asm-ia64/linux/asm/numa.h xen/include/asm-ia64/linux/asm/param.h xen/include/asm-ia64/linux/asm/patch.h xen/include/asm-ia64/linux/asm/pci.h xen/include/asm-ia64/linux/asm/pdb.h xen/include/asm-ia64/linux/asm/percpu.h xen/include/asm-ia64/linux/asm/pgtable.h xen/include/asm-ia64/linux/asm/ptrace_offsets.h xen/include/asm-ia64/linux/asm/rse.h xen/include/asm-ia64/linux/asm/rwsem.h xen/include/asm-ia64/linux/asm/sal.h xen/include/asm-ia64/linux/asm/scatterlist.h xen/include/asm-ia64/linux/asm/sections.h xen/include/asm-ia64/linux/asm/semaphore.h xen/include/asm-ia64/linux/asm/setup.h xen/include/asm-ia64/linux/asm/sigcontext.h xen/include/asm-ia64/linux/asm/signal.h xen/include/asm-ia64/linux/asm/smp.h xen/include/asm-ia64/linux/asm/sn/arch.h xen/include/asm-ia64/linux/asm/sn/geo.h xen/include/asm-ia64/linux/asm/sn/nodepda.h xen/include/asm-ia64/linux/asm/sn/sn_cpuid.h xen/include/asm-ia64/linux/asm/sn/sn_sal.h xen/include/asm-ia64/linux/asm/spinlock.h xen/include/asm-ia64/linux/asm/string.h xen/include/asm-ia64/linux/asm/thread_info.h xen/include/asm-ia64/linux/asm/timex.h xen/include/asm-ia64/linux/asm/tlbflush.h xen/include/asm-ia64/linux/asm/topology.h xen/include/asm-ia64/linux/asm/unaligned.h xen/include/asm-ia64/linux/asm/unistd.h xen/include/asm-ia64/linux/asm/unwind.h xen/include/asm-ia64/linux/asm/ustack.h xen/include/asm-ia64/linux/bcd.h xen/include/asm-ia64/linux/bitmap.h xen/include/asm-ia64/linux/bitops.h xen/include/asm-ia64/linux/bootmem.h xen/include/asm-ia64/linux/byteorder/generic.h xen/include/asm-ia64/linux/byteorder/little_endian.h xen/include/asm-ia64/linux/byteorder/swab.h xen/include/asm-ia64/linux/cpu.h xen/include/asm-ia64/linux/cpumask.h xen/include/asm-ia64/linux/device.h xen/include/asm-ia64/linux/dma-mapping.h xen/include/asm-ia64/linux/efi.h xen/include/asm-ia64/linux/err.h xen/include/asm-ia64/linux/file.h xen/include/asm-ia64/linux/gfp.h xen/include/asm-ia64/linux/hardirq.h xen/include/asm-ia64/linux/initrd.h xen/include/asm-ia64/linux/interrupt.h xen/include/asm-ia64/linux/ioport.h xen/include/asm-ia64/linux/jiffies.h xen/include/asm-ia64/linux/kernel_stat.h xen/include/asm-ia64/linux/kmalloc_sizes.h xen/include/asm-ia64/linux/linkage.h xen/include/asm-ia64/linux/linuxtime.h xen/include/asm-ia64/linux/mmzone.h xen/include/asm-ia64/linux/module.h xen/include/asm-ia64/linux/numa.h xen/include/asm-ia64/linux/page-flags.h xen/include/asm-ia64/linux/percpu.h xen/include/asm-ia64/linux/preempt.h xen/include/asm-ia64/linux/proc_fs.h xen/include/asm-ia64/linux/profile.h xen/include/asm-ia64/linux/ptrace.h xen/include/asm-ia64/linux/random.h xen/include/asm-ia64/linux/rbtree.h xen/include/asm-ia64/linux/rtc.h xen/include/asm-ia64/linux/rwsem.h xen/include/asm-ia64/linux/seq_file.h xen/include/asm-ia64/linux/seqlock.h xen/include/asm-ia64/linux/serial.h xen/include/asm-ia64/linux/serial_core.h xen/include/asm-ia64/linux/signal.h xen/include/asm-ia64/linux/slab.h xen/include/asm-ia64/linux/smp_lock.h xen/include/asm-ia64/linux/stddef.h xen/include/asm-ia64/linux/swap.h xen/include/asm-ia64/linux/thread_info.h xen/include/asm-ia64/linux/threads.h xen/include/asm-ia64/linux/timex.h xen/include/asm-ia64/linux/topology.h xen/include/asm-ia64/linux/tty.h xen/include/asm-ia64/linux/wait.h xen/include/asm-ia64/mca_asm.h xen/include/asm-ia64/page.h xen/include/asm-ia64/pal.h xen/include/asm-ia64/pgalloc.h 
xen/include/asm-ia64/processor.h xen/include/asm-ia64/ptrace.h xen/include/asm-ia64/system.h xen/include/asm-ia64/types.h xen/include/asm-ia64/uaccess.h
line diff
     1.1 --- a/xen/arch/ia64/Makefile	Tue Aug 02 10:20:46 2005 -0700
     1.2 +++ b/xen/arch/ia64/Makefile	Tue Aug 02 15:59:09 2005 -0800
     1.3 @@ -1,5 +1,7 @@
     1.4  include $(BASEDIR)/Rules.mk
     1.5  
     1.6 +VPATH = linux
     1.7 +
     1.8  # libs-y	+= arch/ia64/lib/lib.a
     1.9  
    1.10  OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \
    1.11 @@ -75,7 +77,7 @@ xen.lds.s: xen.lds.S
    1.12  		-o xen.lds.s xen.lds.S
    1.13  
    1.14  ia64lib.o:
    1.15 -	$(MAKE) -C lib && cp lib/ia64lib.o .
    1.16 +	$(MAKE) -C linux/lib && cp linux/lib/ia64lib.o .
    1.17  
    1.18  clean:
    1.19  	rm -f *.o *~ core  xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s
     2.1 --- a/xen/arch/ia64/Rules.mk	Tue Aug 02 10:20:46 2005 -0700
     2.2 +++ b/xen/arch/ia64/Rules.mk	Tue Aug 02 15:59:09 2005 -0800
     2.3 @@ -6,14 +6,16 @@ ifneq ($(COMPILE_ARCH),$(TARGET_ARCH))
     2.4  CROSS_COMPILE ?= /usr/local/sp_env/v2.2.5/i686/bin/ia64-unknown-linux-
     2.5  endif
     2.6  AFLAGS  += -D__ASSEMBLY__
     2.7 -CPPFLAGS  += -I$(BASEDIR)/include -I$(BASEDIR)/include/asm-ia64
     2.8 +CPPFLAGS  += -I$(BASEDIR)/include -I$(BASEDIR)/include/asm-ia64 \
     2.9 +             -I$(BASEDIR)/include/asm-ia64/linux -I$(BASEDIR)/arch/ia64/linux
    2.10  CFLAGS  := -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
    2.11  #CFLAGS  += -O3		# -O3 over-inlines making debugging tough!
    2.12  CFLAGS  += -O2		# but no optimization causes compile errors!
    2.13  #CFLAGS  += -iwithprefix include -Wall -DMONITOR_BASE=$(MONITOR_BASE)
    2.14  CFLAGS  += -iwithprefix include -Wall
    2.15  CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
    2.16 -CFLAGS  += -I$(BASEDIR)/include/asm-ia64
    2.17 +CFLAGS  += -I$(BASEDIR)/include/asm-ia64 -I$(BASEDIR)/include/asm-ia64/linux \
    2.18 +           -I$(BASEDIR)/arch/ia64/linux -I$(BASEDIR)/arch/ia64
    2.19  CFLAGS  += -Wno-pointer-arith -Wredundant-decls
    2.20  CFLAGS  += -DIA64 -DXEN -DLINUX_2_6
    2.21  CFLAGS	+= -ffixed-r13 -mfixed-range=f12-f15,f32-f127
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/ia64/efi.c	Tue Aug 02 15:59:09 2005 -0800
     3.3 @@ -0,0 +1,866 @@
     3.4 +/*
     3.5 + * Extensible Firmware Interface
     3.6 + *
     3.7 + * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999
     3.8 + *
     3.9 + * Copyright (C) 1999 VA Linux Systems
    3.10 + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
    3.11 + * Copyright (C) 1999-2003 Hewlett-Packard Co.
    3.12 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    3.13 + *	Stephane Eranian <eranian@hpl.hp.com>
    3.14 + *
     3.15 + * Not all EFI Runtime Services are implemented yet, as EFI only
    3.16 + * supports physical mode addressing on SoftSDV. This is to be fixed
    3.17 + * in a future version.  --drummond 1999-07-20
    3.18 + *
    3.19 + * Implemented EFI runtime services and virtual mode calls.  --davidm
    3.20 + *
    3.21 + * Goutham Rao: <goutham.rao@intel.com>
    3.22 + *	Skip non-WB memory and ignore empty memory ranges.
    3.23 + */
    3.24 +#include <linux/config.h>
    3.25 +#include <linux/module.h>
    3.26 +#include <linux/kernel.h>
    3.27 +#include <linux/init.h>
    3.28 +#include <linux/types.h>
    3.29 +#include <linux/time.h>
    3.30 +#include <linux/efi.h>
    3.31 +
    3.32 +#include <asm/io.h>
    3.33 +#include <asm/kregs.h>
    3.34 +#include <asm/meminit.h>
    3.35 +#include <asm/pgtable.h>
    3.36 +#include <asm/processor.h>
    3.37 +#include <asm/mca.h>
    3.38 +
    3.39 +#define EFI_DEBUG	0
    3.40 +
    3.41 +extern efi_status_t efi_call_phys (void *, ...);
    3.42 +
    3.43 +struct efi efi;
    3.44 +EXPORT_SYMBOL(efi);
    3.45 +static efi_runtime_services_t *runtime;
    3.46 +static unsigned long mem_limit = ~0UL, max_addr = ~0UL;
    3.47 +
    3.48 +#define efi_call_virt(f, args...)	(*(f))(args)
    3.49 +
    3.50 +#define STUB_GET_TIME(prefix, adjust_arg)							  \
    3.51 +static efi_status_t										  \
    3.52 +prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)						  \
    3.53 +{												  \
    3.54 +	struct ia64_fpreg fr[6];								  \
    3.55 +	efi_time_cap_t *atc = NULL;								  \
    3.56 +	efi_status_t ret;									  \
    3.57 +												  \
    3.58 +	if (tc)											  \
    3.59 +		atc = adjust_arg(tc);								  \
    3.60 +	ia64_save_scratch_fpregs(fr);								  \
    3.61 +	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \
    3.62 +	ia64_load_scratch_fpregs(fr);								  \
    3.63 +	return ret;										  \
    3.64 +}
    3.65 +
    3.66 +#define STUB_SET_TIME(prefix, adjust_arg)							\
    3.67 +static efi_status_t										\
    3.68 +prefix##_set_time (efi_time_t *tm)								\
    3.69 +{												\
    3.70 +	struct ia64_fpreg fr[6];								\
    3.71 +	efi_status_t ret;									\
    3.72 +												\
    3.73 +	ia64_save_scratch_fpregs(fr);								\
    3.74 +	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm));	\
    3.75 +	ia64_load_scratch_fpregs(fr);								\
    3.76 +	return ret;										\
    3.77 +}
    3.78 +
    3.79 +#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)						\
    3.80 +static efi_status_t										\
    3.81 +prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm)		\
    3.82 +{												\
    3.83 +	struct ia64_fpreg fr[6];								\
    3.84 +	efi_status_t ret;									\
    3.85 +												\
    3.86 +	ia64_save_scratch_fpregs(fr);								\
    3.87 +	ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),	\
    3.88 +				adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));	\
    3.89 +	ia64_load_scratch_fpregs(fr);								\
    3.90 +	return ret;										\
    3.91 +}
    3.92 +
    3.93 +#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)						\
    3.94 +static efi_status_t										\
    3.95 +prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)					\
    3.96 +{												\
    3.97 +	struct ia64_fpreg fr[6];								\
    3.98 +	efi_time_t *atm = NULL;									\
    3.99 +	efi_status_t ret;									\
   3.100 +												\
   3.101 +	if (tm)											\
   3.102 +		atm = adjust_arg(tm);								\
   3.103 +	ia64_save_scratch_fpregs(fr);								\
   3.104 +	ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),	\
   3.105 +				enabled, atm);							\
   3.106 +	ia64_load_scratch_fpregs(fr);								\
   3.107 +	return ret;										\
   3.108 +}
   3.109 +
   3.110 +#define STUB_GET_VARIABLE(prefix, adjust_arg)						\
   3.111 +static efi_status_t									\
   3.112 +prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,		\
   3.113 +		       unsigned long *data_size, void *data)				\
   3.114 +{											\
   3.115 +	struct ia64_fpreg fr[6];							\
   3.116 +	u32 *aattr = NULL;									\
   3.117 +	efi_status_t ret;								\
   3.118 +											\
   3.119 +	if (attr)									\
   3.120 +		aattr = adjust_arg(attr);						\
   3.121 +	ia64_save_scratch_fpregs(fr);							\
   3.122 +	ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable),	\
   3.123 +				adjust_arg(name), adjust_arg(vendor), aattr,		\
   3.124 +				adjust_arg(data_size), adjust_arg(data));		\
   3.125 +	ia64_load_scratch_fpregs(fr);							\
   3.126 +	return ret;									\
   3.127 +}
   3.128 +
   3.129 +#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)						\
   3.130 +static efi_status_t										\
   3.131 +prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor)	\
   3.132 +{												\
   3.133 +	struct ia64_fpreg fr[6];								\
   3.134 +	efi_status_t ret;									\
   3.135 +												\
   3.136 +	ia64_save_scratch_fpregs(fr);								\
   3.137 +	ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable),	\
   3.138 +				adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));	\
   3.139 +	ia64_load_scratch_fpregs(fr);								\
   3.140 +	return ret;										\
   3.141 +}
   3.142 +
   3.143 +#define STUB_SET_VARIABLE(prefix, adjust_arg)						\
   3.144 +static efi_status_t									\
   3.145 +prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr,	\
   3.146 +		       unsigned long data_size, void *data)				\
   3.147 +{											\
   3.148 +	struct ia64_fpreg fr[6];							\
   3.149 +	efi_status_t ret;								\
   3.150 +											\
   3.151 +	ia64_save_scratch_fpregs(fr);							\
   3.152 +	ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable),	\
   3.153 +				adjust_arg(name), adjust_arg(vendor), attr, data_size,	\
   3.154 +				adjust_arg(data));					\
   3.155 +	ia64_load_scratch_fpregs(fr);							\
   3.156 +	return ret;									\
   3.157 +}
   3.158 +
   3.159 +#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)					\
   3.160 +static efi_status_t										\
   3.161 +prefix##_get_next_high_mono_count (u32 *count)							\
   3.162 +{												\
   3.163 +	struct ia64_fpreg fr[6];								\
   3.164 +	efi_status_t ret;									\
   3.165 +												\
   3.166 +	ia64_save_scratch_fpregs(fr);								\
   3.167 +	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)				\
   3.168 +				__va(runtime->get_next_high_mono_count), adjust_arg(count));	\
   3.169 +	ia64_load_scratch_fpregs(fr);								\
   3.170 +	return ret;										\
   3.171 +}
   3.172 +
   3.173 +#define STUB_RESET_SYSTEM(prefix, adjust_arg)					\
   3.174 +static void									\
   3.175 +prefix##_reset_system (int reset_type, efi_status_t status,			\
   3.176 +		       unsigned long data_size, efi_char16_t *data)		\
   3.177 +{										\
   3.178 +	struct ia64_fpreg fr[6];						\
   3.179 +	efi_char16_t *adata = NULL;						\
   3.180 +										\
   3.181 +	if (data)								\
   3.182 +		adata = adjust_arg(data);					\
   3.183 +										\
   3.184 +	ia64_save_scratch_fpregs(fr);						\
   3.185 +	efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system),	\
   3.186 +			  reset_type, status, data_size, adata);		\
   3.187 +	/* should not return, but just in case... */				\
   3.188 +	ia64_load_scratch_fpregs(fr);						\
   3.189 +}
   3.190 +
   3.191 +#define phys_ptr(arg)	((__typeof__(arg)) ia64_tpa(arg))
   3.192 +
   3.193 +STUB_GET_TIME(phys, phys_ptr)
   3.194 +STUB_SET_TIME(phys, phys_ptr)
   3.195 +STUB_GET_WAKEUP_TIME(phys, phys_ptr)
   3.196 +STUB_SET_WAKEUP_TIME(phys, phys_ptr)
   3.197 +STUB_GET_VARIABLE(phys, phys_ptr)
   3.198 +STUB_GET_NEXT_VARIABLE(phys, phys_ptr)
   3.199 +STUB_SET_VARIABLE(phys, phys_ptr)
   3.200 +STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr)
   3.201 +STUB_RESET_SYSTEM(phys, phys_ptr)
   3.202 +
   3.203 +#define id(arg)	arg
   3.204 +
   3.205 +STUB_GET_TIME(virt, id)
   3.206 +STUB_SET_TIME(virt, id)
   3.207 +STUB_GET_WAKEUP_TIME(virt, id)
   3.208 +STUB_SET_WAKEUP_TIME(virt, id)
   3.209 +STUB_GET_VARIABLE(virt, id)
   3.210 +STUB_GET_NEXT_VARIABLE(virt, id)
   3.211 +STUB_SET_VARIABLE(virt, id)
   3.212 +STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id)
   3.213 +STUB_RESET_SYSTEM(virt, id)
   3.214 +
   3.215 +void
   3.216 +efi_gettimeofday (struct timespec *ts)
   3.217 +{
   3.218 +	efi_time_t tm;
   3.219 +
    3.220 +	memset(ts, 0, sizeof(*ts));
   3.221 +	if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS)
   3.222 +		return;
   3.223 +
   3.224 +	ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second);
   3.225 +	ts->tv_nsec = tm.nanosecond;
   3.226 +}
   3.227 +
   3.228 +static int
   3.229 +is_available_memory (efi_memory_desc_t *md)
   3.230 +{
   3.231 +	if (!(md->attribute & EFI_MEMORY_WB))
   3.232 +		return 0;
   3.233 +
   3.234 +	switch (md->type) {
   3.235 +	      case EFI_LOADER_CODE:
   3.236 +	      case EFI_LOADER_DATA:
   3.237 +	      case EFI_BOOT_SERVICES_CODE:
   3.238 +	      case EFI_BOOT_SERVICES_DATA:
   3.239 +	      case EFI_CONVENTIONAL_MEMORY:
   3.240 +		return 1;
   3.241 +	}
   3.242 +	return 0;
   3.243 +}
   3.244 +
   3.245 +/*
    3.246 + * Trim descriptor MD so it starts at address START_ADDR.  If the descriptor covers
   3.247 + * memory that is normally available to the kernel, issue a warning that some memory
   3.248 + * is being ignored.
   3.249 + */
   3.250 +static void
   3.251 +trim_bottom (efi_memory_desc_t *md, u64 start_addr)
   3.252 +{
   3.253 +	u64 num_skipped_pages;
   3.254 +
   3.255 +	if (md->phys_addr >= start_addr || !md->num_pages)
   3.256 +		return;
   3.257 +
   3.258 +	num_skipped_pages = (start_addr - md->phys_addr) >> EFI_PAGE_SHIFT;
   3.259 +	if (num_skipped_pages > md->num_pages)
   3.260 +		num_skipped_pages = md->num_pages;
   3.261 +
   3.262 +	if (is_available_memory(md))
   3.263 +		printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole "
   3.264 +		       "at 0x%lx\n", __FUNCTION__,
   3.265 +		       (num_skipped_pages << EFI_PAGE_SHIFT) >> 10,
   3.266 +		       md->phys_addr, start_addr - IA64_GRANULE_SIZE);
   3.267 +	/*
   3.268 +	 * NOTE: Don't set md->phys_addr to START_ADDR because that could cause the memory
   3.269 +	 * descriptor list to become unsorted.  In such a case, md->num_pages will be
   3.270 +	 * zero, so the Right Thing will happen.
   3.271 +	 */
   3.272 +	md->phys_addr += num_skipped_pages << EFI_PAGE_SHIFT;
   3.273 +	md->num_pages -= num_skipped_pages;
   3.274 +}
   3.275 +
   3.276 +static void
   3.277 +trim_top (efi_memory_desc_t *md, u64 end_addr)
   3.278 +{
   3.279 +	u64 num_dropped_pages, md_end_addr;
   3.280 +
   3.281 +	md_end_addr = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
   3.282 +
   3.283 +	if (md_end_addr <= end_addr || !md->num_pages)
   3.284 +		return;
   3.285 +
   3.286 +	num_dropped_pages = (md_end_addr - end_addr) >> EFI_PAGE_SHIFT;
   3.287 +	if (num_dropped_pages > md->num_pages)
   3.288 +		num_dropped_pages = md->num_pages;
   3.289 +
   3.290 +	if (is_available_memory(md))
   3.291 +		printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole "
   3.292 +		       "at 0x%lx\n", __FUNCTION__,
   3.293 +		       (num_dropped_pages << EFI_PAGE_SHIFT) >> 10,
   3.294 +		       md->phys_addr, end_addr);
   3.295 +	md->num_pages -= num_dropped_pages;
   3.296 +}
   3.297 +
   3.298 +/*
   3.299 + * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that
   3.300 + * has memory that is available for OS use.
   3.301 + */
   3.302 +void
   3.303 +efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
   3.304 +{
   3.305 +	int prev_valid = 0;
   3.306 +	struct range {
   3.307 +		u64 start;
   3.308 +		u64 end;
   3.309 +	} prev, curr;
   3.310 +	void *efi_map_start, *efi_map_end, *p, *q;
   3.311 +	efi_memory_desc_t *md, *check_md;
   3.312 +	u64 efi_desc_size, start, end, granule_addr, last_granule_addr, first_non_wb_addr = 0;
   3.313 +	unsigned long total_mem = 0;
   3.314 +
   3.315 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.316 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.317 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.318 +
   3.319 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.320 +		md = p;
   3.321 +
   3.322 +		/* skip over non-WB memory descriptors; that's all we're interested in... */
   3.323 +		if (!(md->attribute & EFI_MEMORY_WB))
   3.324 +			continue;
   3.325 +
   3.326 +#ifdef XEN
   3.327 +// this works around a problem in the ski bootloader
   3.328 +{
   3.329 +		extern long running_on_sim;
   3.330 +		if (running_on_sim && md->type != EFI_CONVENTIONAL_MEMORY)
   3.331 +			continue;
   3.332 +}
   3.333 +// this is a temporary hack to avoid CONFIG_VIRTUAL_MEM_MAP
   3.334 +		if (md->phys_addr >= 0x100000000) continue;
   3.335 +#endif
   3.336 +		/*
   3.337 +		 * granule_addr is the base of md's first granule.
   3.338 +		 * [granule_addr - first_non_wb_addr) is guaranteed to
   3.339 +		 * be contiguous WB memory.
   3.340 +		 */
   3.341 +		granule_addr = GRANULEROUNDDOWN(md->phys_addr);
   3.342 +		first_non_wb_addr = max(first_non_wb_addr, granule_addr);
   3.343 +
   3.344 +		if (first_non_wb_addr < md->phys_addr) {
   3.345 +			trim_bottom(md, granule_addr + IA64_GRANULE_SIZE);
   3.346 +			granule_addr = GRANULEROUNDDOWN(md->phys_addr);
   3.347 +			first_non_wb_addr = max(first_non_wb_addr, granule_addr);
   3.348 +		}
   3.349 +
   3.350 +		for (q = p; q < efi_map_end; q += efi_desc_size) {
   3.351 +			check_md = q;
   3.352 +
   3.353 +			if ((check_md->attribute & EFI_MEMORY_WB) &&
   3.354 +			    (check_md->phys_addr == first_non_wb_addr))
   3.355 +				first_non_wb_addr += check_md->num_pages << EFI_PAGE_SHIFT;
   3.356 +			else
   3.357 +				break;		/* non-WB or hole */
   3.358 +		}
   3.359 +
   3.360 +		last_granule_addr = GRANULEROUNDDOWN(first_non_wb_addr);
   3.361 +		if (last_granule_addr < md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))
   3.362 +			trim_top(md, last_granule_addr);
   3.363 +
   3.364 +		if (is_available_memory(md)) {
   3.365 +			if (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) >= max_addr) {
   3.366 +				if (md->phys_addr >= max_addr)
   3.367 +					continue;
   3.368 +				md->num_pages = (max_addr - md->phys_addr) >> EFI_PAGE_SHIFT;
   3.369 +				first_non_wb_addr = max_addr;
   3.370 +			}
   3.371 +
   3.372 +			if (total_mem >= mem_limit)
   3.373 +				continue;
   3.374 +
   3.375 +			if (total_mem + (md->num_pages << EFI_PAGE_SHIFT) > mem_limit) {
   3.376 +				unsigned long limit_addr = md->phys_addr;
   3.377 +
   3.378 +				limit_addr += mem_limit - total_mem;
   3.379 +				limit_addr = GRANULEROUNDDOWN(limit_addr);
   3.380 +
   3.381 +				if (md->phys_addr > limit_addr)
   3.382 +					continue;
   3.383 +
   3.384 +				md->num_pages = (limit_addr - md->phys_addr) >>
   3.385 +				                EFI_PAGE_SHIFT;
   3.386 +				first_non_wb_addr = max_addr = md->phys_addr +
   3.387 +				              (md->num_pages << EFI_PAGE_SHIFT);
   3.388 +			}
   3.389 +			total_mem += (md->num_pages << EFI_PAGE_SHIFT);
   3.390 +
   3.391 +			if (md->num_pages == 0)
   3.392 +				continue;
   3.393 +
   3.394 +			curr.start = PAGE_OFFSET + md->phys_addr;
   3.395 +			curr.end   = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
   3.396 +
   3.397 +			if (!prev_valid) {
   3.398 +				prev = curr;
   3.399 +				prev_valid = 1;
   3.400 +			} else {
   3.401 +				if (curr.start < prev.start)
   3.402 +					printk(KERN_ERR "Oops: EFI memory table not ordered!\n");
   3.403 +
   3.404 +				if (prev.end == curr.start) {
   3.405 +					/* merge two consecutive memory ranges */
   3.406 +					prev.end = curr.end;
   3.407 +				} else {
   3.408 +					start = PAGE_ALIGN(prev.start);
   3.409 +					end = prev.end & PAGE_MASK;
   3.410 +					if ((end > start) && (*callback)(start, end, arg) < 0)
   3.411 +						return;
   3.412 +					prev = curr;
   3.413 +				}
   3.414 +			}
   3.415 +		}
   3.416 +	}
   3.417 +	if (prev_valid) {
   3.418 +		start = PAGE_ALIGN(prev.start);
   3.419 +		end = prev.end & PAGE_MASK;
   3.420 +		if (end > start)
   3.421 +			(*callback)(start, end, arg);
   3.422 +	}
   3.423 +}
   3.424 +
   3.425 +/*
   3.426 + * Look for the PAL_CODE region reported by EFI and maps it using an
   3.427 + * ITR to enable safe PAL calls in virtual mode.  See IA-64 Processor
   3.428 + * Abstraction Layer chapter 11 in ADAG
   3.429 + */
   3.430 +
   3.431 +void *
   3.432 +efi_get_pal_addr (void)
   3.433 +{
   3.434 +	void *efi_map_start, *efi_map_end, *p;
   3.435 +	efi_memory_desc_t *md;
   3.436 +	u64 efi_desc_size;
   3.437 +	int pal_code_count = 0;
   3.438 +	u64 vaddr, mask;
   3.439 +
   3.440 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.441 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.442 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.443 +
   3.444 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.445 +		md = p;
   3.446 +		if (md->type != EFI_PAL_CODE)
   3.447 +			continue;
   3.448 +
   3.449 +		if (++pal_code_count > 1) {
   3.450 +			printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n",
   3.451 +			       md->phys_addr);
   3.452 +			continue;
   3.453 +		}
   3.454 +		/*
   3.455 +		 * The only ITLB entry in region 7 that is used is the one installed by
   3.456 +		 * __start().  That entry covers a 64MB range.
   3.457 +		 */
   3.458 +		mask  = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1);
   3.459 +		vaddr = PAGE_OFFSET + md->phys_addr;
   3.460 +
   3.461 +		/*
   3.462 +		 * We must check that the PAL mapping won't overlap with the kernel
   3.463 +		 * mapping.
   3.464 +		 *
   3.465 +		 * PAL code is guaranteed to be aligned on a power of 2 between 4k and
   3.466 +		 * 256KB and that only one ITR is needed to map it. This implies that the
   3.467 +		 * PAL code is always aligned on its size, i.e., the closest matching page
   3.468 +		 * size supported by the TLB. Therefore PAL code is guaranteed never to
    3.469 +		 * cross a 64MB boundary unless it is bigger than 64MB (very unlikely!).  So for
   3.470 +		 * now the following test is enough to determine whether or not we need a
   3.471 +		 * dedicated ITR for the PAL code.
   3.472 +		 */
   3.473 +		if ((vaddr & mask) == (KERNEL_START & mask)) {
   3.474 +			printk(KERN_INFO "%s: no need to install ITR for PAL code\n",
   3.475 +			       __FUNCTION__);
   3.476 +			continue;
   3.477 +		}
   3.478 +
   3.479 +		if (md->num_pages << EFI_PAGE_SHIFT > IA64_GRANULE_SIZE)
   3.480 +			panic("Woah!  PAL code size bigger than a granule!");
   3.481 +
   3.482 +#if EFI_DEBUG
   3.483 +		mask  = ~((1 << IA64_GRANULE_SHIFT) - 1);
   3.484 +
   3.485 +		printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n",
   3.486 +			smp_processor_id(), md->phys_addr,
   3.487 +			md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
   3.488 +			vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
   3.489 +#endif
   3.490 +		return __va(md->phys_addr);
   3.491 +	}
   3.492 +	printk(KERN_WARNING "%s: no PAL-code memory-descriptor found",
   3.493 +	       __FUNCTION__);
   3.494 +	return NULL;
   3.495 +}
   3.496 +
   3.497 +void
   3.498 +efi_map_pal_code (void)
   3.499 +{
   3.500 +	void *pal_vaddr = efi_get_pal_addr ();
   3.501 +	u64 psr;
   3.502 +
   3.503 +	if (!pal_vaddr)
   3.504 +		return;
   3.505 +
   3.506 +	/*
   3.507 +	 * Cannot write to CRx with PSR.ic=1
   3.508 +	 */
   3.509 +	psr = ia64_clear_ic();
   3.510 +	ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr),
   3.511 +		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
   3.512 +		 IA64_GRANULE_SHIFT);
   3.513 +	ia64_set_psr(psr);		/* restore psr */
   3.514 +	ia64_srlz_i();
   3.515 +}
   3.516 +
   3.517 +void __init
   3.518 +efi_init (void)
   3.519 +{
   3.520 +	void *efi_map_start, *efi_map_end;
   3.521 +	efi_config_table_t *config_tables;
   3.522 +	efi_char16_t *c16;
   3.523 +	u64 efi_desc_size;
   3.524 +	char *cp, *end, vendor[100] = "unknown";
   3.525 +	extern char saved_command_line[];
   3.526 +	int i;
   3.527 +
   3.528 +	/* it's too early to be able to use the standard kernel command line support... */
   3.529 +	for (cp = saved_command_line; *cp; ) {
   3.530 +		if (memcmp(cp, "mem=", 4) == 0) {
   3.531 +			cp += 4;
   3.532 +			mem_limit = memparse(cp, &end);
   3.533 +			if (end != cp)
   3.534 +				break;
   3.535 +			cp = end;
   3.536 +		} else if (memcmp(cp, "max_addr=", 9) == 0) {
   3.537 +			cp += 9;
   3.538 +			max_addr = GRANULEROUNDDOWN(memparse(cp, &end));
   3.539 +			if (end != cp)
   3.540 +				break;
   3.541 +			cp = end;
   3.542 +		} else {
   3.543 +			while (*cp != ' ' && *cp)
   3.544 +				++cp;
   3.545 +			while (*cp == ' ')
   3.546 +				++cp;
   3.547 +		}
   3.548 +	}
   3.549 +	if (max_addr != ~0UL)
   3.550 +		printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20);
   3.551 +
   3.552 +	efi.systab = __va(ia64_boot_param->efi_systab);
   3.553 +
   3.554 +	/*
   3.555 +	 * Verify the EFI Table
   3.556 +	 */
   3.557 +	if (efi.systab == NULL)
   3.558 +		panic("Woah! Can't find EFI system table.\n");
   3.559 +	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
   3.560 +		panic("Woah! EFI system table signature incorrect\n");
   3.561 +	if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0)
   3.562 +		printk(KERN_WARNING "Warning: EFI system table major version mismatch: "
   3.563 +		       "got %d.%02d, expected %d.%02d\n",
   3.564 +		       efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff,
   3.565 +		       EFI_SYSTEM_TABLE_REVISION >> 16, EFI_SYSTEM_TABLE_REVISION & 0xffff);
   3.566 +
   3.567 +	config_tables = __va(efi.systab->tables);
   3.568 +
   3.569 +	/* Show what we know for posterity */
   3.570 +	c16 = __va(efi.systab->fw_vendor);
   3.571 +	if (c16) {
   3.572 +		for (i = 0;i < (int) sizeof(vendor) && *c16; ++i)
   3.573 +			vendor[i] = *c16++;
   3.574 +		vendor[i] = '\0';
   3.575 +	}
   3.576 +
   3.577 +	printk(KERN_INFO "EFI v%u.%.02u by %s:",
   3.578 +	       efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor);
   3.579 +
   3.580 +	for (i = 0; i < (int) efi.systab->nr_tables; i++) {
   3.581 +		if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
   3.582 +			efi.mps = __va(config_tables[i].table);
   3.583 +			printk(" MPS=0x%lx", config_tables[i].table);
   3.584 +		} else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
   3.585 +			efi.acpi20 = __va(config_tables[i].table);
   3.586 +			printk(" ACPI 2.0=0x%lx", config_tables[i].table);
   3.587 +		} else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
   3.588 +			efi.acpi = __va(config_tables[i].table);
   3.589 +			printk(" ACPI=0x%lx", config_tables[i].table);
   3.590 +		} else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
   3.591 +			efi.smbios = __va(config_tables[i].table);
   3.592 +			printk(" SMBIOS=0x%lx", config_tables[i].table);
   3.593 +		} else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) {
   3.594 +			efi.sal_systab = __va(config_tables[i].table);
   3.595 +			printk(" SALsystab=0x%lx", config_tables[i].table);
   3.596 +		} else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
   3.597 +			efi.hcdp = __va(config_tables[i].table);
   3.598 +			printk(" HCDP=0x%lx", config_tables[i].table);
   3.599 +		}
   3.600 +	}
   3.601 +	printk("\n");
   3.602 +
   3.603 +	runtime = __va(efi.systab->runtime);
   3.604 +	efi.get_time = phys_get_time;
   3.605 +	efi.set_time = phys_set_time;
   3.606 +	efi.get_wakeup_time = phys_get_wakeup_time;
   3.607 +	efi.set_wakeup_time = phys_set_wakeup_time;
   3.608 +	efi.get_variable = phys_get_variable;
   3.609 +	efi.get_next_variable = phys_get_next_variable;
   3.610 +	efi.set_variable = phys_set_variable;
   3.611 +	efi.get_next_high_mono_count = phys_get_next_high_mono_count;
   3.612 +	efi.reset_system = phys_reset_system;
   3.613 +
   3.614 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.615 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.616 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.617 +
   3.618 +#if EFI_DEBUG
   3.619 +	/* print EFI memory map: */
   3.620 +	{
   3.621 +		efi_memory_desc_t *md;
   3.622 +		void *p;
   3.623 +
   3.624 +		for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) {
   3.625 +			md = p;
   3.626 +			printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n",
   3.627 +			       i, md->type, md->attribute, md->phys_addr,
   3.628 +			       md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
   3.629 +			       md->num_pages >> (20 - EFI_PAGE_SHIFT));
   3.630 +		}
   3.631 +	}
   3.632 +#endif
   3.633 +
   3.634 +	efi_map_pal_code();
   3.635 +	efi_enter_virtual_mode();
   3.636 +}
   3.637 +
   3.638 +void
   3.639 +efi_enter_virtual_mode (void)
   3.640 +{
   3.641 +	void *efi_map_start, *efi_map_end, *p;
   3.642 +	efi_memory_desc_t *md;
   3.643 +	efi_status_t status;
   3.644 +	u64 efi_desc_size;
   3.645 +
   3.646 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.647 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.648 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.649 +
   3.650 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.651 +		md = p;
   3.652 +		if (md->attribute & EFI_MEMORY_RUNTIME) {
   3.653 +			/*
   3.654 +			 * Some descriptors have multiple bits set, so the order of
   3.655 +			 * the tests is relevant.
   3.656 +			 */
   3.657 +			if (md->attribute & EFI_MEMORY_WB) {
   3.658 +				md->virt_addr = (u64) __va(md->phys_addr);
   3.659 +			} else if (md->attribute & EFI_MEMORY_UC) {
   3.660 +				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
   3.661 +			} else if (md->attribute & EFI_MEMORY_WC) {
   3.662 +#if 0
   3.663 +				md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
   3.664 +									   | _PAGE_D
   3.665 +									   | _PAGE_MA_WC
   3.666 +									   | _PAGE_PL_0
   3.667 +									   | _PAGE_AR_RW));
   3.668 +#else
   3.669 +				printk(KERN_INFO "EFI_MEMORY_WC mapping\n");
   3.670 +				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
   3.671 +#endif
   3.672 +			} else if (md->attribute & EFI_MEMORY_WT) {
   3.673 +#if 0
   3.674 +				md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
   3.675 +									   | _PAGE_D | _PAGE_MA_WT
   3.676 +									   | _PAGE_PL_0
   3.677 +									   | _PAGE_AR_RW));
   3.678 +#else
   3.679 +				printk(KERN_INFO "EFI_MEMORY_WT mapping\n");
   3.680 +				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
   3.681 +#endif
   3.682 +			}
   3.683 +		}
   3.684 +	}
   3.685 +
   3.686 +	status = efi_call_phys(__va(runtime->set_virtual_address_map),
   3.687 +			       ia64_boot_param->efi_memmap_size,
   3.688 +			       efi_desc_size, ia64_boot_param->efi_memdesc_version,
   3.689 +			       ia64_boot_param->efi_memmap);
   3.690 +	if (status != EFI_SUCCESS) {
   3.691 +		printk(KERN_WARNING "warning: unable to switch EFI into virtual mode "
   3.692 +		       "(status=%lu)\n", status);
   3.693 +		return;
   3.694 +	}
   3.695 +
   3.696 +	/*
   3.697 +	 * Now that EFI is in virtual mode, we call the EFI functions more efficiently:
   3.698 +	 */
   3.699 +	efi.get_time = virt_get_time;
   3.700 +	efi.set_time = virt_set_time;
   3.701 +	efi.get_wakeup_time = virt_get_wakeup_time;
   3.702 +	efi.set_wakeup_time = virt_set_wakeup_time;
   3.703 +	efi.get_variable = virt_get_variable;
   3.704 +	efi.get_next_variable = virt_get_next_variable;
   3.705 +	efi.set_variable = virt_set_variable;
   3.706 +	efi.get_next_high_mono_count = virt_get_next_high_mono_count;
   3.707 +	efi.reset_system = virt_reset_system;
   3.708 +}
   3.709 +
   3.710 +/*
   3.711 + * Walk the EFI memory map looking for the I/O port range.  There can only be one entry of
   3.712 + * this type, other I/O port ranges should be described via ACPI.
   3.713 + */
   3.714 +u64
   3.715 +efi_get_iobase (void)
   3.716 +{
   3.717 +	void *efi_map_start, *efi_map_end, *p;
   3.718 +	efi_memory_desc_t *md;
   3.719 +	u64 efi_desc_size;
   3.720 +
   3.721 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.722 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.723 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.724 +
   3.725 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.726 +		md = p;
   3.727 +		if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
   3.728 +			if (md->attribute & EFI_MEMORY_UC)
   3.729 +				return md->phys_addr;
   3.730 +		}
   3.731 +	}
   3.732 +	return 0;
   3.733 +}
   3.734 +
   3.735 +#ifdef XEN
   3.736 +// variation of efi_get_iobase which returns entire memory descriptor
   3.737 +efi_memory_desc_t *
   3.738 +efi_get_io_md (void)
   3.739 +{
   3.740 +	void *efi_map_start, *efi_map_end, *p;
   3.741 +	efi_memory_desc_t *md;
   3.742 +	u64 efi_desc_size;
   3.743 +
   3.744 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.745 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.746 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.747 +
   3.748 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.749 +		md = p;
   3.750 +		if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
   3.751 +			if (md->attribute & EFI_MEMORY_UC)
   3.752 +				return md;
   3.753 +		}
   3.754 +	}
   3.755 +	return 0;
   3.756 +}
   3.757 +#endif
   3.758 +
   3.759 +u32
   3.760 +efi_mem_type (unsigned long phys_addr)
   3.761 +{
   3.762 +	void *efi_map_start, *efi_map_end, *p;
   3.763 +	efi_memory_desc_t *md;
   3.764 +	u64 efi_desc_size;
   3.765 +
   3.766 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.767 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.768 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.769 +
   3.770 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.771 +		md = p;
   3.772 +
   3.773 +		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
   3.774 +			 return md->type;
   3.775 +	}
   3.776 +	return 0;
   3.777 +}
   3.778 +
   3.779 +u64
   3.780 +efi_mem_attributes (unsigned long phys_addr)
   3.781 +{
   3.782 +	void *efi_map_start, *efi_map_end, *p;
   3.783 +	efi_memory_desc_t *md;
   3.784 +	u64 efi_desc_size;
   3.785 +
   3.786 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.787 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.788 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.789 +
   3.790 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.791 +		md = p;
   3.792 +
   3.793 +		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
   3.794 +			return md->attribute;
   3.795 +	}
   3.796 +	return 0;
   3.797 +}
   3.798 +EXPORT_SYMBOL(efi_mem_attributes);
   3.799 +
   3.800 +int
   3.801 +valid_phys_addr_range (unsigned long phys_addr, unsigned long *size)
   3.802 +{
   3.803 +	void *efi_map_start, *efi_map_end, *p;
   3.804 +	efi_memory_desc_t *md;
   3.805 +	u64 efi_desc_size;
   3.806 +
   3.807 +	efi_map_start = __va(ia64_boot_param->efi_memmap);
   3.808 +	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
   3.809 +	efi_desc_size = ia64_boot_param->efi_memdesc_size;
   3.810 +
   3.811 +	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
   3.812 +		md = p;
   3.813 +
   3.814 +		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) {
   3.815 +			if (!(md->attribute & EFI_MEMORY_WB))
   3.816 +				return 0;
   3.817 +
   3.818 +			if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr)
   3.819 +				*size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr;
   3.820 +			return 1;
   3.821 +		}
   3.822 +	}
   3.823 +	return 0;
   3.824 +}
   3.825 +
   3.826 +int __init
   3.827 +efi_uart_console_only(void)
   3.828 +{
   3.829 +	efi_status_t status;
   3.830 +	char *s, name[] = "ConOut";
   3.831 +	efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID;
   3.832 +	efi_char16_t *utf16, name_utf16[32];
   3.833 +	unsigned char data[1024];
   3.834 +	unsigned long size = sizeof(data);
   3.835 +	struct efi_generic_dev_path *hdr, *end_addr;
   3.836 +	int uart = 0;
   3.837 +
   3.838 +	/* Convert to UTF-16 */
   3.839 +	utf16 = name_utf16;
   3.840 +	s = name;
   3.841 +	while (*s)
   3.842 +		*utf16++ = *s++ & 0x7f;
   3.843 +	*utf16 = 0;
   3.844 +
   3.845 +	status = efi.get_variable(name_utf16, &guid, NULL, &size, data);
   3.846 +	if (status != EFI_SUCCESS) {
   3.847 +		printk(KERN_ERR "No EFI %s variable?\n", name);
   3.848 +		return 0;
   3.849 +	}
   3.850 +
   3.851 +	hdr = (struct efi_generic_dev_path *) data;
   3.852 +	end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size);
   3.853 +	while (hdr < end_addr) {
   3.854 +		if (hdr->type == EFI_DEV_MSG &&
   3.855 +		    hdr->sub_type == EFI_DEV_MSG_UART)
   3.856 +			uart = 1;
   3.857 +		else if (hdr->type == EFI_DEV_END_PATH ||
   3.858 +			  hdr->type == EFI_DEV_END_PATH2) {
   3.859 +			if (!uart)
   3.860 +				return 0;
   3.861 +			if (hdr->sub_type == EFI_DEV_END_ENTIRE)
   3.862 +				return 1;
   3.863 +			uart = 0;
   3.864 +		}
   3.865 +		hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length);
   3.866 +	}
   3.867 +	printk(KERN_ERR "Malformed %s value\n", name);
   3.868 +	return 0;
   3.869 +}
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/xen/arch/ia64/entry.S	Tue Aug 02 15:59:09 2005 -0800
     4.3 @@ -0,0 +1,1653 @@
     4.4 +/*
     4.5 + * ia64/kernel/entry.S
     4.6 + *
     4.7 + * Kernel entry points.
     4.8 + *
     4.9 + * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
    4.10 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    4.11 + * Copyright (C) 1999, 2002-2003
    4.12 + *	Asit Mallick <Asit.K.Mallick@intel.com>
    4.13 + * 	Don Dugger <Don.Dugger@intel.com>
    4.14 + *	Suresh Siddha <suresh.b.siddha@intel.com>
    4.15 + *	Fenghua Yu <fenghua.yu@intel.com>
    4.16 + * Copyright (C) 1999 VA Linux Systems
    4.17 + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
    4.18 + */
    4.19 +/*
     4.20 + * ia64_switch_to now places correct virtual mapping in TR2 for
    4.21 + * kernel stack. This allows us to handle interrupts without changing
    4.22 + * to physical mode.
    4.23 + *
    4.24 + * Jonathan Nicklin	<nicklin@missioncriticallinux.com>
    4.25 + * Patrick O'Rourke	<orourke@missioncriticallinux.com>
    4.26 + * 11/07/2000
    4.27 + */
    4.28 +/*
    4.29 + * Global (preserved) predicate usage on syscall entry/exit path:
    4.30 + *
    4.31 + *	pKStk:		See entry.h.
    4.32 + *	pUStk:		See entry.h.
    4.33 + *	pSys:		See entry.h.
    4.34 + *	pNonSys:	!pSys
    4.35 + */
    4.36 +
    4.37 +#include <linux/config.h>
    4.38 +
    4.39 +#include <asm/asmmacro.h>
    4.40 +#include <asm/cache.h>
    4.41 +#include <asm/errno.h>
    4.42 +#include <asm/kregs.h>
    4.43 +#include <asm/offsets.h>
    4.44 +#include <asm/pgtable.h>
    4.45 +#include <asm/percpu.h>
    4.46 +#include <asm/processor.h>
    4.47 +#include <asm/thread_info.h>
    4.48 +#include <asm/unistd.h>
    4.49 +
    4.50 +#include "minstate.h"
    4.51 +
    4.52 +#ifndef XEN
    4.53 +	/*
    4.54 +	 * execve() is special because in case of success, we need to
    4.55 +	 * setup a null register window frame.
    4.56 +	 */
    4.57 +ENTRY(ia64_execve)
    4.58 +	/*
    4.59 +	 * Allocate 8 input registers since ptrace() may clobber them
    4.60 +	 */
    4.61 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
    4.62 +	alloc loc1=ar.pfs,8,2,4,0
    4.63 +	mov loc0=rp
    4.64 +	.body
    4.65 +	mov out0=in0			// filename
    4.66 +	;;				// stop bit between alloc and call
    4.67 +	mov out1=in1			// argv
    4.68 +	mov out2=in2			// envp
    4.69 +	add out3=16,sp			// regs
    4.70 +	br.call.sptk.many rp=sys_execve
    4.71 +.ret0:
    4.72 +#ifdef CONFIG_IA32_SUPPORT
    4.73 +	/*
    4.74 +	 * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers
    4.75 +	 * from pt_regs.
    4.76 +	 */
    4.77 +	adds r16=PT(CR_IPSR)+16,sp
    4.78 +	;;
    4.79 +	ld8 r16=[r16]
    4.80 +#endif
    4.81 +	cmp4.ge p6,p7=r8,r0
    4.82 +	mov ar.pfs=loc1			// restore ar.pfs
    4.83 +	sxt4 r8=r8			// return 64-bit result
    4.84 +	;;
    4.85 +	stf.spill [sp]=f0
    4.86 +(p6)	cmp.ne pKStk,pUStk=r0,r0	// a successful execve() lands us in user-mode...
    4.87 +	mov rp=loc0
    4.88 +(p6)	mov ar.pfs=r0			// clear ar.pfs on success
    4.89 +(p7)	br.ret.sptk.many rp
    4.90 +
    4.91 +	/*
    4.92 +	 * In theory, we'd have to zap this state only to prevent leaking of
    4.93 +	 * security sensitive state (e.g., if current->mm->dumpable is zero).  However,
    4.94 +	 * this executes in less than 20 cycles even on Itanium, so it's not worth
    4.95 +	 * optimizing for...).
    4.96 +	 */
    4.97 +	mov ar.unat=0; 		mov ar.lc=0
    4.98 +	mov r4=0;		mov f2=f0;		mov b1=r0
    4.99 +	mov r5=0;		mov f3=f0;		mov b2=r0
   4.100 +	mov r6=0;		mov f4=f0;		mov b3=r0
   4.101 +	mov r7=0;		mov f5=f0;		mov b4=r0
   4.102 +	ldf.fill f12=[sp];	mov f13=f0;		mov b5=r0
   4.103 +	ldf.fill f14=[sp];	ldf.fill f15=[sp];	mov f16=f0
   4.104 +	ldf.fill f17=[sp];	ldf.fill f18=[sp];	mov f19=f0
   4.105 +	ldf.fill f20=[sp];	ldf.fill f21=[sp];	mov f22=f0
   4.106 +	ldf.fill f23=[sp];	ldf.fill f24=[sp];	mov f25=f0
   4.107 +	ldf.fill f26=[sp];	ldf.fill f27=[sp];	mov f28=f0
   4.108 +	ldf.fill f29=[sp];	ldf.fill f30=[sp];	mov f31=f0
   4.109 +#ifdef CONFIG_IA32_SUPPORT
   4.110 +	tbit.nz p6,p0=r16, IA64_PSR_IS_BIT
   4.111 +	movl loc0=ia64_ret_from_ia32_execve
   4.112 +	;;
   4.113 +(p6)	mov rp=loc0
   4.114 +#endif
   4.115 +	br.ret.sptk.many rp
   4.116 +END(ia64_execve)
   4.117 +
   4.118 +/*
   4.119 + * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr,
   4.120 + *	      u64 tls)
   4.121 + */
   4.122 +GLOBAL_ENTRY(sys_clone2)
   4.123 +	/*
   4.124 +	 * Allocate 8 input registers since ptrace() may clobber them
   4.125 +	 */
   4.126 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
   4.127 +	alloc r16=ar.pfs,8,2,6,0
   4.128 +	DO_SAVE_SWITCH_STACK
   4.129 +	adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp
   4.130 +	mov loc0=rp
   4.131 +	mov loc1=r16				// save ar.pfs across do_fork
   4.132 +	.body
   4.133 +	mov out1=in1
   4.134 +	mov out3=in2
   4.135 +	tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
   4.136 +	mov out4=in3	// parent_tidptr: valid only w/CLONE_PARENT_SETTID
   4.137 +	;;
   4.138 +(p6)	st8 [r2]=in5				// store TLS in r16 for copy_thread()
   4.139 +	mov out5=in4	// child_tidptr:  valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
   4.140 +	adds out2=IA64_SWITCH_STACK_SIZE+16,sp	// out2 = &regs
   4.141 +	mov out0=in0				// out0 = clone_flags
   4.142 +	br.call.sptk.many rp=do_fork
   4.143 +.ret1:	.restore sp
   4.144 +	adds sp=IA64_SWITCH_STACK_SIZE,sp	// pop the switch stack
   4.145 +	mov ar.pfs=loc1
   4.146 +	mov rp=loc0
   4.147 +	br.ret.sptk.many rp
   4.148 +END(sys_clone2)
   4.149 +
   4.150 +/*
   4.151 + * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls)
   4.152 + *	Deprecated.  Use sys_clone2() instead.
   4.153 + */
   4.154 +GLOBAL_ENTRY(sys_clone)
   4.155 +	/*
   4.156 +	 * Allocate 8 input registers since ptrace() may clobber them
   4.157 +	 */
   4.158 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
   4.159 +	alloc r16=ar.pfs,8,2,6,0
   4.160 +	DO_SAVE_SWITCH_STACK
   4.161 +	adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp
   4.162 +	mov loc0=rp
   4.163 +	mov loc1=r16				// save ar.pfs across do_fork
   4.164 +	.body
   4.165 +	mov out1=in1
   4.166 +	mov out3=16				// stacksize (compensates for 16-byte scratch area)
   4.167 +	tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
   4.168 +	mov out4=in2	// parent_tidptr: valid only w/CLONE_PARENT_SETTID
   4.169 +	;;
   4.170 +(p6)	st8 [r2]=in4				// store TLS in r13 (tp)
   4.171 +	mov out5=in3	// child_tidptr:  valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
   4.172 +	adds out2=IA64_SWITCH_STACK_SIZE+16,sp	// out2 = &regs
   4.173 +	mov out0=in0				// out0 = clone_flags
   4.174 +	br.call.sptk.many rp=do_fork
   4.175 +.ret2:	.restore sp
   4.176 +	adds sp=IA64_SWITCH_STACK_SIZE,sp	// pop the switch stack
   4.177 +	mov ar.pfs=loc1
   4.178 +	mov rp=loc0
   4.179 +	br.ret.sptk.many rp
   4.180 +END(sys_clone)
   4.181 +#endif /* !XEN */
   4.182 +
   4.183 +/*
   4.184 + * prev_task <- ia64_switch_to(struct task_struct *next)
   4.185 + *	With Ingo's new scheduler, interrupts are disabled when this routine gets
   4.186 + *	called.  The code starting at .map relies on this.  The rest of the code
   4.187 + *	doesn't care about the interrupt masking status.
   4.188 + */
   4.189 +GLOBAL_ENTRY(ia64_switch_to)
   4.190 +	.prologue
   4.191 +	alloc r16=ar.pfs,1,0,0,0
   4.192 +	DO_SAVE_SWITCH_STACK
   4.193 +	.body
   4.194 +
   4.195 +	adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
   4.196 +	movl r25=init_task
   4.197 +	mov r27=IA64_KR(CURRENT_STACK)
   4.198 +	adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
   4.199 +#ifdef XEN
   4.200 +	dep r20=0,in0,60,4		// physical address of "next"
   4.201 +#else
   4.202 +	dep r20=0,in0,61,3		// physical address of "next"
   4.203 +#endif
   4.204 +	;;
   4.205 +	st8 [r22]=sp			// save kernel stack pointer of old task
   4.206 +	shr.u r26=r20,IA64_GRANULE_SHIFT
   4.207 +	cmp.eq p7,p6=r25,in0
   4.208 +	;;
   4.209 +	/*
   4.210 +	 * If we've already mapped this task's page, we can skip doing it again.
   4.211 +	 */
   4.212 +(p6)	cmp.eq p7,p6=r26,r27
   4.213 +(p6)	br.cond.dpnt .map
   4.214 +	;;
   4.215 +.done:
   4.216 +(p6)	ssm psr.ic			// if we had to map, reenable the psr.ic bit FIRST!!!
   4.217 +	;;
   4.218 +(p6)	srlz.d
   4.219 +	ld8 sp=[r21]			// load kernel stack pointer of new task
   4.220 +	mov IA64_KR(CURRENT)=in0	// update "current" application register
   4.221 +	mov r8=r13			// return pointer to previously running task
   4.222 +	mov r13=in0			// set "current" pointer
   4.223 +	;;
   4.224 +	DO_LOAD_SWITCH_STACK
   4.225 +
   4.226 +#ifdef CONFIG_SMP
   4.227 +	sync.i				// ensure "fc"s done by this CPU are visible on other CPUs
   4.228 +#endif
   4.229 +	br.ret.sptk.many rp		// boogie on out in new context
   4.230 +
   4.231 +.map:
   4.232 +#ifdef XEN
   4.233 +	// avoid overlapping with kernel TR
   4.234 +	movl r25=KERNEL_START
   4.235 +	dep  r23=0,in0,0,KERNEL_TR_PAGE_SHIFT
   4.236 +	;;
   4.237 +	cmp.eq p7,p0=r25,r23
   4.238 +	;;
   4.239 +(p7)	mov IA64_KR(CURRENT_STACK)=r26	// remember last page we mapped...
   4.240 +(p7)	br.cond.sptk .done
   4.241 +#endif
   4.242 +	rsm psr.ic			// interrupts (psr.i) are already disabled here
   4.243 +	movl r25=PAGE_KERNEL
   4.244 +	;;
   4.245 +	srlz.d
   4.246 +	or r23=r25,r20			// construct PA | page properties
   4.247 +	mov r25=IA64_GRANULE_SHIFT<<2
   4.248 +	;;
   4.249 +	mov cr.itir=r25
   4.250 +	mov cr.ifa=in0			// VA of next task...
   4.251 +	;;
   4.252 +	mov r25=IA64_TR_CURRENT_STACK
   4.253 +	mov IA64_KR(CURRENT_STACK)=r26	// remember last page we mapped...
   4.254 +	;;
   4.255 +	itr.d dtr[r25]=r23		// wire in new mapping...
   4.256 +	br.cond.sptk .done
   4.257 +END(ia64_switch_to)
   4.258 +
   4.259 +/*
   4.260 + * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
   4.261 + * means that we may get an interrupt with "sp" pointing to the new kernel stack while
   4.262 + * ar.bspstore is still pointing to the old kernel backing store area.  Since ar.rsc,
   4.263 + * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a
   4.264 + * problem.  Also, we don't need to specify unwind information for preserved registers
   4.265 + * that are not modified in save_switch_stack as the right unwind information is already
   4.266 + * specified at the call-site of save_switch_stack.
   4.267 + */
   4.268 +
   4.269 +/*
   4.270 + * save_switch_stack:
   4.271 + *	- r16 holds ar.pfs
   4.272 + *	- b7 holds address to return to
   4.273 + *	- rp (b0) holds return address to save
   4.274 + */
   4.275 +GLOBAL_ENTRY(save_switch_stack)
   4.276 +	.prologue
   4.277 +	.altrp b7
   4.278 +	flushrs			// flush dirty regs to backing store (must be first in insn group)
   4.279 +	.save @priunat,r17
   4.280 +	mov r17=ar.unat		// preserve caller's
   4.281 +	.body
   4.282 +#ifdef CONFIG_ITANIUM
   4.283 +	adds r2=16+128,sp
   4.284 +	adds r3=16+64,sp
   4.285 +	adds r14=SW(R4)+16,sp
   4.286 +	;;
   4.287 +	st8.spill [r14]=r4,16		// spill r4
   4.288 +	lfetch.fault.excl.nt1 [r3],128
   4.289 +	;;
   4.290 +	lfetch.fault.excl.nt1 [r2],128
   4.291 +	lfetch.fault.excl.nt1 [r3],128
   4.292 +	;;
   4.293 +	lfetch.fault.excl [r2]
   4.294 +	lfetch.fault.excl [r3]
   4.295 +	adds r15=SW(R5)+16,sp
   4.296 +#else
   4.297 +	add r2=16+3*128,sp
   4.298 +	add r3=16,sp
   4.299 +	add r14=SW(R4)+16,sp
   4.300 +	;;
   4.301 +	st8.spill [r14]=r4,SW(R6)-SW(R4)	// spill r4 and prefetch offset 0x1c0
   4.302 +	lfetch.fault.excl.nt1 [r3],128	//		prefetch offset 0x010
   4.303 +	;;
   4.304 +	lfetch.fault.excl.nt1 [r3],128	//		prefetch offset 0x090
   4.305 +	lfetch.fault.excl.nt1 [r2],128	//		prefetch offset 0x190
   4.306 +	;;
   4.307 +	lfetch.fault.excl.nt1 [r3]	//		prefetch offset 0x110
   4.308 +	lfetch.fault.excl.nt1 [r2]	//		prefetch offset 0x210
   4.309 +	adds r15=SW(R5)+16,sp
   4.310 +#endif
   4.311 +	;;
   4.312 +	st8.spill [r15]=r5,SW(R7)-SW(R5)	// spill r5
   4.313 +	mov.m ar.rsc=0			// put RSE in mode: enforced lazy, little endian, pl 0
   4.314 +	add r2=SW(F2)+16,sp		// r2 = &sw->f2
   4.315 +	;;
   4.316 +	st8.spill [r14]=r6,SW(B0)-SW(R6)	// spill r6
   4.317 +	mov.m r18=ar.fpsr		// preserve fpsr
   4.318 +	add r3=SW(F3)+16,sp		// r3 = &sw->f3
   4.319 +	;;
   4.320 +	stf.spill [r2]=f2,32
   4.321 +	mov.m r19=ar.rnat
   4.322 +	mov r21=b0
   4.323 +
   4.324 +	stf.spill [r3]=f3,32
   4.325 +	st8.spill [r15]=r7,SW(B2)-SW(R7)	// spill r7
   4.326 +	mov r22=b1
   4.327 +	;;
   4.328 +	// since we're done with the spills, read and save ar.unat:
   4.329 +	mov.m r29=ar.unat
   4.330 +	mov.m r20=ar.bspstore
   4.331 +	mov r23=b2
   4.332 +	stf.spill [r2]=f4,32
   4.333 +	stf.spill [r3]=f5,32
   4.334 +	mov r24=b3
   4.335 +	;;
   4.336 +	st8 [r14]=r21,SW(B1)-SW(B0)		// save b0
   4.337 +	st8 [r15]=r23,SW(B3)-SW(B2)		// save b2
   4.338 +	mov r25=b4
   4.339 +	mov r26=b5
   4.340 +	;;
   4.341 +	st8 [r14]=r22,SW(B4)-SW(B1)		// save b1
   4.342 +	st8 [r15]=r24,SW(AR_PFS)-SW(B3)		// save b3
   4.343 +	mov r21=ar.lc		// I-unit
   4.344 +	stf.spill [r2]=f12,32
   4.345 +	stf.spill [r3]=f13,32
   4.346 +	;;
   4.347 +	st8 [r14]=r25,SW(B5)-SW(B4)		// save b4
   4.348 +	st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS)	// save ar.pfs
   4.349 +	stf.spill [r2]=f14,32
   4.350 +	stf.spill [r3]=f15,32
   4.351 +	;;
   4.352 +	st8 [r14]=r26				// save b5
   4.353 +	st8 [r15]=r21				// save ar.lc
   4.354 +	stf.spill [r2]=f16,32
   4.355 +	stf.spill [r3]=f17,32
   4.356 +	;;
   4.357 +	stf.spill [r2]=f18,32
   4.358 +	stf.spill [r3]=f19,32
   4.359 +	;;
   4.360 +	stf.spill [r2]=f20,32
   4.361 +	stf.spill [r3]=f21,32
   4.362 +	;;
   4.363 +	stf.spill [r2]=f22,32
   4.364 +	stf.spill [r3]=f23,32
   4.365 +	;;
   4.366 +	stf.spill [r2]=f24,32
   4.367 +	stf.spill [r3]=f25,32
   4.368 +	;;
   4.369 +	stf.spill [r2]=f26,32
   4.370 +	stf.spill [r3]=f27,32
   4.371 +	;;
   4.372 +	stf.spill [r2]=f28,32
   4.373 +	stf.spill [r3]=f29,32
   4.374 +	;;
   4.375 +	stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
   4.376 +	stf.spill [r3]=f31,SW(PR)-SW(F31)
   4.377 +	add r14=SW(CALLER_UNAT)+16,sp
   4.378 +	;;
   4.379 +	st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT)	// save ar.unat
   4.380 +	st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat
   4.381 +	mov r21=pr
   4.382 +	;;
   4.383 +	st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat
   4.384 +	st8 [r3]=r21				// save predicate registers
   4.385 +	;;
   4.386 +	st8 [r2]=r20				// save ar.bspstore
   4.387 +	st8 [r14]=r18				// save fpsr
   4.388 +	mov ar.rsc=3		// put RSE back into eager mode, pl 0
   4.389 +	br.cond.sptk.many b7
   4.390 +END(save_switch_stack)
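The SW() offsets used in save_switch_stack index the switch-stack save area on the kernel stack. As a hedged C picture of that area, with the field order inferred from the spills above (the authoritative layout lives in the IA64_SWITCH_STACK_*_OFFSET constants, not in this sketch):

	#include <stdint.h>

	struct fpreg_sketch { uint64_t u[2]; };		/* 16-byte FP register image */

	struct switch_stack_sketch {
		uint64_t caller_unat;			/* caller's ar.unat (r17 above)      */
		uint64_t ar_fpsr;			/* ar.fpsr (r18 above)               */
		struct fpreg_sketch f2, f3, f4, f5;	/* preserved FP registers            */
		struct fpreg_sketch f12_f31[20];	/* f12..f31                          */
		uint64_t r4, r5, r6, r7;		/* preserved static registers        */
		uint64_t b0, b1, b2, b3, b4, b5;	/* branch registers                  */
		uint64_t ar_pfs, ar_lc;			/* previous function state, loop ctr */
		uint64_t ar_unat;			/* NaT bits for r4-r7 (@priunat)     */
		uint64_t ar_rnat, ar_bspstore;		/* RSE NaT collection, dirty base    */
		uint64_t pr;				/* predicate registers               */
	};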
   4.391 +
   4.392 +/*
   4.393 + * load_switch_stack:
   4.394 + *	- "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK)
   4.395 + *	- b7 holds address to return to
   4.396 + *	- must not touch r8-r11
   4.397 + */
   4.398 +#ifdef XEN
   4.399 +GLOBAL_ENTRY(load_switch_stack)
   4.400 +#else
   4.401 +ENTRY(load_switch_stack)
   4.402 +#endif
   4.403 +	.prologue
   4.404 +	.altrp b7
   4.405 +
   4.406 +	.body
   4.407 +	lfetch.fault.nt1 [sp]
   4.408 +	adds r2=SW(AR_BSPSTORE)+16,sp
   4.409 +	adds r3=SW(AR_UNAT)+16,sp
   4.410 +	mov ar.rsc=0						// put RSE into enforced lazy mode
   4.411 +	adds r14=SW(CALLER_UNAT)+16,sp
   4.412 +	adds r15=SW(AR_FPSR)+16,sp
   4.413 +	;;
   4.414 +	ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE))	// bspstore
   4.415 +	ld8 r29=[r3],(SW(B1)-SW(AR_UNAT))	// unat
   4.416 +	;;
   4.417 +	ld8 r21=[r2],16		// restore b0
   4.418 +	ld8 r22=[r3],16		// restore b1
   4.419 +	;;
   4.420 +	ld8 r23=[r2],16		// restore b2
   4.421 +	ld8 r24=[r3],16		// restore b3
   4.422 +	;;
   4.423 +	ld8 r25=[r2],16		// restore b4
   4.424 +	ld8 r26=[r3],16		// restore b5
   4.425 +	;;
   4.426 +	ld8 r16=[r2],(SW(PR)-SW(AR_PFS))	// ar.pfs
   4.427 +	ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC))	// ar.lc
   4.428 +	;;
   4.429 +	ld8 r28=[r2]		// restore pr
   4.430 +	ld8 r30=[r3]		// restore rnat
   4.431 +	;;
   4.432 +	ld8 r18=[r14],16	// restore caller's unat
   4.433 +	ld8 r19=[r15],24	// restore fpsr
   4.434 +	;;
   4.435 +	ldf.fill f2=[r14],32
   4.436 +	ldf.fill f3=[r15],32
   4.437 +	;;
   4.438 +	ldf.fill f4=[r14],32
   4.439 +	ldf.fill f5=[r15],32
   4.440 +	;;
   4.441 +	ldf.fill f12=[r14],32
   4.442 +	ldf.fill f13=[r15],32
   4.443 +	;;
   4.444 +	ldf.fill f14=[r14],32
   4.445 +	ldf.fill f15=[r15],32
   4.446 +	;;
   4.447 +	ldf.fill f16=[r14],32
   4.448 +	ldf.fill f17=[r15],32
   4.449 +	;;
   4.450 +	ldf.fill f18=[r14],32
   4.451 +	ldf.fill f19=[r15],32
   4.452 +	mov b0=r21
   4.453 +	;;
   4.454 +	ldf.fill f20=[r14],32
   4.455 +	ldf.fill f21=[r15],32
   4.456 +	mov b1=r22
   4.457 +	;;
   4.458 +	ldf.fill f22=[r14],32
   4.459 +	ldf.fill f23=[r15],32
   4.460 +	mov b2=r23
   4.461 +	;;
   4.462 +	mov ar.bspstore=r27
   4.463 +	mov ar.unat=r29		// establish unat holding the NaT bits for r4-r7
   4.464 +	mov b3=r24
   4.465 +	;;
   4.466 +	ldf.fill f24=[r14],32
   4.467 +	ldf.fill f25=[r15],32
   4.468 +	mov b4=r25
   4.469 +	;;
   4.470 +	ldf.fill f26=[r14],32
   4.471 +	ldf.fill f27=[r15],32
   4.472 +	mov b5=r26
   4.473 +	;;
   4.474 +	ldf.fill f28=[r14],32
   4.475 +	ldf.fill f29=[r15],32
   4.476 +	mov ar.pfs=r16
   4.477 +	;;
   4.478 +	ldf.fill f30=[r14],32
   4.479 +	ldf.fill f31=[r15],24
   4.480 +	mov ar.lc=r17
   4.481 +	;;
   4.482 +	ld8.fill r4=[r14],16
   4.483 +	ld8.fill r5=[r15],16
   4.484 +	mov pr=r28,-1
   4.485 +	;;
   4.486 +	ld8.fill r6=[r14],16
   4.487 +	ld8.fill r7=[r15],16
   4.488 +
   4.489 +	mov ar.unat=r18				// restore caller's unat
   4.490 +	mov ar.rnat=r30				// must restore after bspstore but before rsc!
   4.491 +	mov ar.fpsr=r19				// restore fpsr
   4.492 +	mov ar.rsc=3				// put RSE back into eager mode, pl 0
   4.493 +	br.cond.sptk.many b7
   4.494 +END(load_switch_stack)
   4.495 +
   4.496 +#ifndef XEN
   4.497 +GLOBAL_ENTRY(__ia64_syscall)
   4.498 +	.regstk 6,0,0,0
   4.499 +	mov r15=in5				// put syscall number in place
   4.500 +	break __BREAK_SYSCALL
   4.501 +	movl r2=errno
   4.502 +	cmp.eq p6,p7=-1,r10
   4.503 +	;;
   4.504 +(p6)	st4 [r2]=r8
   4.505 +(p6)	mov r8=-1
   4.506 +	br.ret.sptk.many rp
   4.507 +END(__ia64_syscall)
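__ia64_syscall above is the minimal user-side stub: if the kernel reports failure by returning -1 in r10, the stub stores the positive errno from r8 into errno and returns -1; otherwise r8 is the result. A hedged C model of that post-processing (the function name and errno pointer are illustrative):

	/* model of the (p6)-predicated tail of __ia64_syscall above */
	long ia64_syscall_return_model(long r8, long r10, int *errno_slot)
	{
		if (r10 == -1) {		/* cmp.eq p6,p7=-1,r10            */
			*errno_slot = (int)r8;	/* (p6) st4 [r2]=r8: errno <- r8  */
			return -1;		/* (p6) mov r8=-1                 */
		}
		return r8;			/* success: r8 is the return value */
	}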
   4.508 +
   4.509 +GLOBAL_ENTRY(execve)
   4.510 +	mov r15=__NR_execve			// put syscall number in place
   4.511 +	break __BREAK_SYSCALL
   4.512 +	br.ret.sptk.many rp
   4.513 +END(execve)
   4.514 +
   4.515 +GLOBAL_ENTRY(clone)
   4.516 +	mov r15=__NR_clone			// put syscall number in place
   4.517 +	break __BREAK_SYSCALL
   4.518 +	br.ret.sptk.many rp
   4.519 +END(clone)
   4.520 +
   4.521 +	/*
   4.522 +	 * Invoke a system call, but do some tracing before and after the call.
   4.523 +	 * We MUST preserve the current register frame throughout this routine
   4.524 +	 * because some system calls (such as ia64_execve) directly
   4.525 +	 * manipulate ar.pfs.
   4.526 +	 */
   4.527 +GLOBAL_ENTRY(ia64_trace_syscall)
   4.528 +	PT_REGS_UNWIND_INFO(0)
   4.529 +	/*
   4.530 +	 * We need to preserve the scratch registers f6-f11 in case the system
   4.531 +	 * call is sigreturn.
   4.532 +	 */
   4.533 +	adds r16=PT(F6)+16,sp
   4.534 +	adds r17=PT(F7)+16,sp
   4.535 +	;;
   4.536 + 	stf.spill [r16]=f6,32
   4.537 + 	stf.spill [r17]=f7,32
   4.538 +	;;
   4.539 + 	stf.spill [r16]=f8,32
   4.540 + 	stf.spill [r17]=f9,32
   4.541 +	;;
   4.542 + 	stf.spill [r16]=f10
   4.543 + 	stf.spill [r17]=f11
   4.544 +	br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
   4.545 +	adds r16=PT(F6)+16,sp
   4.546 +	adds r17=PT(F7)+16,sp
   4.547 +	;;
   4.548 +	ldf.fill f6=[r16],32
   4.549 +	ldf.fill f7=[r17],32
   4.550 +	;;
   4.551 +	ldf.fill f8=[r16],32
   4.552 +	ldf.fill f9=[r17],32
   4.553 +	;;
   4.554 +	ldf.fill f10=[r16]
   4.555 +	ldf.fill f11=[r17]
   4.556 +	// the syscall number may have changed, so re-load it and re-calculate the
   4.557 +	// syscall entry-point:
   4.558 +	adds r15=PT(R15)+16,sp			// r15 = &pt_regs.r15 (syscall #)
   4.559 +	;;
   4.560 +	ld8 r15=[r15]
   4.561 +	mov r3=NR_syscalls - 1
   4.562 +	;;
   4.563 +	adds r15=-1024,r15
   4.564 +	movl r16=sys_call_table
   4.565 +	;;
   4.566 +	shladd r20=r15,3,r16			// r20 = sys_call_table + 8*(syscall-1024)
   4.567 +	cmp.leu p6,p7=r15,r3
   4.568 +	;;
   4.569 +(p6)	ld8 r20=[r20]				// load address of syscall entry point
   4.570 +(p7)	movl r20=sys_ni_syscall
   4.571 +	;;
   4.572 +	mov b6=r20
   4.573 +	br.call.sptk.many rp=b6			// do the syscall
   4.574 +.strace_check_retval:
   4.575 +	cmp.lt p6,p0=r8,r0			// syscall failed?
   4.576 +	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
   4.577 +	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
   4.578 +	mov r10=0
   4.579 +(p6)	br.cond.sptk strace_error		// syscall failed ->
   4.580 +	;;					// avoid RAW on r10
   4.581 +.strace_save_retval:
   4.582 +.mem.offset 0,0; st8.spill [r2]=r8		// store return value in slot for r8
   4.583 +.mem.offset 8,0; st8.spill [r3]=r10		// clear error indication in slot for r10
   4.584 +	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
   4.585 +.ret3:	br.cond.sptk .work_pending_syscall_end
   4.586 +
   4.587 +strace_error:
   4.588 +	ld8 r3=[r2]				// load pt_regs.r8
   4.589 +	sub r9=0,r8				// negate return value to get errno value
   4.590 +	;;
   4.591 +	cmp.ne p6,p0=r3,r0			// is pt_regs.r8!=0?
   4.592 +	adds r3=16,r2				// r3=&pt_regs.r10
   4.593 +	;;
   4.594 +(p6)	mov r10=-1
   4.595 +(p6)	mov r8=r9
   4.596 +	br.cond.sptk .strace_save_retval
   4.597 +END(ia64_trace_syscall)
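The re-dispatch in ia64_trace_syscall (reload r15, subtract the 1024 syscall base, bounds-check against NR_syscalls, index the 8-byte entries of sys_call_table, fall back to sys_ni_syscall) behaves like the following self-contained C model; the stand-in handlers and table here are illustrative, not the real ones:

	#include <stdio.h>

	typedef long (*handler_t)(void);

	static long ni_syscall_model(void) { return -38; }	/* stand-in sys_ni_syscall (-ENOSYS) */
	static long getpid_model(void)     { return 42;  }	/* stand-in for any real handler     */

	/* slot 0 corresponds to syscall 1024 and must be the ni handler */
	static handler_t table_model[] = { ni_syscall_model, getpid_model };

	static long dispatch_model(long num)
	{
		unsigned long idx = (unsigned long)(num - 1024);	/* adds r15=-1024,r15       */
		unsigned long nr  = sizeof(table_model) / sizeof(table_model[0]);
		if (idx <= nr - 1)					/* cmp.leu p6,p7=r15,r3     */
			return table_model[idx]();			/* shladd + ld8 + br.call   */
		return ni_syscall_model();				/* (p7) movl r20=sys_ni_syscall */
	}

	int main(void)
	{
		printf("%ld %ld %ld\n", dispatch_model(1024), dispatch_model(1025), dispatch_model(2000));
		return 0;
	}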
   4.598 +
   4.599 +	/*
   4.600 +	 * When traced and returning from sigreturn, we invoke syscall_trace but then
   4.601 +	 * go straight to ia64_leave_kernel rather than ia64_leave_syscall.
   4.602 +	 */
   4.603 +GLOBAL_ENTRY(ia64_strace_leave_kernel)
   4.604 +	PT_REGS_UNWIND_INFO(0)
   4.605 +{	/*
   4.606 +	 * Some versions of gas generate bad unwind info if the first instruction of a
   4.607 +	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
   4.608 +	 */
   4.609 +	nop.m 0
   4.610 +	nop.i 0
   4.611 +	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
   4.612 +}
   4.613 +.ret4:	br.cond.sptk ia64_leave_kernel
   4.614 +END(ia64_strace_leave_kernel)
   4.615 +#endif
   4.616 +
   4.617 +GLOBAL_ENTRY(ia64_ret_from_clone)
   4.618 +	PT_REGS_UNWIND_INFO(0)
   4.619 +{	/*
   4.620 +	 * Some versions of gas generate bad unwind info if the first instruction of a
   4.621 +	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
   4.622 +	 */
   4.623 +	nop.m 0
   4.624 +	nop.i 0
   4.625 +	/*
   4.626 +	 * We need to call schedule_tail() to complete the scheduling process.
   4.627 +	 * Called by ia64_switch_to() after do_fork()->copy_thread().  r8 contains the
   4.628 +	 * address of the previously executing task.
   4.629 +	 */
   4.630 +	br.call.sptk.many rp=ia64_invoke_schedule_tail
   4.631 +}
   4.632 +#ifdef XEN
   4.633 +	// new domains are cloned but not exec'ed so switch to user mode here
   4.634 +	cmp.ne pKStk,pUStk=r0,r0
   4.635 +#ifdef CONFIG_VTI
   4.636 +	br.cond.spnt ia64_leave_hypervisor
   4.637 +#else // CONFIG_VTI
   4.638 +	br.cond.spnt ia64_leave_kernel
   4.639 +#endif // CONFIG_VTI
   4.640 +#else
   4.641 +.ret8:
   4.642 +	adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
   4.643 +	;;
   4.644 +	ld4 r2=[r2]
   4.645 +	;;
   4.646 +	mov r8=0
   4.647 +	and r2=_TIF_SYSCALL_TRACEAUDIT,r2
   4.648 +	;;
   4.649 +	cmp.ne p6,p0=r2,r0
   4.650 +(p6)	br.cond.spnt .strace_check_retval
   4.651 +#endif
   4.652 +	;;					// added stop bits to prevent r8 dependency
   4.653 +END(ia64_ret_from_clone)
   4.654 +	// fall through
   4.655 +GLOBAL_ENTRY(ia64_ret_from_syscall)
   4.656 +	PT_REGS_UNWIND_INFO(0)
   4.657 +	cmp.ge p6,p7=r8,r0			// syscall executed successfully?
   4.658 +	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
   4.659 +	mov r10=r0				// clear error indication in r10
   4.660 +(p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
   4.661 +END(ia64_ret_from_syscall)
   4.662 +	// fall through
   4.663 +/*
   4.664 + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
   4.665 + *	need to switch to bank 0 and doesn't restore the scratch registers.
   4.666 + *	To avoid leaking kernel bits, the scratch registers are set to
   4.667 + *	the following known-to-be-safe values:
   4.668 + *
   4.669 + *		  r1: restored (global pointer)
   4.670 + *		  r2: cleared
   4.671 + *		  r3: 1 (when returning to user-level)
   4.672 + *	      r8-r11: restored (syscall return value(s))
   4.673 + *		 r12: restored (user-level stack pointer)
   4.674 + *		 r13: restored (user-level thread pointer)
   4.675 + *		 r14: cleared
   4.676 + *		 r15: restored (syscall #)
   4.677 + *	     r16-r17: cleared
   4.678 + *		 r18: user-level b6
   4.679 + *		 r19: cleared
   4.680 + *		 r20: user-level ar.fpsr
   4.681 + *		 r21: user-level b0
   4.682 + *		 r22: cleared
   4.683 + *		 r23: user-level ar.bspstore
   4.684 + *		 r24: user-level ar.rnat
   4.685 + *		 r25: user-level ar.unat
   4.686 + *		 r26: user-level ar.pfs
   4.687 + *		 r27: user-level ar.rsc
   4.688 + *		 r28: user-level ip
   4.689 + *		 r29: user-level psr
   4.690 + *		 r30: user-level cfm
   4.691 + *		 r31: user-level pr
   4.692 + *	      f6-f11: cleared
   4.693 + *		  pr: restored (user-level pr)
   4.694 + *		  b0: restored (user-level rp)
   4.695 + *	          b6: restored
   4.696 + *		  b7: cleared
   4.697 + *	     ar.unat: restored (user-level ar.unat)
   4.698 + *	      ar.pfs: restored (user-level ar.pfs)
   4.699 + *	      ar.rsc: restored (user-level ar.rsc)
   4.700 + *	     ar.rnat: restored (user-level ar.rnat)
   4.701 + *	 ar.bspstore: restored (user-level ar.bspstore)
   4.702 + *	     ar.fpsr: restored (user-level ar.fpsr)
   4.703 + *	      ar.ccv: cleared
   4.704 + *	      ar.csd: cleared
   4.705 + *	      ar.ssd: cleared
   4.706 + */
   4.707 +ENTRY(ia64_leave_syscall)
   4.708 +	PT_REGS_UNWIND_INFO(0)
   4.709 +	/*
   4.710 +	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
   4.711 +	 * user- or fsys-mode, hence we disable interrupts early on.
   4.712 +	 *
    4.713 +	 * p6 controls whether current_thread_info()->flags needs to be checked for
   4.714 +	 * extra work.  We always check for extra work when returning to user-level.
   4.715 +	 * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
   4.716 +	 * is 0.  After extra work processing has been completed, execution
   4.717 +	 * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
   4.718 +	 * needs to be redone.
   4.719 +	 */
   4.720 +#ifdef CONFIG_PREEMPT
   4.721 +	rsm psr.i				// disable interrupts
   4.722 +	cmp.eq pLvSys,p0=r0,r0			// pLvSys=1: leave from syscall
   4.723 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
   4.724 +	;;
   4.725 +	.pred.rel.mutex pUStk,pKStk
   4.726 +(pKStk) ld4 r21=[r20]			// r21 <- preempt_count
   4.727 +(pUStk)	mov r21=0			// r21 <- 0
   4.728 +	;;
   4.729 +	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
   4.730 +#else /* !CONFIG_PREEMPT */
   4.731 +(pUStk)	rsm psr.i
   4.732 +	cmp.eq pLvSys,p0=r0,r0		// pLvSys=1: leave from syscall
   4.733 +(pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
   4.734 +#endif
   4.735 +.work_processed_syscall:
   4.736 +	adds r2=PT(LOADRS)+16,r12
   4.737 +	adds r3=PT(AR_BSPSTORE)+16,r12
   4.738 +#ifdef XEN
   4.739 +	;;
   4.740 +#else
   4.741 +	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
   4.742 +	;;
   4.743 +(p6)	ld4 r31=[r18]				// load current_thread_info()->flags
   4.744 +#endif
   4.745 +	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
   4.746 +	mov b7=r0		// clear b7
   4.747 +	;;
   4.748 +	ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
   4.749 +	ld8 r18=[r2],PT(R9)-PT(B6)		// load b6
   4.750 +#ifndef XEN
   4.751 +(p6)	and r15=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
   4.752 +#endif
   4.753 +	;;
   4.754 +	mov r16=ar.bsp				// M2  get existing backing store pointer
   4.755 +#ifndef XEN
   4.756 +(p6)	cmp4.ne.unc p6,p0=r15, r0		// any special work pending?
   4.757 +(p6)	br.cond.spnt .work_pending_syscall
   4.758 +#endif
   4.759 +	;;
   4.760 +	// start restoring the state saved on the kernel stack (struct pt_regs):
   4.761 +	ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
   4.762 +	ld8 r11=[r3],PT(CR_IIP)-PT(R11)
   4.763 +	mov f6=f0		// clear f6
   4.764 +	;;
   4.765 +	invala			// M0|1 invalidate ALAT
   4.766 +	rsm psr.i | psr.ic	// M2 initiate turning off of interrupt and interruption collection
   4.767 +	mov f9=f0		// clear f9
   4.768 +
   4.769 +	ld8 r29=[r2],16		// load cr.ipsr
   4.770 +	ld8 r28=[r3],16			// load cr.iip
   4.771 +	mov f8=f0		// clear f8
   4.772 +	;;
   4.773 +	ld8 r30=[r2],16		// M0|1 load cr.ifs
   4.774 +	mov.m ar.ssd=r0		// M2 clear ar.ssd
   4.775 +	cmp.eq p9,p0=r0,r0	// set p9 to indicate that we should restore cr.ifs
   4.776 +	;;
   4.777 +	ld8 r25=[r3],16		// M0|1 load ar.unat
   4.778 +	mov.m ar.csd=r0		// M2 clear ar.csd
   4.779 +	mov r22=r0		// clear r22
   4.780 +	;;
   4.781 +	ld8 r26=[r2],PT(B0)-PT(AR_PFS)	// M0|1 load ar.pfs
   4.782 +(pKStk)	mov r22=psr		// M2 read PSR now that interrupts are disabled
   4.783 +	mov f10=f0		// clear f10
   4.784 +	;;
   4.785 +	ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
   4.786 +	ld8 r27=[r3],PT(PR)-PT(AR_RSC)	// load ar.rsc
   4.787 +	mov f11=f0		// clear f11
   4.788 +	;;
   4.789 +	ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)	// load ar.rnat (may be garbage)
   4.790 +	ld8 r31=[r3],PT(R1)-PT(PR)		// load predicates
   4.791 +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
   4.792 +	;;
   4.793 +	ld8 r20=[r2],PT(R12)-PT(AR_FPSR)	// load ar.fpsr
   4.794 +	ld8.fill r1=[r3],16	// load r1
   4.795 +(pUStk) mov r17=1
   4.796 +	;;
   4.797 +	srlz.d			// M0  ensure interruption collection is off
   4.798 +	ld8.fill r13=[r3],16
   4.799 +	mov f7=f0		// clear f7
   4.800 +	;;
   4.801 +	ld8.fill r12=[r2]	// restore r12 (sp)
   4.802 +	ld8.fill r15=[r3]	// restore r15
   4.803 +#ifdef XEN
   4.804 +	movl r3=THIS_CPU(ia64_phys_stacked_size_p8)
   4.805 +#else
   4.806 +	addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
   4.807 +#endif
   4.808 +	;;
   4.809 +(pUStk)	ld4 r3=[r3]		// r3 = cpu_data->phys_stacked_size_p8
   4.810 +(pUStk) st1 [r14]=r17
   4.811 +	mov b6=r18		// I0  restore b6
   4.812 +	;;
   4.813 +	mov r14=r0		// clear r14
   4.814 +	shr.u r18=r19,16	// I0|1 get byte size of existing "dirty" partition
   4.815 +(pKStk) br.cond.dpnt.many skip_rbs_switch
   4.816 +
   4.817 +	mov.m ar.ccv=r0		// clear ar.ccv
   4.818 +(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
   4.819 +	br.cond.sptk.many rbs_switch
   4.820 +END(ia64_leave_syscall)
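At this point r18 holds the byte size of the dirty RSE partition recorded at syscall entry and r16 the current ar.bsp; the loadrs value eventually written to ar.rsc is computed at rbs_switch below (shared with ia64_leave_kernel). A small C sketch of that arithmetic, illustrative only:

	#include <stdint.h>
	#include <stdio.h>

	/* krbs = bsp before cover minus dirty bytes at entry ("sub r16=r16,r18"),
	 * dirty = bsp after cover minus krbs ("sub r19=r19,r16"), then the result
	 * is shifted into the ar.rsc.loadrs field, bits 29:16 ("shl r19=r19,16"). */
	static uint64_t loadrs_value(uint64_t bsp_before_cover, uint64_t bsp_after_cover,
				     uint64_t ndirty_at_entry)
	{
		uint64_t krbs  = bsp_before_cover - ndirty_at_entry;
		uint64_t dirty = bsp_after_cover  - krbs;
		return dirty << 16;
	}

	int main(void)
	{
		/* example values: 24 dirty bytes at entry, cover advanced bsp by 16 */
		printf("0x%llx\n", (unsigned long long)loadrs_value(0x1000, 0x1010, 24));
		return 0;
	}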
   4.821 +
   4.822 +#ifdef CONFIG_IA32_SUPPORT
   4.823 +GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
   4.824 +	PT_REGS_UNWIND_INFO(0)
   4.825 +	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
   4.826 +	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
   4.827 +	;;
   4.828 +	.mem.offset 0,0
   4.829 +	st8.spill [r2]=r8	// store return value in slot for r8 and set unat bit
   4.830 +	.mem.offset 8,0
   4.831 +	st8.spill [r3]=r0	// clear error indication in slot for r10 and set unat bit
    4.832 +END(ia64_ret_from_ia32_execve)
   4.833 +	// fall through
   4.834 +#endif /* CONFIG_IA32_SUPPORT */
   4.835 +GLOBAL_ENTRY(ia64_leave_kernel)
   4.836 +	PT_REGS_UNWIND_INFO(0)
   4.837 +	/*
   4.838 +	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
   4.839 +	 * user- or fsys-mode, hence we disable interrupts early on.
   4.840 +	 *
    4.841 +	 * p6 controls whether current_thread_info()->flags needs to be checked for
   4.842 +	 * extra work.  We always check for extra work when returning to user-level.
   4.843 +	 * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
   4.844 +	 * is 0.  After extra work processing has been completed, execution
    4.845 +	 * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
   4.846 +	 * needs to be redone.
   4.847 +	 */
   4.848 +#ifdef CONFIG_PREEMPT
   4.849 +	rsm psr.i				// disable interrupts
   4.850 +	cmp.eq p0,pLvSys=r0,r0			// pLvSys=0: leave from kernel
   4.851 +(pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
   4.852 +	;;
   4.853 +	.pred.rel.mutex pUStk,pKStk
   4.854 +(pKStk)	ld4 r21=[r20]			// r21 <- preempt_count
   4.855 +(pUStk)	mov r21=0			// r21 <- 0
   4.856 +	;;
   4.857 +	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
   4.858 +#else
   4.859 +(pUStk)	rsm psr.i
   4.860 +	cmp.eq p0,pLvSys=r0,r0		// pLvSys=0: leave from kernel
   4.861 +(pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
   4.862 +#endif
   4.863 +.work_processed_kernel:
   4.864 +#ifdef XEN
   4.865 +	alloc loc0=ar.pfs,0,1,1,0
   4.866 +	adds out0=16,r12
   4.867 +	;;
   4.868 +(p6)	br.call.sptk.many b0=deliver_pending_interrupt
   4.869 +	mov ar.pfs=loc0
   4.870 +	mov r31=r0
   4.871 +#else
   4.872 +	adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
   4.873 +	;;
   4.874 +(p6)	ld4 r31=[r17]				// load current_thread_info()->flags
   4.875 +#endif
   4.876 +	adds r21=PT(PR)+16,r12
   4.877 +	;;
   4.878 +
   4.879 +	lfetch [r21],PT(CR_IPSR)-PT(PR)
   4.880 +	adds r2=PT(B6)+16,r12
   4.881 +	adds r3=PT(R16)+16,r12
   4.882 +	;;
   4.883 +	lfetch [r21]
   4.884 +	ld8 r28=[r2],8		// load b6
   4.885 +	adds r29=PT(R24)+16,r12
   4.886 +
   4.887 +	ld8.fill r16=[r3]
   4.888 +	adds r30=PT(AR_CCV)+16,r12
   4.889 +(p6)	and r19=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
   4.890 +	;;
   4.891 +	adds r3=PT(AR_CSD)-PT(R16),r3
   4.892 +	ld8.fill r24=[r29]
   4.893 +	ld8 r15=[r30]		// load ar.ccv
   4.894 +(p6)	cmp4.ne.unc p6,p0=r19, r0		// any special work pending?
   4.895 +	;;
   4.896 +	ld8 r29=[r2],16		// load b7
   4.897 +	ld8 r30=[r3],16		// load ar.csd
   4.898 +#ifndef XEN
   4.899 +(p6)	br.cond.spnt .work_pending
   4.900 +#endif
   4.901 +	;;
   4.902 +	ld8 r31=[r2],16		// load ar.ssd
   4.903 +	ld8.fill r8=[r3],16
   4.904 +	;;
   4.905 +	ld8.fill r9=[r2],16
   4.906 +	ld8.fill r10=[r3],PT(R17)-PT(R10)
   4.907 +	;;
   4.908 +	ld8.fill r11=[r2],PT(R18)-PT(R11)
   4.909 +	ld8.fill r17=[r3],16
   4.910 +	;;
   4.911 +	ld8.fill r18=[r2],16
   4.912 +	ld8.fill r19=[r3],16
   4.913 +	;;
   4.914 +	ld8.fill r20=[r2],16
   4.915 +	ld8.fill r21=[r3],16
   4.916 +	mov ar.csd=r30
   4.917 +	mov ar.ssd=r31
   4.918 +	;;
   4.919 +	rsm psr.i | psr.ic	// initiate turning off of interrupt and interruption collection
   4.920 +	invala			// invalidate ALAT
   4.921 +	;;
   4.922 +	ld8.fill r22=[r2],24
   4.923 +	ld8.fill r23=[r3],24
   4.924 +	mov b6=r28
   4.925 +	;;
   4.926 +	ld8.fill r25=[r2],16
   4.927 +	ld8.fill r26=[r3],16
   4.928 +	mov b7=r29
   4.929 +	;;
   4.930 +	ld8.fill r27=[r2],16
   4.931 +	ld8.fill r28=[r3],16
   4.932 +	;;
   4.933 +	ld8.fill r29=[r2],16
   4.934 +	ld8.fill r30=[r3],24
   4.935 +	;;
   4.936 +	ld8.fill r31=[r2],PT(F9)-PT(R31)
   4.937 +	adds r3=PT(F10)-PT(F6),r3
   4.938 +	;;
   4.939 +	ldf.fill f9=[r2],PT(F6)-PT(F9)
   4.940 +	ldf.fill f10=[r3],PT(F8)-PT(F10)
   4.941 +	;;
   4.942 +	ldf.fill f6=[r2],PT(F7)-PT(F6)
   4.943 +	;;
   4.944 +	ldf.fill f7=[r2],PT(F11)-PT(F7)
   4.945 +	ldf.fill f8=[r3],32
   4.946 +	;;
   4.947 +	srlz.i			// ensure interruption collection is off
   4.948 +	mov ar.ccv=r15
   4.949 +	;;
   4.950 +	ldf.fill f11=[r2]
   4.951 +	bsw.0			// switch back to bank 0 (no stop bit required beforehand...)
   4.952 +	;;
   4.953 +(pUStk)	mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
   4.954 +	adds r16=PT(CR_IPSR)+16,r12
   4.955 +	adds r17=PT(CR_IIP)+16,r12
   4.956 +
   4.957 +(pKStk)	mov r22=psr		// M2 read PSR now that interrupts are disabled
   4.958 +	nop.i 0
   4.959 +	nop.i 0
   4.960 +	;;
   4.961 +	ld8 r29=[r16],16	// load cr.ipsr
   4.962 +	ld8 r28=[r17],16	// load cr.iip
   4.963 +	;;
   4.964 +	ld8 r30=[r16],16	// load cr.ifs
   4.965 +	ld8 r25=[r17],16	// load ar.unat
   4.966 +	;;
   4.967 +	ld8 r26=[r16],16	// load ar.pfs
   4.968 +	ld8 r27=[r17],16	// load ar.rsc
   4.969 +	cmp.eq p9,p0=r0,r0	// set p9 to indicate that we should restore cr.ifs
   4.970 +	;;
   4.971 +	ld8 r24=[r16],16	// load ar.rnat (may be garbage)
   4.972 +	ld8 r23=[r17],16	// load ar.bspstore (may be garbage)
   4.973 +	;;
   4.974 +	ld8 r31=[r16],16	// load predicates
   4.975 +	ld8 r21=[r17],16	// load b0
   4.976 +	;;
   4.977 +	ld8 r19=[r16],16	// load ar.rsc value for "loadrs"
   4.978 +	ld8.fill r1=[r17],16	// load r1
   4.979 +	;;
   4.980 +	ld8.fill r12=[r16],16
   4.981 +	ld8.fill r13=[r17],16
   4.982 +(pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
   4.983 +	;;
   4.984 +	ld8 r20=[r16],16	// ar.fpsr
   4.985 +	ld8.fill r15=[r17],16
   4.986 +	;;
   4.987 +	ld8.fill r14=[r16],16
   4.988 +	ld8.fill r2=[r17]
   4.989 +(pUStk)	mov r17=1
   4.990 +	;;
   4.991 +	ld8.fill r3=[r16]
   4.992 +(pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
   4.993 +	shr.u r18=r19,16	// get byte size of existing "dirty" partition
   4.994 +	;;
   4.995 +	mov r16=ar.bsp		// get existing backing store pointer
   4.996 +#ifdef XEN
   4.997 +	movl r17=THIS_CPU(ia64_phys_stacked_size_p8)
   4.998 +#else
   4.999 +	addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
  4.1000 +#endif
  4.1001 +	;;
  4.1002 +	ld4 r17=[r17]		// r17 = cpu_data->phys_stacked_size_p8
  4.1003 +(pKStk)	br.cond.dpnt skip_rbs_switch
  4.1004 +
  4.1005 +	/*
  4.1006 +	 * Restore user backing store.
  4.1007 +	 *
  4.1008 +	 * NOTE: alloc, loadrs, and cover can't be predicated.
  4.1009 +	 */
  4.1010 +(pNonSys) br.cond.dpnt dont_preserve_current_frame
  4.1011 +
  4.1012 +rbs_switch:
  4.1013 +	cover				// add current frame into dirty partition and set cr.ifs
  4.1014 +	;;
  4.1015 +	mov r19=ar.bsp			// get new backing store pointer
  4.1016 +	sub r16=r16,r18			// krbs = old bsp - size of dirty partition
  4.1017 +	cmp.ne p9,p0=r0,r0		// clear p9 to skip restore of cr.ifs
  4.1018 +	;;
  4.1019 +	sub r19=r19,r16			// calculate total byte size of dirty partition
  4.1020 +	add r18=64,r18			// don't force in0-in7 into memory...
  4.1021 +	;;
  4.1022 +	shl r19=r19,16			// shift size of dirty partition into loadrs position
  4.1023 +	;;
  4.1024 +dont_preserve_current_frame:
  4.1025 +	/*
  4.1026 +	 * To prevent leaking bits between the kernel and user-space,
  4.1027 +	 * we must clear the stacked registers in the "invalid" partition here.
  4.1028 +	 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
  4.1029 +	 * 5 registers/cycle on McKinley).
  4.1030 +	 */
  4.1031 +#	define pRecurse	p6
  4.1032 +#	define pReturn	p7
  4.1033 +#ifdef CONFIG_ITANIUM
  4.1034 +#	define Nregs	10
  4.1035 +#else
  4.1036 +#	define Nregs	14
  4.1037 +#endif
  4.1038 +	alloc loc0=ar.pfs,2,Nregs-2,2,0
  4.1039 +	shr.u loc1=r18,9		// RNaTslots <= floor(dirtySize / (64*8))
  4.1040 +	sub r17=r17,r18			// r17 = (physStackedSize + 8) - dirtySize
  4.1041 +	;;
  4.1042 +	mov ar.rsc=r19			// load ar.rsc to be used for "loadrs"
  4.1043 +	shladd in0=loc1,3,r17
  4.1044 +	mov in1=0
  4.1045 +	;;
  4.1046 +	TEXT_ALIGN(32)
  4.1047 +rse_clear_invalid:
  4.1048 +#ifdef CONFIG_ITANIUM
  4.1049 +	// cycle 0
  4.1050 + { .mii
  4.1051 +	alloc loc0=ar.pfs,2,Nregs-2,2,0
  4.1052 +	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
  4.1053 +	add out0=-Nregs*8,in0
  4.1054 +}{ .mfb
  4.1055 +	add out1=1,in1			// increment recursion count
  4.1056 +	nop.f 0
  4.1057 +	nop.b 0				// can't do br.call here because of alloc (WAW on CFM)
  4.1058 +	;;
  4.1059 +}{ .mfi	// cycle 1
  4.1060 +	mov loc1=0
  4.1061 +	nop.f 0
  4.1062 +	mov loc2=0
  4.1063 +}{ .mib
  4.1064 +	mov loc3=0
  4.1065 +	mov loc4=0
  4.1066 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid
  4.1067 +
  4.1068 +}{ .mfi	// cycle 2
  4.1069 +	mov loc5=0
  4.1070 +	nop.f 0
  4.1071 +	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
  4.1072 +}{ .mib
  4.1073 +	mov loc6=0
  4.1074 +	mov loc7=0
  4.1075 +(pReturn) br.ret.sptk.many b0
  4.1076 +}
  4.1077 +#else /* !CONFIG_ITANIUM */
  4.1078 +	alloc loc0=ar.pfs,2,Nregs-2,2,0
  4.1079 +	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
  4.1080 +	add out0=-Nregs*8,in0
  4.1081 +	add out1=1,in1			// increment recursion count
  4.1082 +	mov loc1=0
  4.1083 +	mov loc2=0
  4.1084 +	;;
  4.1085 +	mov loc3=0
  4.1086 +	mov loc4=0
  4.1087 +	mov loc5=0
  4.1088 +	mov loc6=0
  4.1089 +	mov loc7=0
  4.1090 +(pRecurse) br.call.sptk.few b0=rse_clear_invalid
  4.1091 +	;;
  4.1092 +	mov loc8=0
  4.1093 +	mov loc9=0
  4.1094 +	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
  4.1095 +	mov loc10=0
  4.1096 +	mov loc11=0
  4.1097 +(pReturn) br.ret.sptk.many b0
  4.1098 +#endif /* !CONFIG_ITANIUM */
  4.1099 +#	undef pRecurse
  4.1100 +#	undef pReturn
  4.1101 +	;;
  4.1102 +	alloc r17=ar.pfs,0,0,0,0	// drop current register frame
  4.1103 +	;;
  4.1104 +	loadrs
  4.1105 +	;;
  4.1106 +skip_rbs_switch:
  4.1107 +	mov ar.unat=r25		// M2
  4.1108 +(pKStk)	extr.u r22=r22,21,1	// I0 extract current value of psr.pp from r22
  4.1109 +(pLvSys)mov r19=r0		// A  clear r19 for leave_syscall, no-op otherwise
  4.1110 +	;;
  4.1111 +(pUStk)	mov ar.bspstore=r23	// M2
  4.1112 +(pKStk)	dep r29=r22,r29,21,1	// I0 update ipsr.pp with psr.pp
  4.1113 +(pLvSys)mov r16=r0		// A  clear r16 for leave_syscall, no-op otherwise
  4.1114 +	;;
  4.1115 +	mov cr.ipsr=r29		// M2
  4.1116 +	mov ar.pfs=r26		// I0
  4.1117 +(pLvSys)mov r17=r0		// A  clear r17 for leave_syscall, no-op otherwise
  4.1118 +
  4.1119 +(p9)	mov cr.ifs=r30		// M2
  4.1120 +	mov b0=r21		// I0
  4.1121 +(pLvSys)mov r18=r0		// A  clear r18 for leave_syscall, no-op otherwise
  4.1122 +
  4.1123 +	mov ar.fpsr=r20		// M2
  4.1124 +	mov cr.iip=r28		// M2
  4.1125 +	nop 0
  4.1126 +	;;
  4.1127 +(pUStk)	mov ar.rnat=r24		// M2 must happen with RSE in lazy mode
  4.1128 +	nop 0
  4.1129 +(pLvSys)mov r2=r0
  4.1130 +
  4.1131 +	mov ar.rsc=r27		// M2
  4.1132 +	mov pr=r31,-1		// I0
  4.1133 +	rfi			// B
  4.1134 +
  4.1135 +#ifndef XEN
  4.1136 +	/*
  4.1137 +	 * On entry:
  4.1138 +	 *	r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT)
  4.1139 +	 *	r31 = current->thread_info->flags
  4.1140 +	 * On exit:
  4.1141 +	 *	p6 = TRUE if work-pending-check needs to be redone
  4.1142 +	 */
  4.1143 +.work_pending_syscall:
  4.1144 +	add r2=-8,r2
  4.1145 +	add r3=-8,r3
  4.1146 +	;;
  4.1147 +	st8 [r2]=r8
  4.1148 +	st8 [r3]=r10
  4.1149 +.work_pending:
  4.1150 +	tbit.nz p6,p0=r31,TIF_SIGDELAYED		// signal delayed from  MCA/INIT/NMI/PMI context?
  4.1151 +(p6)	br.cond.sptk.few .sigdelayed
  4.1152 +	;;
  4.1153 +	tbit.z p6,p0=r31,TIF_NEED_RESCHED		// current_thread_info()->need_resched==0?
  4.1154 +(p6)	br.cond.sptk.few .notify
  4.1155 +#ifdef CONFIG_PREEMPT
  4.1156 +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
  4.1157 +	;;
  4.1158 +(pKStk) st4 [r20]=r21
  4.1159 +	ssm psr.i		// enable interrupts
  4.1160 +#endif
  4.1161 +	br.call.spnt.many rp=schedule
  4.1162 +.ret9:	cmp.eq p6,p0=r0,r0				// p6 <- 1
  4.1163 +	rsm psr.i		// disable interrupts
  4.1164 +	;;
  4.1165 +#ifdef CONFIG_PREEMPT
  4.1166 +(pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
  4.1167 +	;;
  4.1168 +(pKStk)	st4 [r20]=r0		// preempt_count() <- 0
  4.1169 +#endif
  4.1170 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
  4.1171 +	br.cond.sptk.many .work_processed_kernel	// re-check
  4.1172 +
  4.1173 +.notify:
  4.1174 +(pUStk)	br.call.spnt.many rp=notify_resume_user
  4.1175 +.ret10:	cmp.ne p6,p0=r0,r0				// p6 <- 0
  4.1176 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
  4.1177 +	br.cond.sptk.many .work_processed_kernel	// don't re-check
  4.1178 +
  4.1179 +// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
  4.1180 +// it could not be delivered.  Deliver it now.  The signal might be for us and
  4.1181 +// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
  4.1182 +// signal.
  4.1183 +
  4.1184 +.sigdelayed:
  4.1185 +	br.call.sptk.many rp=do_sigdelayed
  4.1186 +	cmp.eq p6,p0=r0,r0				// p6 <- 1, always re-check
  4.1187 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
  4.1188 +	br.cond.sptk.many .work_processed_kernel	// re-check
  4.1189 +
  4.1190 +.work_pending_syscall_end:
  4.1191 +	adds r2=PT(R8)+16,r12
  4.1192 +	adds r3=PT(R10)+16,r12
  4.1193 +	;;
  4.1194 +	ld8 r8=[r2]
  4.1195 +	ld8 r10=[r3]
  4.1196 +	br.cond.sptk.many .work_processed_syscall	// re-check
  4.1197 +#endif
  4.1198 +
  4.1199 +END(ia64_leave_kernel)
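The rse_clear_invalid loop above is easier to see as recursion over the number of bytes still to be scrubbed: each level allocates Nregs stacked registers, zeroes its locals, recurses while more than Nregs*8 bytes remain, and unwinds via br.ret on every level except the outermost. A hedged C analogue (a model of the control flow, not equivalent code):

	#include <stdio.h>

	#define NREGS_MODEL 14		/* Nregs on McKinley above; 10 on Itanium */

	/* The returned count plays the role of "in1" above: every level but the
	 * outermost takes the (pReturn) br.ret. */
	static long clear_invalid_model(long bytes_left)
	{
		long zeroed[NREGS_MODEL - 2] = { 0 };	/* stands in for "mov locN=0"     */
		(void)zeroed;
		if (bytes_left > NREGS_MODEL * 8)	/* cmp.lt pRecurse,p0=Nregs*8,in0 */
			return 1 + clear_invalid_model(bytes_left - NREGS_MODEL * 8);
		return 1;
	}

	int main(void)
	{
		printf("%ld recursion levels for 400 bytes\n", clear_invalid_model(400));
		return 0;
	}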
  4.1200 +
  4.1201 +ENTRY(handle_syscall_error)
  4.1202 +	/*
  4.1203 +	 * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
   4.1204 +	 * lead us to mistake a negative return value for a failed syscall.  Those syscalls
  4.1205 +	 * must deposit a non-zero value in pt_regs.r8 to indicate an error.  If
  4.1206 +	 * pt_regs.r8 is zero, we assume that the call completed successfully.
  4.1207 +	 */
  4.1208 +	PT_REGS_UNWIND_INFO(0)
  4.1209 +	ld8 r3=[r2]		// load pt_regs.r8
  4.1210 +	;;
  4.1211 +	cmp.eq p6,p7=r3,r0	// is pt_regs.r8==0?
  4.1212 +	;;
  4.1213 +(p7)	mov r10=-1
  4.1214 +(p7)	sub r8=0,r8		// negate return value to get errno
  4.1215 +	br.cond.sptk ia64_leave_syscall
  4.1216 +END(handle_syscall_error)
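The convention that handle_syscall_error (and strace_error above) implements can be summarised in C: a negative return value only becomes an error if the handler also left pt_regs.r8 non-zero; in that case r10 is set to -1 and the value is negated into a positive errno. Sketch only, with an illustrative pt_regs stand-in:

	#include <stdint.h>

	struct pt_regs_sketch { int64_t r8, r10; };	/* only the fields used here */

	/* retval is the live r8 on return from the handler; regs->r8 is the value
	 * the handler deposited (non-zero means "this negative value is an error"). */
	int64_t finish_syscall_model(int64_t retval, struct pt_regs_sketch *regs)
	{
		regs->r10 = 0;				/* mov r10=r0: default, no error */
		if (retval >= 0 || regs->r8 == 0)	/* success, or a legitimate      */
			return retval;			/* negative return (mmap etc.)   */
		regs->r10 = -1;				/* (p7) mov r10=-1               */
		return -retval;				/* (p7) sub r8=0,r8              */
	}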
  4.1217 +
  4.1218 +	/*
  4.1219 +	 * Invoke schedule_tail(task) while preserving in0-in7, which may be needed
  4.1220 +	 * in case a system call gets restarted.
  4.1221 +	 */
  4.1222 +GLOBAL_ENTRY(ia64_invoke_schedule_tail)
  4.1223 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
  4.1224 +	alloc loc1=ar.pfs,8,2,1,0
  4.1225 +	mov loc0=rp
  4.1226 +	mov out0=r8				// Address of previous task
  4.1227 +	;;
  4.1228 +	br.call.sptk.many rp=schedule_tail
  4.1229 +.ret11:	mov ar.pfs=loc1
  4.1230 +	mov rp=loc0
  4.1231 +	br.ret.sptk.many rp
  4.1232 +END(ia64_invoke_schedule_tail)
  4.1233 +
  4.1234 +#ifndef XEN
  4.1235 +	/*
   4.1236 +	 * Set up the stack and call do_notify_resume_user().  Note that pSys and pNonSys need to
  4.1237 +	 * be set up by the caller.  We declare 8 input registers so the system call
  4.1238 +	 * args get preserved, in case we need to restart a system call.
  4.1239 +	 */
  4.1240 +ENTRY(notify_resume_user)
  4.1241 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
  4.1242 +	alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
  4.1243 +	mov r9=ar.unat
  4.1244 +	mov loc0=rp				// save return address
  4.1245 +	mov out0=0				// there is no "oldset"
  4.1246 +	adds out1=8,sp				// out1=&sigscratch->ar_pfs
  4.1247 +(pSys)	mov out2=1				// out2==1 => we're in a syscall
  4.1248 +	;;
  4.1249 +(pNonSys) mov out2=0				// out2==0 => not a syscall
  4.1250 +	.fframe 16
  4.1251 +	.spillpsp ar.unat, 16			// (note that offset is relative to psp+0x10!)
  4.1252 +	st8 [sp]=r9,-16				// allocate space for ar.unat and save it
  4.1253 +	st8 [out1]=loc1,-8			// save ar.pfs, out1=&sigscratch
  4.1254 +	.body
  4.1255 +	br.call.sptk.many rp=do_notify_resume_user
  4.1256 +.ret15:	.restore sp
  4.1257 +	adds sp=16,sp				// pop scratch stack space
  4.1258 +	;;
  4.1259 +	ld8 r9=[sp]				// load new unat from sigscratch->scratch_unat
  4.1260 +	mov rp=loc0
  4.1261 +	;;
  4.1262 +	mov ar.unat=r9
  4.1263 +	mov ar.pfs=loc1
  4.1264 +	br.ret.sptk.many rp
  4.1265 +END(notify_resume_user)
  4.1266 +
  4.1267 +GLOBAL_ENTRY(sys_rt_sigsuspend)
  4.1268 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
  4.1269 +	alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
  4.1270 +	mov r9=ar.unat
  4.1271 +	mov loc0=rp				// save return address
  4.1272 +	mov out0=in0				// mask
  4.1273 +	mov out1=in1				// sigsetsize
  4.1274 +	adds out2=8,sp				// out2=&sigscratch->ar_pfs
  4.1275 +	;;
  4.1276 +	.fframe 16
  4.1277 +	.spillpsp ar.unat, 16			// (note that offset is relative to psp+0x10!)
  4.1278 +	st8 [sp]=r9,-16				// allocate space for ar.unat and save it
  4.1279 +	st8 [out2]=loc1,-8			// save ar.pfs, out2=&sigscratch
  4.1280 +	.body
  4.1281 +	br.call.sptk.many rp=ia64_rt_sigsuspend
  4.1282 +.ret17:	.restore sp
  4.1283 +	adds sp=16,sp				// pop scratch stack space
  4.1284 +	;;
  4.1285 +	ld8 r9=[sp]				// load new unat from sw->caller_unat
  4.1286 +	mov rp=loc0
  4.1287 +	;;
  4.1288 +	mov ar.unat=r9
  4.1289 +	mov ar.pfs=loc1
  4.1290 +	br.ret.sptk.many rp
  4.1291 +END(sys_rt_sigsuspend)
  4.1292 +
  4.1293 +ENTRY(sys_rt_sigreturn)
  4.1294 +	PT_REGS_UNWIND_INFO(0)
  4.1295 +	/*
  4.1296 +	 * Allocate 8 input registers since ptrace() may clobber them
  4.1297 +	 */
  4.1298 +	alloc r2=ar.pfs,8,0,1,0
  4.1299 +	.prologue
  4.1300 +	PT_REGS_SAVES(16)
  4.1301 +	adds sp=-16,sp
  4.1302 +	.body
  4.1303 +	cmp.eq pNonSys,pSys=r0,r0		// sigreturn isn't a normal syscall...
  4.1304 +	;;
  4.1305 +	/*
  4.1306 +	 * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined
   4.1307 +	 * syscall-entry path does not save them, we save them here instead.  Note: we
   4.1308 +	 * don't need to save any other registers that are not saved by the streamlined
  4.1309 +	 * syscall path, because restore_sigcontext() restores them.
  4.1310 +	 */
  4.1311 +	adds r16=PT(F6)+32,sp
  4.1312 +	adds r17=PT(F7)+32,sp
  4.1313 +	;;
  4.1314 + 	stf.spill [r16]=f6,32
  4.1315 + 	stf.spill [r17]=f7,32
  4.1316 +	;;
  4.1317 + 	stf.spill [r16]=f8,32
  4.1318 + 	stf.spill [r17]=f9,32
  4.1319 +	;;
  4.1320 + 	stf.spill [r16]=f10
  4.1321 + 	stf.spill [r17]=f11
  4.1322 +	adds out0=16,sp				// out0 = &sigscratch
  4.1323 +	br.call.sptk.many rp=ia64_rt_sigreturn
  4.1324 +.ret19:	.restore sp 0
  4.1325 +	adds sp=16,sp
  4.1326 +	;;
  4.1327 +	ld8 r9=[sp]				// load new ar.unat
  4.1328 +	mov.sptk b7=r8,ia64_leave_kernel
  4.1329 +	;;
  4.1330 +	mov ar.unat=r9
  4.1331 +	br.many b7
  4.1332 +END(sys_rt_sigreturn)
  4.1333 +#endif
  4.1334 +
  4.1335 +GLOBAL_ENTRY(ia64_prepare_handle_unaligned)
  4.1336 +	.prologue
  4.1337 +	/*
  4.1338 +	 * r16 = fake ar.pfs, we simply need to make sure privilege is still 0
  4.1339 +	 */
  4.1340 +	mov r16=r0
  4.1341 +	DO_SAVE_SWITCH_STACK
  4.1342 +	br.call.sptk.many rp=ia64_handle_unaligned	// stack frame setup in ivt
  4.1343 +.ret21:	.body
  4.1344 +	DO_LOAD_SWITCH_STACK
  4.1345 +	br.cond.sptk.many rp				// goes to ia64_leave_kernel
  4.1346 +END(ia64_prepare_handle_unaligned)
  4.1347 +
  4.1348 +#ifndef XEN
  4.1349 +	//
  4.1350 +	// unw_init_running(void (*callback)(info, arg), void *arg)
  4.1351 +	//
  4.1352 +#	define EXTRA_FRAME_SIZE	((UNW_FRAME_INFO_SIZE+15)&~15)
  4.1353 +
  4.1354 +GLOBAL_ENTRY(unw_init_running)
  4.1355 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
  4.1356 +	alloc loc1=ar.pfs,2,3,3,0
  4.1357 +	;;
  4.1358 +	ld8 loc2=[in0],8
  4.1359 +	mov loc0=rp
  4.1360 +	mov r16=loc1
  4.1361 +	DO_SAVE_SWITCH_STACK
  4.1362 +	.body
  4.1363 +
  4.1364 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
  4.1365 +	.fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE
  4.1366 +	SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE)
  4.1367 +	adds sp=-EXTRA_FRAME_SIZE,sp
  4.1368 +	.body
  4.1369 +	;;
  4.1370 +	adds out0=16,sp				// &info
  4.1371 +	mov out1=r13				// current
  4.1372 +	adds out2=16+EXTRA_FRAME_SIZE,sp	// &switch_stack
  4.1373 +	br.call.sptk.many rp=unw_init_frame_info
  4.1374 +1:	adds out0=16,sp				// &info
  4.1375 +	mov b6=loc2
  4.1376 +	mov loc2=gp				// save gp across indirect function call
  4.1377 +	;;
  4.1378 +	ld8 gp=[in0]
  4.1379 +	mov out1=in1				// arg
  4.1380 +	br.call.sptk.many rp=b6			// invoke the callback function
  4.1381 +1:	mov gp=loc2				// restore gp
  4.1382 +
  4.1383 +	// For now, we don't allow changing registers from within
  4.1384 +	// unw_init_running; if we ever want to allow that, we'd
  4.1385 +	// have to do a load_switch_stack here:
  4.1386 +	.restore sp
  4.1387 +	adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp
  4.1388 +
  4.1389 +	mov ar.pfs=loc1
  4.1390 +	mov rp=loc0
  4.1391 +	br.ret.sptk.many rp
  4.1392 +END(unw_init_running)
  4.1393 +
  4.1394 +	.rodata
  4.1395 +	.align 8
  4.1396 +	.globl sys_call_table
  4.1397 +sys_call_table:
  4.1398 +	data8 sys_ni_syscall		//  This must be sys_ni_syscall!  See ivt.S.
  4.1399 +	data8 sys_exit				// 1025
  4.1400 +	data8 sys_read
  4.1401 +	data8 sys_write
  4.1402 +	data8 sys_open
  4.1403 +	data8 sys_close
  4.1404 +	data8 sys_creat				// 1030
  4.1405 +	data8 sys_link
  4.1406 +	data8 sys_unlink
  4.1407 +	data8 ia64_execve
  4.1408 +	data8 sys_chdir
  4.1409 +	data8 sys_fchdir			// 1035
  4.1410 +	data8 sys_utimes
  4.1411 +	data8 sys_mknod
  4.1412 +	data8 sys_chmod
  4.1413 +	data8 sys_chown
  4.1414 +	data8 sys_lseek				// 1040
  4.1415 +	data8 sys_getpid
  4.1416 +	data8 sys_getppid
  4.1417 +	data8 sys_mount
  4.1418 +	data8 sys_umount
  4.1419 +	data8 sys_setuid			// 1045
  4.1420 +	data8 sys_getuid
  4.1421 +	data8 sys_geteuid
  4.1422 +	data8 sys_ptrace
  4.1423 +	data8 sys_access
  4.1424 +	data8 sys_sync				// 1050
  4.1425 +	data8 sys_fsync
  4.1426 +	data8 sys_fdatasync
  4.1427 +	data8 sys_kill
  4.1428 +	data8 sys_rename
  4.1429 +	data8 sys_mkdir				// 1055
  4.1430 +	data8 sys_rmdir
  4.1431 +	data8 sys_dup
  4.1432 +	data8 sys_pipe
  4.1433 +	data8 sys_times
  4.1434 +	data8 ia64_brk				// 1060
  4.1435 +	data8 sys_setgid
  4.1436 +	data8 sys_getgid
  4.1437 +	data8 sys_getegid
  4.1438 +	data8 sys_acct
  4.1439 +	data8 sys_ioctl				// 1065
  4.1440 +	data8 sys_fcntl
  4.1441 +	data8 sys_umask
  4.1442 +	data8 sys_chroot
  4.1443 +	data8 sys_ustat
  4.1444 +	data8 sys_dup2				// 1070
  4.1445 +	data8 sys_setreuid
  4.1446 +	data8 sys_setregid
  4.1447 +	data8 sys_getresuid
  4.1448 +	data8 sys_setresuid
  4.1449 +	data8 sys_getresgid			// 1075
  4.1450 +	data8 sys_setresgid
  4.1451 +	data8 sys_getgroups
  4.1452 +	data8 sys_setgroups
  4.1453 +	data8 sys_getpgid
  4.1454 +	data8 sys_setpgid			// 1080
  4.1455 +	data8 sys_setsid
  4.1456 +	data8 sys_getsid
  4.1457 +	data8 sys_sethostname
  4.1458 +	data8 sys_setrlimit
  4.1459 +	data8 sys_getrlimit			// 1085
  4.1460 +	data8 sys_getrusage
  4.1461 +	data8 sys_gettimeofday
  4.1462 +	data8 sys_settimeofday
  4.1463 +	data8 sys_select
  4.1464 +	data8 sys_poll				// 1090
  4.1465 +	data8 sys_symlink
  4.1466 +	data8 sys_readlink
  4.1467 +	data8 sys_uselib
  4.1468 +	data8 sys_swapon
  4.1469 +	data8 sys_swapoff			// 1095
  4.1470 +	data8 sys_reboot
  4.1471 +	data8 sys_truncate
  4.1472 +	data8 sys_ftruncate
  4.1473 +	data8 sys_fchmod
  4.1474 +	data8 sys_fchown			// 1100
  4.1475 +	data8 ia64_getpriority
  4.1476 +	data8 sys_setpriority
  4.1477 +	data8 sys_statfs
  4.1478 +	data8 sys_fstatfs
  4.1479 +	data8 sys_gettid			// 1105
  4.1480 +	data8 sys_semget
  4.1481 +	data8 sys_semop
  4.1482 +	data8 sys_semctl
  4.1483 +	data8 sys_msgget
  4.1484 +	data8 sys_msgsnd			// 1110
  4.1485 +	data8 sys_msgrcv
  4.1486 +	data8 sys_msgctl
  4.1487 +	data8 sys_shmget
  4.1488 +	data8 ia64_shmat
  4.1489 +	data8 sys_shmdt				// 1115
  4.1490 +	data8 sys_shmctl
  4.1491 +	data8 sys_syslog
  4.1492 +	data8 sys_setitimer
  4.1493 +	data8 sys_getitimer
  4.1494 +	data8 sys_ni_syscall			// 1120		/* was: ia64_oldstat */
  4.1495 +	data8 sys_ni_syscall					/* was: ia64_oldlstat */
  4.1496 +	data8 sys_ni_syscall					/* was: ia64_oldfstat */
  4.1497 +	data8 sys_vhangup
  4.1498 +	data8 sys_lchown
  4.1499 +	data8 sys_remap_file_pages		// 1125
  4.1500 +	data8 sys_wait4
  4.1501 +	data8 sys_sysinfo
  4.1502 +	data8 sys_clone
  4.1503 +	data8 sys_setdomainname
  4.1504 +	data8 sys_newuname			// 1130
  4.1505 +	data8 sys_adjtimex
  4.1506 +	data8 sys_ni_syscall					/* was: ia64_create_module */
  4.1507 +	data8 sys_init_module
  4.1508 +	data8 sys_delete_module
  4.1509 +	data8 sys_ni_syscall			// 1135		/* was: sys_get_kernel_syms */
  4.1510 +	data8 sys_ni_syscall					/* was: sys_query_module */
  4.1511 +	data8 sys_quotactl
  4.1512 +	data8 sys_bdflush
  4.1513 +	data8 sys_sysfs
  4.1514 +	data8 sys_personality			// 1140
  4.1515 +	data8 sys_ni_syscall		// sys_afs_syscall
  4.1516 +	data8 sys_setfsuid
  4.1517 +	data8 sys_setfsgid
  4.1518 +	data8 sys_getdents
  4.1519 +	data8 sys_flock				// 1145
  4.1520 +	data8 sys_readv
  4.1521 +	data8 sys_writev
  4.1522 +	data8 sys_pread64
  4.1523 +	data8 sys_pwrite64
  4.1524 +	data8 sys_sysctl			// 1150
  4.1525 +	data8 sys_mmap
  4.1526 +	data8 sys_munmap
  4.1527 +	data8 sys_mlock
  4.1528 +	data8 sys_mlockall
  4.1529 +	data8 sys_mprotect			// 1155
  4.1530 +	data8 ia64_mremap
  4.1531 +	data8 sys_msync
  4.1532 +	data8 sys_munlock
  4.1533 +	data8 sys_munlockall
  4.1534 +	data8 sys_sched_getparam		// 1160
  4.1535 +	data8 sys_sched_setparam
  4.1536 +	data8 sys_sched_getscheduler
  4.1537 +	data8 sys_sched_setscheduler
  4.1538 +	data8 sys_sched_yield
  4.1539 +	data8 sys_sched_get_priority_max	// 1165
  4.1540 +	data8 sys_sched_get_priority_min
  4.1541 +	data8 sys_sched_rr_get_interval
  4.1542 +	data8 sys_nanosleep
  4.1543 +	data8 sys_nfsservctl
  4.1544 +	data8 sys_prctl				// 1170
  4.1545 +	data8 sys_getpagesize
  4.1546 +	data8 sys_mmap2
  4.1547 +	data8 sys_pciconfig_read
  4.1548 +	data8 sys_pciconfig_write
  4.1549 +	data8 sys_perfmonctl			// 1175
  4.1550 +	data8 sys_sigaltstack
  4.1551 +	data8 sys_rt_sigaction
  4.1552 +	data8 sys_rt_sigpending
  4.1553 +	data8 sys_rt_sigprocmask
  4.1554 +	data8 sys_rt_sigqueueinfo		// 1180
  4.1555 +	data8 sys_rt_sigreturn
  4.1556 +	data8 sys_rt_sigsuspend
  4.1557 +	data8 sys_rt_sigtimedwait
  4.1558 +	data8 sys_getcwd
  4.1559 +	data8 sys_capget			// 1185
  4.1560 +	data8 sys_capset
  4.1561 +	data8 sys_sendfile64
  4.1562 +	data8 sys_ni_syscall		// sys_getpmsg (STREAMS)
  4.1563 +	data8 sys_ni_syscall		// sys_putpmsg (STREAMS)
  4.1564 +	data8 sys_socket			// 1190
  4.1565 +	data8 sys_bind
  4.1566 +	data8 sys_connect
  4.1567 +	data8 sys_listen
  4.1568 +	data8 sys_accept
  4.1569 +	data8 sys_getsockname			// 1195
  4.1570 +	data8 sys_getpeername
  4.1571 +	data8 sys_socketpair
  4.1572 +	data8 sys_send
  4.1573 +	data8 sys_sendto
  4.1574 +	data8 sys_recv				// 1200
  4.1575 +	data8 sys_recvfrom
  4.1576 +	data8 sys_shutdown
  4.1577 +	data8 sys_setsockopt
  4.1578 +	data8 sys_getsockopt
  4.1579 +	data8 sys_sendmsg			// 1205
  4.1580 +	data8 sys_recvmsg
  4.1581 +	data8 sys_pivot_root
  4.1582 +	data8 sys_mincore
  4.1583 +	data8 sys_madvise
  4.1584 +	data8 sys_newstat			// 1210
  4.1585 +	data8 sys_newlstat
  4.1586 +	data8 sys_newfstat
  4.1587 +	data8 sys_clone2
  4.1588 +	data8 sys_getdents64
  4.1589 +	data8 sys_getunwind			// 1215
  4.1590 +	data8 sys_readahead
  4.1591 +	data8 sys_setxattr
  4.1592 +	data8 sys_lsetxattr
  4.1593 +	data8 sys_fsetxattr
  4.1594 +	data8 sys_getxattr			// 1220
  4.1595 +	data8 sys_lgetxattr
  4.1596 +	data8 sys_fgetxattr
  4.1597 +	data8 sys_listxattr
  4.1598 +	data8 sys_llistxattr
  4.1599 +	data8 sys_flistxattr			// 1225
  4.1600 +	data8 sys_removexattr
  4.1601 +	data8 sys_lremovexattr
  4.1602 +	data8 sys_fremovexattr
  4.1603 +	data8 sys_tkill
  4.1604 +	data8 sys_futex				// 1230
  4.1605 +	data8 sys_sched_setaffinity
  4.1606 +	data8 sys_sched_getaffinity
  4.1607 +	data8 sys_set_tid_address
  4.1608 +	data8 sys_fadvise64_64
  4.1609 +	data8 sys_tgkill 			// 1235
  4.1610 +	data8 sys_exit_group
  4.1611 +	data8 sys_lookup_dcookie
  4.1612 +	data8 sys_io_setup
  4.1613 +	data8 sys_io_destroy
  4.1614 +	data8 sys_io_getevents			// 1240
  4.1615 +	data8 sys_io_submit
  4.1616 +	data8 sys_io_cancel
  4.1617 +	data8 sys_epoll_create
  4.1618 +	data8 sys_epoll_ctl
  4.1619 +	data8 sys_epoll_wait			// 1245
  4.1620 +	data8 sys_restart_syscall
  4.1621 +	data8 sys_semtimedop
  4.1622 +	data8 sys_timer_create
  4.1623 +	data8 sys_timer_settime
  4.1624 +	data8 sys_timer_gettime			// 1250
  4.1625 +	data8 sys_timer_getoverrun
  4.1626 +	data8 sys_timer_delete
  4.1627 +	data8 sys_clock_settime
  4.1628 +	data8 sys_clock_gettime
  4.1629 +	data8 sys_clock_getres			// 1255
  4.1630 +	data8 sys_clock_nanosleep
  4.1631 +	data8 sys_fstatfs64
  4.1632 +	data8 sys_statfs64
  4.1633 +	data8 sys_mbind
  4.1634 +	data8 sys_get_mempolicy			// 1260
  4.1635 +	data8 sys_set_mempolicy
  4.1636 +	data8 sys_mq_open
  4.1637 +	data8 sys_mq_unlink
  4.1638 +	data8 sys_mq_timedsend
  4.1639 +	data8 sys_mq_timedreceive		// 1265
  4.1640 +	data8 sys_mq_notify
  4.1641 +	data8 sys_mq_getsetattr
  4.1642 +	data8 sys_ni_syscall			// reserved for kexec_load
  4.1643 +	data8 sys_ni_syscall			// reserved for vserver
  4.1644 +	data8 sys_waitid			// 1270
  4.1645 +	data8 sys_add_key
  4.1646 +	data8 sys_request_key
  4.1647 +	data8 sys_keyctl
  4.1648 +	data8 sys_ni_syscall
  4.1649 +	data8 sys_ni_syscall			// 1275
  4.1650 +	data8 sys_ni_syscall
  4.1651 +	data8 sys_ni_syscall
  4.1652 +	data8 sys_ni_syscall
  4.1653 +	data8 sys_ni_syscall
  4.1654 +
  4.1655 +	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
  4.1656 +#endif
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/xen/arch/ia64/entry.h	Tue Aug 02 15:59:09 2005 -0800
     5.3 @@ -0,0 +1,97 @@
     5.4 +#include <linux/config.h>
     5.5 +
     5.6 +/*
     5.7 + * Preserved registers that are shared between code in ivt.S and
     5.8 + * entry.S.  Be careful not to step on these!
     5.9 + */
    5.10 +#define PRED_LEAVE_SYSCALL	1 /* TRUE iff leave from syscall */
    5.11 +#define PRED_KERNEL_STACK	2 /* returning to kernel-stacks? */
    5.12 +#define PRED_USER_STACK		3 /* returning to user-stacks? */
    5.13 +#ifdef CONFIG_VTI
    5.14 +#define PRED_EMUL		2 /* Need to save r4-r7 for inst emulation */
    5.15 +#define PRED_NON_EMUL		3 /* No need to save r4-r7 for normal path */
    5.16 +#define PRED_BN0		6 /* Guest is in bank 0 */
    5.17 +#define PRED_BN1		7 /* Guest is in bank 1 */
    5.18 +#endif // CONFIG_VTI
    5.19 +#define PRED_SYSCALL		4 /* inside a system call? */
    5.20 +#define PRED_NON_SYSCALL	5 /* complement of PRED_SYSCALL */
    5.21 +
    5.22 +#ifdef __ASSEMBLY__
    5.23 +# define PASTE2(x,y)	x##y
    5.24 +# define PASTE(x,y)	PASTE2(x,y)
    5.25 +
    5.26 +# define pLvSys		PASTE(p,PRED_LEAVE_SYSCALL)
    5.27 +# define pKStk		PASTE(p,PRED_KERNEL_STACK)
    5.28 +# define pUStk		PASTE(p,PRED_USER_STACK)
    5.29 +#ifdef CONFIG_VTI
    5.30 +# define pEml		PASTE(p,PRED_EMUL)
    5.31 +# define pNonEml	PASTE(p,PRED_NON_EMUL)
    5.32 +# define pBN0		PASTE(p,PRED_BN0)
    5.33 +# define pBN1		PASTE(p,PRED_BN1)
    5.34 +#endif // CONFIG_VTI
    5.35 +# define pSys		PASTE(p,PRED_SYSCALL)
    5.36 +# define pNonSys	PASTE(p,PRED_NON_SYSCALL)
    5.37 +#endif
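PASTE/PASTE2 is the usual two-level token paste, so the PRED_* number is macro-expanded before pasting; pKStk, for example, becomes the literal predicate name p2. A tiny standalone demonstration of the same expansion (compiles with any C compiler; not part of the header):

	#include <stdio.h>

	#define PASTE2(x,y)	x##y
	#define PASTE(x,y)	PASTE2(x,y)
	#define PRED_KERNEL_STACK	2

	#define STR2(x)	#x
	#define STR(x)	STR2(x)

	int main(void)
	{
		/* prints "p2", the same text pKStk expands to in entry.S */
		puts(STR(PASTE(p, PRED_KERNEL_STACK)));
		return 0;
	}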
    5.38 +
    5.39 +#define PT(f)		(IA64_PT_REGS_##f##_OFFSET)
    5.40 +#define SW(f)		(IA64_SWITCH_STACK_##f##_OFFSET)
    5.41 +#ifdef CONFIG_VTI
    5.42 +#define VPD(f)      (VPD_##f##_START_OFFSET)
    5.43 +#endif // CONFIG_VTI
    5.44 +
    5.45 +#define PT_REGS_SAVES(off)			\
    5.46 +	.unwabi 3, 'i';				\
    5.47 +	.fframe IA64_PT_REGS_SIZE+16+(off);	\
    5.48 +	.spillsp rp, PT(CR_IIP)+16+(off);	\
    5.49 +	.spillsp ar.pfs, PT(CR_IFS)+16+(off);	\
    5.50 +	.spillsp ar.unat, PT(AR_UNAT)+16+(off);	\
    5.51 +	.spillsp ar.fpsr, PT(AR_FPSR)+16+(off);	\
    5.52 +	.spillsp pr, PT(PR)+16+(off);
    5.53 +
    5.54 +#define PT_REGS_UNWIND_INFO(off)		\
    5.55 +	.prologue;				\
    5.56 +	PT_REGS_SAVES(off);			\
    5.57 +	.body
    5.58 +
    5.59 +#define SWITCH_STACK_SAVES(off)							\
    5.60 +	.savesp ar.unat,SW(CALLER_UNAT)+16+(off);				\
    5.61 +	.savesp ar.fpsr,SW(AR_FPSR)+16+(off);					\
    5.62 +	.spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off);		\
    5.63 +	.spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off);		\
    5.64 +	.spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off);		\
    5.65 +	.spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off);		\
    5.66 +	.spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off);		\
    5.67 +	.spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off);		\
    5.68 +	.spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off);		\
    5.69 +	.spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off);		\
    5.70 +	.spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off);		\
    5.71 +	.spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off);		\
    5.72 +	.spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off);		\
    5.73 +	.spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off);		\
    5.74 +	.spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off);		\
    5.75 +	.spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off);		\
    5.76 +	.spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off);		\
    5.77 +	.spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off);	\
    5.78 +	.spillsp @priunat,SW(AR_UNAT)+16+(off);					\
    5.79 +	.spillsp ar.rnat,SW(AR_RNAT)+16+(off);					\
    5.80 +	.spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off);				\
     5.81 +	.spillsp pr,SW(PR)+16+(off)
    5.82 +
    5.83 +#define DO_SAVE_SWITCH_STACK			\
    5.84 +	movl r28=1f;				\
    5.85 +	;;					\
    5.86 +	.fframe IA64_SWITCH_STACK_SIZE;		\
    5.87 +	adds sp=-IA64_SWITCH_STACK_SIZE,sp;	\
    5.88 +	mov.ret.sptk b7=r28,1f;			\
    5.89 +	SWITCH_STACK_SAVES(0);			\
    5.90 +	br.cond.sptk.many save_switch_stack;	\
    5.91 +1:
    5.92 +
    5.93 +#define DO_LOAD_SWITCH_STACK			\
    5.94 +	movl r28=1f;				\
    5.95 +	;;					\
    5.96 +	invala;					\
    5.97 +	mov.ret.sptk b7=r28,1f;			\
    5.98 +	br.cond.sptk.many load_switch_stack;	\
    5.99 +1:	.restore sp;				\
   5.100 +	adds sp=IA64_SWITCH_STACK_SIZE,sp
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/xen/arch/ia64/head.S	Tue Aug 02 15:59:09 2005 -0800
     6.3 @@ -0,0 +1,1026 @@
     6.4 +/*
     6.5 + * Here is where the ball gets rolling as far as the kernel is concerned.
      6.6 + * When control is transferred to _start, the boot loader has already
     6.7 + * loaded us to the correct address.  All that's left to do here is
     6.8 + * to set up the kernel's global pointer and jump to the kernel
     6.9 + * entry point.
    6.10 + *
    6.11 + * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
    6.12 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    6.13 + *	Stephane Eranian <eranian@hpl.hp.com>
    6.14 + * Copyright (C) 1999 VA Linux Systems
    6.15 + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
    6.16 + * Copyright (C) 1999 Intel Corp.
    6.17 + * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com>
    6.18 + * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com>
    6.19 + * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com>
    6.20 + *   -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2.
    6.21 + */
    6.22 +
    6.23 +#include <linux/config.h>
    6.24 +
    6.25 +#include <asm/asmmacro.h>
    6.26 +#include <asm/fpu.h>
    6.27 +#include <asm/kregs.h>
    6.28 +#include <asm/mmu_context.h>
    6.29 +#include <asm/offsets.h>
    6.30 +#include <asm/pal.h>
    6.31 +#include <asm/pgtable.h>
    6.32 +#include <asm/processor.h>
    6.33 +#include <asm/ptrace.h>
    6.34 +#include <asm/system.h>
    6.35 +
    6.36 +	.section __special_page_section,"ax"
    6.37 +
    6.38 +	.global empty_zero_page
    6.39 +empty_zero_page:
    6.40 +	.skip PAGE_SIZE
    6.41 +
    6.42 +	.global swapper_pg_dir
    6.43 +swapper_pg_dir:
    6.44 +	.skip PAGE_SIZE
    6.45 +
    6.46 +	.rodata
    6.47 +halt_msg:
    6.48 +	stringz "Halting kernel\n"
    6.49 +
    6.50 +	.text
    6.51 +
    6.52 +	.global start_ap
    6.53 +
    6.54 +	/*
    6.55 +	 * Start the kernel.  When the bootloader passes control to _start(), r28
    6.56 +	 * points to the address of the boot parameter area.  Execution reaches
    6.57 +	 * here in physical mode.
    6.58 +	 */
    6.59 +GLOBAL_ENTRY(_start)
    6.60 +start_ap:
    6.61 +	.prologue
    6.62 +	.save rp, r0		// terminate unwind chain with a NULL rp
    6.63 +	.body
    6.64 +
    6.65 +	rsm psr.i | psr.ic
    6.66 +	;;
    6.67 +	srlz.i
    6.68 +	;;
    6.69 +	/*
    6.70 +	 * Initialize kernel region registers:
    6.71 +	 *	rr[0]: VHPT enabled, page size = PAGE_SHIFT
    6.72 +	 *	rr[1]: VHPT enabled, page size = PAGE_SHIFT
    6.73 +	 *	rr[2]: VHPT enabled, page size = PAGE_SHIFT
    6.74 +	 *	rr[3]: VHPT enabled, page size = PAGE_SHIFT
    6.75 +	 *	rr[4]: VHPT enabled, page size = PAGE_SHIFT
    6.76 +	 *	rr[5]: VHPT enabled, page size = PAGE_SHIFT
    6.77 +	 *	rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT
    6.78 +	 *	rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT
    6.79 +	 * We initialize all of them to prevent inadvertently assuming
    6.80 +	 * something about the state of address translation early in boot.
    6.81 +	 */
    6.82 +	movl r6=((ia64_rid(IA64_REGION_ID_KERNEL, (0<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
    6.83 +	movl r7=(0<<61)
    6.84 +	movl r8=((ia64_rid(IA64_REGION_ID_KERNEL, (1<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
    6.85 +	movl r9=(1<<61)
    6.86 +	movl r10=((ia64_rid(IA64_REGION_ID_KERNEL, (2<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
    6.87 +	movl r11=(2<<61)
    6.88 +	movl r12=((ia64_rid(IA64_REGION_ID_KERNEL, (3<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
    6.89 +	movl r13=(3<<61)
    6.90 +	movl r14=((ia64_rid(IA64_REGION_ID_KERNEL, (4<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
    6.91 +	movl r15=(4<<61)
    6.92 +	movl r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
    6.93 +	movl r17=(5<<61)
    6.94 +	movl r18=((ia64_rid(IA64_REGION_ID_KERNEL, (6<<61)) << 8) | (IA64_GRANULE_SHIFT << 2))
    6.95 +	movl r19=(6<<61)
    6.96 +	movl r20=((ia64_rid(IA64_REGION_ID_KERNEL, (7<<61)) << 8) | (IA64_GRANULE_SHIFT << 2))
    6.97 +	movl r21=(7<<61)
    6.98 +	;;
    6.99 +	mov rr[r7]=r6
   6.100 +	mov rr[r9]=r8
   6.101 +	mov rr[r11]=r10
   6.102 +	mov rr[r13]=r12
   6.103 +	mov rr[r15]=r14
   6.104 +	mov rr[r17]=r16
   6.105 +	mov rr[r19]=r18
   6.106 +	mov rr[r21]=r20
   6.107 +	;;
   6.108 +	/*
   6.109 +	 * Now pin mappings into the TLB for kernel text and data
   6.110 +	 */
   6.111 +	mov r18=KERNEL_TR_PAGE_SHIFT<<2
   6.112 +	movl r17=KERNEL_START
   6.113 +	;;
   6.114 +	mov cr.itir=r18
   6.115 +	mov cr.ifa=r17
   6.116 +	mov r16=IA64_TR_KERNEL
   6.117 +	mov r3=ip
   6.118 +	movl r18=PAGE_KERNEL
   6.119 +	;;
   6.120 +	dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT
   6.121 +	;;
   6.122 +	or r18=r2,r18
   6.123 +	;;
   6.124 +	srlz.i
   6.125 +	;;
   6.126 +	itr.i itr[r16]=r18
   6.127 +	;;
   6.128 +	itr.d dtr[r16]=r18
   6.129 +	;;
   6.130 +	srlz.i
   6.131 +
   6.132 +	/*
   6.133 +	 * Switch into virtual mode:
   6.134 +	 */
   6.135 +#ifdef CONFIG_VTI
   6.136 +	movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH \
   6.137 +		  |IA64_PSR_DI)
   6.138 +#else // CONFIG_VTI
   6.139 +	movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \
   6.140 +		  |IA64_PSR_DI)
   6.141 +#endif // CONFIG_VTI
   6.142 +	;;
   6.143 +	mov cr.ipsr=r16
   6.144 +	movl r17=1f
   6.145 +	;;
   6.146 +	mov cr.iip=r17
   6.147 +	mov cr.ifs=r0
   6.148 +	;;
   6.149 +	rfi
   6.150 +	;;
   6.151 +1:	// now we are in virtual mode
   6.152 +
   6.153 +	// set IVT entry point---can't access I/O ports without it
   6.154 +#ifdef CONFIG_VTI
   6.155 +    movl r3=vmx_ia64_ivt
   6.156 +#else // CONFIG_VTI
   6.157 +	movl r3=ia64_ivt
   6.158 +#endif // CONFIG_VTI
   6.159 +	;;
   6.160 +	mov cr.iva=r3
   6.161 +	movl r2=FPSR_DEFAULT
   6.162 +	;;
   6.163 +	srlz.i
   6.164 +	movl gp=__gp
   6.165 +
   6.166 +	mov ar.fpsr=r2
   6.167 +	;;
   6.168 +
   6.169 +#define isAP	p2	// are we an Application Processor?
   6.170 +#define isBP	p3	// are we the Bootstrap Processor?
   6.171 +
   6.172 +#ifdef CONFIG_SMP
   6.173 +	/*
   6.174 +	 * Find the init_task for the currently booting CPU.  At poweron, and in
   6.175 +	 * UP mode, task_for_booting_cpu is NULL.
   6.176 +	 */
   6.177 +	movl r3=task_for_booting_cpu
   6.178 + 	;;
   6.179 +	ld8 r3=[r3]
   6.180 +	movl r2=init_task
   6.181 +	;;
   6.182 +	cmp.eq isBP,isAP=r3,r0
   6.183 +	;;
   6.184 +(isAP)	mov r2=r3
   6.185 +#else
   6.186 +	movl r2=init_task
   6.187 +	cmp.eq isBP,isAP=r0,r0
   6.188 +#endif
   6.189 +	;;
   6.190 +	tpa r3=r2		// r3 == phys addr of task struct
   6.191 +	mov r16=-1
   6.192 +(isBP)	br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it
   6.193 +
   6.194 +	// load mapping for stack (virtaddr in r2, physaddr in r3)
   6.195 +	rsm psr.ic
   6.196 +	movl r17=PAGE_KERNEL
   6.197 +	;;
   6.198 +	srlz.d
   6.199 +	dep r18=0,r3,0,12
   6.200 +	;;
   6.201 +	or r18=r17,r18
   6.202 +#ifdef XEN
   6.203 +	dep r2=-1,r3,60,4	// IMVA of task
   6.204 +#else
   6.205 +	dep r2=-1,r3,61,3	// IMVA of task
   6.206 +#endif
   6.207 +	;;
   6.208 +	mov r17=rr[r2]
   6.209 +	shr.u r16=r3,IA64_GRANULE_SHIFT
   6.210 +	;;
   6.211 +	dep r17=0,r17,8,24
   6.212 +	;;
   6.213 +	mov cr.itir=r17
   6.214 +	mov cr.ifa=r2
   6.215 +
   6.216 +	mov r19=IA64_TR_CURRENT_STACK
   6.217 +	;;
   6.218 +	itr.d dtr[r19]=r18
   6.219 +	;;
   6.220 +	ssm psr.ic
   6.221 +	srlz.d
   6.222 +  	;;
   6.223 +
   6.224 +.load_current:
   6.225 +	// load the "current" pointer (r13) and ar.k6 with the current task
   6.226 +#ifdef CONFIG_VTI
   6.227 +	mov r21=r2		// virtual address
   6.228 +	;;
   6.229 +	bsw.1
   6.230 +	;;
   6.231 +#else // CONFIG_VTI
   6.232 +	mov IA64_KR(CURRENT)=r2		// virtual address
   6.233 +	mov IA64_KR(CURRENT_STACK)=r16
   6.234 +#endif // CONFIG_VTI
   6.235 +	mov r13=r2
   6.236 +	/*
   6.237 +	 * Reserve space at the top of the stack for "struct pt_regs".  Kernel threads
   6.238 +	 * don't store interesting values in that structure, but the space still needs
    6.239 +	 * to be there because time-critical code such as context switching can
   6.240 +	 * be implemented more efficiently (for example, __switch_to()
   6.241 +	 * always sets the psr.dfh bit of the task it is switching to).
   6.242 +	 */
   6.243 +	addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2
   6.244 +	addl r2=IA64_RBS_OFFSET,r2	// initialize the RSE
   6.245 +	mov ar.rsc=0		// place RSE in enforced lazy mode
   6.246 +	;;
   6.247 +	loadrs			// clear the dirty partition
   6.248 +	;;
   6.249 +	mov ar.bspstore=r2	// establish the new RSE stack
   6.250 +	;;
   6.251 +	mov ar.rsc=0x3		// place RSE in eager mode
   6.252 +
   6.253 +#ifdef XEN
   6.254 +(isBP)	dep r28=-1,r28,60,4	// make address virtual
   6.255 +#else
   6.256 +(isBP)	dep r28=-1,r28,61,3	// make address virtual
   6.257 +#endif
   6.258 +(isBP)	movl r2=ia64_boot_param
   6.259 +	;;
   6.260 +(isBP)	st8 [r2]=r28		// save the address of the boot param area passed by the bootloader
   6.261 +
   6.262 +#ifdef CONFIG_SMP
   6.263 +(isAP)	br.call.sptk.many rp=start_secondary
   6.264 +.ret0:
   6.265 +(isAP)	br.cond.sptk self
   6.266 +#endif
   6.267 +
   6.268 +	// This is executed by the bootstrap processor (bsp) only:
   6.269 +
   6.270 +#ifdef CONFIG_IA64_FW_EMU
   6.271 +	// initialize PAL & SAL emulator:
   6.272 +	br.call.sptk.many rp=sys_fw_init
   6.273 +.ret1:
   6.274 +#endif
   6.275 +	br.call.sptk.many rp=start_kernel
   6.276 +.ret2:	addl r3=@ltoff(halt_msg),gp
   6.277 +	;;
   6.278 +	alloc r2=ar.pfs,8,0,2,0
   6.279 +	;;
   6.280 +	ld8 out0=[r3]
   6.281 +	br.call.sptk.many b0=console_print
   6.282 +
   6.283 +self:	hint @pause
   6.284 +	;;
   6.285 +	br.sptk.many self		// endless loop
   6.286 +	;;
   6.287 +END(_start)
   6.288 +
   6.289 +GLOBAL_ENTRY(ia64_save_debug_regs)
   6.290 +	alloc r16=ar.pfs,1,0,0,0
   6.291 +	mov r20=ar.lc			// preserve ar.lc
   6.292 +	mov ar.lc=IA64_NUM_DBG_REGS-1
   6.293 +	mov r18=0
   6.294 +	add r19=IA64_NUM_DBG_REGS*8,in0
   6.295 +	;;
   6.296 +1:	mov r16=dbr[r18]
   6.297 +#ifdef CONFIG_ITANIUM
   6.298 +	;;
   6.299 +	srlz.d
   6.300 +#endif
   6.301 +	mov r17=ibr[r18]
   6.302 +	add r18=1,r18
   6.303 +	;;
   6.304 +	st8.nta [in0]=r16,8
   6.305 +	st8.nta [r19]=r17,8
   6.306 +	br.cloop.sptk.many 1b
   6.307 +	;;
   6.308 +	mov ar.lc=r20			// restore ar.lc
   6.309 +	br.ret.sptk.many rp
   6.310 +END(ia64_save_debug_regs)
   6.311 +
   6.312 +GLOBAL_ENTRY(ia64_load_debug_regs)
   6.313 +	alloc r16=ar.pfs,1,0,0,0
   6.314 +	lfetch.nta [in0]
   6.315 +	mov r20=ar.lc			// preserve ar.lc
   6.316 +	add r19=IA64_NUM_DBG_REGS*8,in0
   6.317 +	mov ar.lc=IA64_NUM_DBG_REGS-1
   6.318 +	mov r18=-1
   6.319 +	;;
   6.320 +1:	ld8.nta r16=[in0],8
   6.321 +	ld8.nta r17=[r19],8
   6.322 +	add r18=1,r18
   6.323 +	;;
   6.324 +	mov dbr[r18]=r16
   6.325 +#ifdef CONFIG_ITANIUM
   6.326 +	;;
   6.327 +	srlz.d				// Errata 132 (NoFix status)
   6.328 +#endif
   6.329 +	mov ibr[r18]=r17
   6.330 +	br.cloop.sptk.many 1b
   6.331 +	;;
   6.332 +	mov ar.lc=r20			// restore ar.lc
   6.333 +	br.ret.sptk.many rp
   6.334 +END(ia64_load_debug_regs)
   6.335 +
   6.336 +GLOBAL_ENTRY(__ia64_save_fpu)
   6.337 +	alloc r2=ar.pfs,1,4,0,0
   6.338 +	adds loc0=96*16-16,in0
   6.339 +	adds loc1=96*16-16-128,in0
   6.340 +	;;
   6.341 +	stf.spill.nta [loc0]=f127,-256
   6.342 +	stf.spill.nta [loc1]=f119,-256
   6.343 +	;;
   6.344 +	stf.spill.nta [loc0]=f111,-256
   6.345 +	stf.spill.nta [loc1]=f103,-256
   6.346 +	;;
   6.347 +	stf.spill.nta [loc0]=f95,-256
   6.348 +	stf.spill.nta [loc1]=f87,-256
   6.349 +	;;
   6.350 +	stf.spill.nta [loc0]=f79,-256
   6.351 +	stf.spill.nta [loc1]=f71,-256
   6.352 +	;;
   6.353 +	stf.spill.nta [loc0]=f63,-256
   6.354 +	stf.spill.nta [loc1]=f55,-256
   6.355 +	adds loc2=96*16-32,in0
   6.356 +	;;
   6.357 +	stf.spill.nta [loc0]=f47,-256
   6.358 +	stf.spill.nta [loc1]=f39,-256
   6.359 +	adds loc3=96*16-32-128,in0
   6.360 +	;;
   6.361 +	stf.spill.nta [loc2]=f126,-256
   6.362 +	stf.spill.nta [loc3]=f118,-256
   6.363 +	;;
   6.364 +	stf.spill.nta [loc2]=f110,-256
   6.365 +	stf.spill.nta [loc3]=f102,-256
   6.366 +	;;
   6.367 +	stf.spill.nta [loc2]=f94,-256
   6.368 +	stf.spill.nta [loc3]=f86,-256
   6.369 +	;;
   6.370 +	stf.spill.nta [loc2]=f78,-256
   6.371 +	stf.spill.nta [loc3]=f70,-256
   6.372 +	;;
   6.373 +	stf.spill.nta [loc2]=f62,-256
   6.374 +	stf.spill.nta [loc3]=f54,-256
   6.375 +	adds loc0=96*16-48,in0
   6.376 +	;;
   6.377 +	stf.spill.nta [loc2]=f46,-256
   6.378 +	stf.spill.nta [loc3]=f38,-256
   6.379 +	adds loc1=96*16-48-128,in0
   6.380 +	;;
   6.381 +	stf.spill.nta [loc0]=f125,-256
   6.382 +	stf.spill.nta [loc1]=f117,-256
   6.383 +	;;
   6.384 +	stf.spill.nta [loc0]=f109,-256
   6.385 +	stf.spill.nta [loc1]=f101,-256
   6.386 +	;;
   6.387 +	stf.spill.nta [loc0]=f93,-256
   6.388 +	stf.spill.nta [loc1]=f85,-256
   6.389 +	;;
   6.390 +	stf.spill.nta [loc0]=f77,-256
   6.391 +	stf.spill.nta [loc1]=f69,-256
   6.392 +	;;
   6.393 +	stf.spill.nta [loc0]=f61,-256
   6.394 +	stf.spill.nta [loc1]=f53,-256
   6.395 +	adds loc2=96*16-64,in0
   6.396 +	;;
   6.397 +	stf.spill.nta [loc0]=f45,-256
   6.398 +	stf.spill.nta [loc1]=f37,-256
   6.399 +	adds loc3=96*16-64-128,in0
   6.400 +	;;
   6.401 +	stf.spill.nta [loc2]=f124,-256
   6.402 +	stf.spill.nta [loc3]=f116,-256
   6.403 +	;;
   6.404 +	stf.spill.nta [loc2]=f108,-256
   6.405 +	stf.spill.nta [loc3]=f100,-256
   6.406 +	;;
   6.407 +	stf.spill.nta [loc2]=f92,-256
   6.408 +	stf.spill.nta [loc3]=f84,-256
   6.409 +	;;
   6.410 +	stf.spill.nta [loc2]=f76,-256
   6.411 +	stf.spill.nta [loc3]=f68,-256
   6.412 +	;;
   6.413 +	stf.spill.nta [loc2]=f60,-256
   6.414 +	stf.spill.nta [loc3]=f52,-256
   6.415 +	adds loc0=96*16-80,in0
   6.416 +	;;
   6.417 +	stf.spill.nta [loc2]=f44,-256
   6.418 +	stf.spill.nta [loc3]=f36,-256
   6.419 +	adds loc1=96*16-80-128,in0
   6.420 +	;;
   6.421 +	stf.spill.nta [loc0]=f123,-256
   6.422 +	stf.spill.nta [loc1]=f115,-256
   6.423 +	;;
   6.424 +	stf.spill.nta [loc0]=f107,-256
   6.425 +	stf.spill.nta [loc1]=f99,-256
   6.426 +	;;
   6.427 +	stf.spill.nta [loc0]=f91,-256
   6.428 +	stf.spill.nta [loc1]=f83,-256
   6.429 +	;;
   6.430 +	stf.spill.nta [loc0]=f75,-256
   6.431 +	stf.spill.nta [loc1]=f67,-256
   6.432 +	;;
   6.433 +	stf.spill.nta [loc0]=f59,-256
   6.434 +	stf.spill.nta [loc1]=f51,-256
   6.435 +	adds loc2=96*16-96,in0
   6.436 +	;;
   6.437 +	stf.spill.nta [loc0]=f43,-256
   6.438 +	stf.spill.nta [loc1]=f35,-256
   6.439 +	adds loc3=96*16-96-128,in0
   6.440 +	;;
   6.441 +	stf.spill.nta [loc2]=f122,-256
   6.442 +	stf.spill.nta [loc3]=f114,-256
   6.443 +	;;
   6.444 +	stf.spill.nta [loc2]=f106,-256
   6.445 +	stf.spill.nta [loc3]=f98,-256
   6.446 +	;;
   6.447 +	stf.spill.nta [loc2]=f90,-256
   6.448 +	stf.spill.nta [loc3]=f82,-256
   6.449 +	;;
   6.450 +	stf.spill.nta [loc2]=f74,-256
   6.451 +	stf.spill.nta [loc3]=f66,-256
   6.452 +	;;
   6.453 +	stf.spill.nta [loc2]=f58,-256
   6.454 +	stf.spill.nta [loc3]=f50,-256
   6.455 +	adds loc0=96*16-112,in0
   6.456 +	;;
   6.457 +	stf.spill.nta [loc2]=f42,-256
   6.458 +	stf.spill.nta [loc3]=f34,-256
   6.459 +	adds loc1=96*16-112-128,in0
   6.460 +	;;
   6.461 +	stf.spill.nta [loc0]=f121,-256
   6.462 +	stf.spill.nta [loc1]=f113,-256
   6.463 +	;;
   6.464 +	stf.spill.nta [loc0]=f105,-256
   6.465 +	stf.spill.nta [loc1]=f97,-256
   6.466 +	;;
   6.467 +	stf.spill.nta [loc0]=f89,-256
   6.468 +	stf.spill.nta [loc1]=f81,-256
   6.469 +	;;
   6.470 +	stf.spill.nta [loc0]=f73,-256
   6.471 +	stf.spill.nta [loc1]=f65,-256
   6.472 +	;;
   6.473 +	stf.spill.nta [loc0]=f57,-256
   6.474 +	stf.spill.nta [loc1]=f49,-256
   6.475 +	adds loc2=96*16-128,in0
   6.476 +	;;
   6.477 +	stf.spill.nta [loc0]=f41,-256
   6.478 +	stf.spill.nta [loc1]=f33,-256
   6.479 +	adds loc3=96*16-128-128,in0
   6.480 +	;;
   6.481 +	stf.spill.nta [loc2]=f120,-256
   6.482 +	stf.spill.nta [loc3]=f112,-256
   6.483 +	;;
   6.484 +	stf.spill.nta [loc2]=f104,-256
   6.485 +	stf.spill.nta [loc3]=f96,-256
   6.486 +	;;
   6.487 +	stf.spill.nta [loc2]=f88,-256
   6.488 +	stf.spill.nta [loc3]=f80,-256
   6.489 +	;;
   6.490 +	stf.spill.nta [loc2]=f72,-256
   6.491 +	stf.spill.nta [loc3]=f64,-256
   6.492 +	;;
   6.493 +	stf.spill.nta [loc2]=f56,-256
   6.494 +	stf.spill.nta [loc3]=f48,-256
   6.495 +	;;
   6.496 +	stf.spill.nta [loc2]=f40
   6.497 +	stf.spill.nta [loc3]=f32
   6.498 +	br.ret.sptk.many rp
   6.499 +END(__ia64_save_fpu)
   6.500 +
   6.501 +GLOBAL_ENTRY(__ia64_load_fpu)
   6.502 +	alloc r2=ar.pfs,1,2,0,0
   6.503 +	adds r3=128,in0
   6.504 +	adds r14=256,in0
   6.505 +	adds r15=384,in0
   6.506 +	mov loc0=512
   6.507 +	mov loc1=-1024+16
   6.508 +	;;
   6.509 +	ldf.fill.nta f32=[in0],loc0
   6.510 +	ldf.fill.nta f40=[ r3],loc0
   6.511 +	ldf.fill.nta f48=[r14],loc0
   6.512 +	ldf.fill.nta f56=[r15],loc0
   6.513 +	;;
   6.514 +	ldf.fill.nta f64=[in0],loc0
   6.515 +	ldf.fill.nta f72=[ r3],loc0
   6.516 +	ldf.fill.nta f80=[r14],loc0
   6.517 +	ldf.fill.nta f88=[r15],loc0
   6.518 +	;;
   6.519 +	ldf.fill.nta f96=[in0],loc1
   6.520 +	ldf.fill.nta f104=[ r3],loc1
   6.521 +	ldf.fill.nta f112=[r14],loc1
   6.522 +	ldf.fill.nta f120=[r15],loc1
   6.523 +	;;
   6.524 +	ldf.fill.nta f33=[in0],loc0
   6.525 +	ldf.fill.nta f41=[ r3],loc0
   6.526 +	ldf.fill.nta f49=[r14],loc0
   6.527 +	ldf.fill.nta f57=[r15],loc0
   6.528 +	;;
   6.529 +	ldf.fill.nta f65=[in0],loc0
   6.530 +	ldf.fill.nta f73=[ r3],loc0
   6.531 +	ldf.fill.nta f81=[r14],loc0
   6.532 +	ldf.fill.nta f89=[r15],loc0
   6.533 +	;;
   6.534 +	ldf.fill.nta f97=[in0],loc1
   6.535 +	ldf.fill.nta f105=[ r3],loc1
   6.536 +	ldf.fill.nta f113=[r14],loc1
   6.537 +	ldf.fill.nta f121=[r15],loc1
   6.538 +	;;
   6.539 +	ldf.fill.nta f34=[in0],loc0
   6.540 +	ldf.fill.nta f42=[ r3],loc0
   6.541 +	ldf.fill.nta f50=[r14],loc0
   6.542 +	ldf.fill.nta f58=[r15],loc0
   6.543 +	;;
   6.544 +	ldf.fill.nta f66=[in0],loc0
   6.545 +	ldf.fill.nta f74=[ r3],loc0
   6.546 +	ldf.fill.nta f82=[r14],loc0
   6.547 +	ldf.fill.nta f90=[r15],loc0
   6.548 +	;;
   6.549 +	ldf.fill.nta f98=[in0],loc1
   6.550 +	ldf.fill.nta f106=[ r3],loc1
   6.551 +	ldf.fill.nta f114=[r14],loc1
   6.552 +	ldf.fill.nta f122=[r15],loc1
   6.553 +	;;
   6.554 +	ldf.fill.nta f35=[in0],loc0
   6.555 +	ldf.fill.nta f43=[ r3],loc0
   6.556 +	ldf.fill.nta f51=[r14],loc0
   6.557 +	ldf.fill.nta f59=[r15],loc0
   6.558 +	;;
   6.559 +	ldf.fill.nta f67=[in0],loc0
   6.560 +	ldf.fill.nta f75=[ r3],loc0
   6.561 +	ldf.fill.nta f83=[r14],loc0
   6.562 +	ldf.fill.nta f91=[r15],loc0
   6.563 +	;;
   6.564 +	ldf.fill.nta f99=[in0],loc1
   6.565 +	ldf.fill.nta f107=[ r3],loc1
   6.566 +	ldf.fill.nta f115=[r14],loc1
   6.567 +	ldf.fill.nta f123=[r15],loc1
   6.568 +	;;
   6.569 +	ldf.fill.nta f36=[in0],loc0
   6.570 +	ldf.fill.nta f44=[ r3],loc0
   6.571 +	ldf.fill.nta f52=[r14],loc0
   6.572 +	ldf.fill.nta f60=[r15],loc0
   6.573 +	;;
   6.574 +	ldf.fill.nta f68=[in0],loc0
   6.575 +	ldf.fill.nta f76=[ r3],loc0
   6.576 +	ldf.fill.nta f84=[r14],loc0
   6.577 +	ldf.fill.nta f92=[r15],loc0
   6.578 +	;;
   6.579 +	ldf.fill.nta f100=[in0],loc1
   6.580 +	ldf.fill.nta f108=[ r3],loc1
   6.581 +	ldf.fill.nta f116=[r14],loc1
   6.582 +	ldf.fill.nta f124=[r15],loc1
   6.583 +	;;
   6.584 +	ldf.fill.nta f37=[in0],loc0
   6.585 +	ldf.fill.nta f45=[ r3],loc0
   6.586 +	ldf.fill.nta f53=[r14],loc0
   6.587 +	ldf.fill.nta f61=[r15],loc0
   6.588 +	;;
   6.589 +	ldf.fill.nta f69=[in0],loc0
   6.590 +	ldf.fill.nta f77=[ r3],loc0
   6.591 +	ldf.fill.nta f85=[r14],loc0
   6.592 +	ldf.fill.nta f93=[r15],loc0
   6.593 +	;;
   6.594 +	ldf.fill.nta f101=[in0],loc1
   6.595 +	ldf.fill.nta f109=[ r3],loc1
   6.596 +	ldf.fill.nta f117=[r14],loc1
   6.597 +	ldf.fill.nta f125=[r15],loc1
   6.598 +	;;
   6.599 +	ldf.fill.nta f38 =[in0],loc0
   6.600 +	ldf.fill.nta f46 =[ r3],loc0
   6.601 +	ldf.fill.nta f54 =[r14],loc0
   6.602 +	ldf.fill.nta f62 =[r15],loc0
   6.603 +	;;
   6.604 +	ldf.fill.nta f70 =[in0],loc0
   6.605 +	ldf.fill.nta f78 =[ r3],loc0
   6.606 +	ldf.fill.nta f86 =[r14],loc0
   6.607 +	ldf.fill.nta f94 =[r15],loc0
   6.608 +	;;
   6.609 +	ldf.fill.nta f102=[in0],loc1
   6.610 +	ldf.fill.nta f110=[ r3],loc1
   6.611 +	ldf.fill.nta f118=[r14],loc1
   6.612 +	ldf.fill.nta f126=[r15],loc1
   6.613 +	;;
   6.614 +	ldf.fill.nta f39 =[in0],loc0
   6.615 +	ldf.fill.nta f47 =[ r3],loc0
   6.616 +	ldf.fill.nta f55 =[r14],loc0
   6.617 +	ldf.fill.nta f63 =[r15],loc0
   6.618 +	;;
   6.619 +	ldf.fill.nta f71 =[in0],loc0
   6.620 +	ldf.fill.nta f79 =[ r3],loc0
   6.621 +	ldf.fill.nta f87 =[r14],loc0
   6.622 +	ldf.fill.nta f95 =[r15],loc0
   6.623 +	;;
   6.624 +	ldf.fill.nta f103=[in0]
   6.625 +	ldf.fill.nta f111=[ r3]
   6.626 +	ldf.fill.nta f119=[r14]
   6.627 +	ldf.fill.nta f127=[r15]
   6.628 +	br.ret.sptk.many rp
   6.629 +END(__ia64_load_fpu)
   6.630 +
   6.631 +GLOBAL_ENTRY(__ia64_init_fpu)
   6.632 +	stf.spill [sp]=f0		// M3
   6.633 +	mov	 f32=f0			// F
   6.634 +	nop.b	 0
   6.635 +
   6.636 +	ldfps	 f33,f34=[sp]		// M0
   6.637 +	ldfps	 f35,f36=[sp]		// M1
   6.638 +	mov      f37=f0			// F
   6.639 +	;;
   6.640 +
   6.641 +	setf.s	 f38=r0			// M2
   6.642 +	setf.s	 f39=r0			// M3
   6.643 +	mov      f40=f0			// F
   6.644 +
   6.645 +	ldfps	 f41,f42=[sp]		// M0
   6.646 +	ldfps	 f43,f44=[sp]		// M1
   6.647 +	mov      f45=f0			// F
   6.648 +
   6.649 +	setf.s	 f46=r0			// M2
   6.650 +	setf.s	 f47=r0			// M3
   6.651 +	mov      f48=f0			// F
   6.652 +
   6.653 +	ldfps	 f49,f50=[sp]		// M0
   6.654 +	ldfps	 f51,f52=[sp]		// M1
   6.655 +	mov      f53=f0			// F
   6.656 +
   6.657 +	setf.s	 f54=r0			// M2
   6.658 +	setf.s	 f55=r0			// M3
   6.659 +	mov      f56=f0			// F
   6.660 +
   6.661 +	ldfps	 f57,f58=[sp]		// M0
   6.662 +	ldfps	 f59,f60=[sp]		// M1
   6.663 +	mov      f61=f0			// F
   6.664 +
   6.665 +	setf.s	 f62=r0			// M2
   6.666 +	setf.s	 f63=r0			// M3
   6.667 +	mov      f64=f0			// F
   6.668 +
   6.669 +	ldfps	 f65,f66=[sp]		// M0
   6.670 +	ldfps	 f67,f68=[sp]		// M1
   6.671 +	mov      f69=f0			// F
   6.672 +
   6.673 +	setf.s	 f70=r0			// M2
   6.674 +	setf.s	 f71=r0			// M3
   6.675 +	mov      f72=f0			// F
   6.676 +
   6.677 +	ldfps	 f73,f74=[sp]		// M0
   6.678 +	ldfps	 f75,f76=[sp]		// M1
   6.679 +	mov      f77=f0			// F
   6.680 +
   6.681 +	setf.s	 f78=r0			// M2
   6.682 +	setf.s	 f79=r0			// M3
   6.683 +	mov      f80=f0			// F
   6.684 +
   6.685 +	ldfps	 f81,f82=[sp]		// M0
   6.686 +	ldfps	 f83,f84=[sp]		// M1
   6.687 +	mov      f85=f0			// F
   6.688 +
   6.689 +	setf.s	 f86=r0			// M2
   6.690 +	setf.s	 f87=r0			// M3
   6.691 +	mov      f88=f0			// F
   6.692 +
   6.693 +	/*
   6.694 +	 * When the instructions are cached, it would be faster to initialize
    6.695 +	 * the remaining registers with plain mov instructions (F-unit).
   6.696 +	 * This gets the time down to ~29 cycles.  However, this would use up
   6.697 +	 * 33 bundles, whereas continuing with the above pattern yields
   6.698 +	 * 10 bundles and ~30 cycles.
   6.699 +	 */
   6.700 +
   6.701 +	ldfps	 f89,f90=[sp]		// M0
   6.702 +	ldfps	 f91,f92=[sp]		// M1
   6.703 +	mov      f93=f0			// F
   6.704 +
   6.705 +	setf.s	 f94=r0			// M2
   6.706 +	setf.s	 f95=r0			// M3
   6.707 +	mov      f96=f0			// F
   6.708 +
   6.709 +	ldfps	 f97,f98=[sp]		// M0
   6.710 +	ldfps	 f99,f100=[sp]		// M1
   6.711 +	mov      f101=f0		// F
   6.712 +
   6.713 +	setf.s	 f102=r0		// M2
   6.714 +	setf.s	 f103=r0		// M3
   6.715 +	mov      f104=f0		// F
   6.716 +
   6.717 +	ldfps	 f105,f106=[sp]		// M0
   6.718 +	ldfps	 f107,f108=[sp]		// M1
   6.719 +	mov      f109=f0		// F
   6.720 +
   6.721 +	setf.s	 f110=r0		// M2
   6.722 +	setf.s	 f111=r0		// M3
   6.723 +	mov      f112=f0		// F
   6.724 +
   6.725 +	ldfps	 f113,f114=[sp]		// M0
   6.726 +	ldfps	 f115,f116=[sp]		// M1
   6.727 +	mov      f117=f0		// F
   6.728 +
   6.729 +	setf.s	 f118=r0		// M2
   6.730 +	setf.s	 f119=r0		// M3
   6.731 +	mov      f120=f0		// F
   6.732 +
   6.733 +	ldfps	 f121,f122=[sp]		// M0
   6.734 +	ldfps	 f123,f124=[sp]		// M1
   6.735 +	mov      f125=f0		// F
   6.736 +
   6.737 +	setf.s	 f126=r0		// M2
   6.738 +	setf.s	 f127=r0		// M3
   6.739 +	br.ret.sptk.many rp		// F
   6.740 +END(__ia64_init_fpu)
   6.741 +
   6.742 +/*
   6.743 + * Switch execution mode from virtual to physical
   6.744 + *
   6.745 + * Inputs:
   6.746 + *	r16 = new psr to establish
   6.747 + * Output:
   6.748 + *	r19 = old virtual address of ar.bsp
   6.749 + *	r20 = old virtual address of sp
   6.750 + *
   6.751 + * Note: RSE must already be in enforced lazy mode
   6.752 + */
   6.753 +GLOBAL_ENTRY(ia64_switch_mode_phys)
   6.754 + {
   6.755 +	alloc r2=ar.pfs,0,0,0,0
   6.756 +	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
   6.757 +	mov r15=ip
   6.758 + }
   6.759 +	;;
   6.760 + {
   6.761 +	flushrs				// must be first insn in group
   6.762 +	srlz.i
   6.763 + }
   6.764 +	;;
   6.765 +	mov cr.ipsr=r16			// set new PSR
   6.766 +	add r3=1f-ia64_switch_mode_phys,r15
   6.767 +
   6.768 +	mov r19=ar.bsp
   6.769 +	mov r20=sp
   6.770 +	mov r14=rp			// get return address into a general register
   6.771 +	;;
   6.772 +
   6.773 +	// going to physical mode, use tpa to translate virt->phys
   6.774 +	tpa r17=r19
   6.775 +	tpa r3=r3
   6.776 +	tpa sp=sp
   6.777 +	tpa r14=r14
   6.778 +	;;
   6.779 +
   6.780 +	mov r18=ar.rnat			// save ar.rnat
   6.781 +	mov ar.bspstore=r17		// this steps on ar.rnat
   6.782 +	mov cr.iip=r3
   6.783 +	mov cr.ifs=r0
   6.784 +	;;
   6.785 +	mov ar.rnat=r18			// restore ar.rnat
   6.786 +	rfi				// must be last insn in group
   6.787 +	;;
   6.788 +1:	mov rp=r14
   6.789 +	br.ret.sptk.many rp
   6.790 +END(ia64_switch_mode_phys)
   6.791 +
   6.792 +/*
   6.793 + * Switch execution mode from physical to virtual
   6.794 + *
   6.795 + * Inputs:
   6.796 + *	r16 = new psr to establish
   6.797 + *	r19 = new bspstore to establish
   6.798 + *	r20 = new sp to establish
   6.799 + *
   6.800 + * Note: RSE must already be in enforced lazy mode
   6.801 + */
   6.802 +GLOBAL_ENTRY(ia64_switch_mode_virt)
   6.803 + {
   6.804 +	alloc r2=ar.pfs,0,0,0,0
   6.805 +	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
   6.806 +	mov r15=ip
   6.807 + }
   6.808 +	;;
   6.809 + {
   6.810 +	flushrs				// must be first insn in group
   6.811 +	srlz.i
   6.812 + }
   6.813 +	;;
   6.814 +	mov cr.ipsr=r16			// set new PSR
   6.815 +	add r3=1f-ia64_switch_mode_virt,r15
   6.816 +
   6.817 +	mov r14=rp			// get return address into a general register
   6.818 +	;;
   6.819 +
   6.820 +	// going to virtual
   6.821 +	//   - for code addresses, set upper bits of addr to KERNEL_START
   6.822 +	//   - for stack addresses, copy from input argument
   6.823 +	movl r18=KERNEL_START
   6.824 +	dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
   6.825 +	dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
   6.826 +	mov sp=r20
   6.827 +	;;
   6.828 +	or r3=r3,r18
   6.829 +	or r14=r14,r18
   6.830 +	;;
   6.831 +
   6.832 +	mov r18=ar.rnat			// save ar.rnat
   6.833 +	mov ar.bspstore=r19		// this steps on ar.rnat
   6.834 +	mov cr.iip=r3
   6.835 +	mov cr.ifs=r0
   6.836 +	;;
   6.837 +	mov ar.rnat=r18			// restore ar.rnat
   6.838 +	rfi				// must be last insn in group
   6.839 +	;;
   6.840 +1:	mov rp=r14
   6.841 +	br.ret.sptk.many rp
   6.842 +END(ia64_switch_mode_virt)
   6.843 +
   6.844 +GLOBAL_ENTRY(ia64_delay_loop)
   6.845 +	.prologue
   6.846 +{	nop 0			// work around GAS unwind info generation bug...
   6.847 +	.save ar.lc,r2
   6.848 +	mov r2=ar.lc
   6.849 +	.body
   6.850 +	;;
   6.851 +	mov ar.lc=r32
   6.852 +}
   6.853 +	;;
   6.854 +	// force loop to be 32-byte aligned (GAS bug means we cannot use .align
   6.855 +	// inside function body without corrupting unwind info).
   6.856 +{	nop 0 }
   6.857 +1:	br.cloop.sptk.few 1b
   6.858 +	;;
   6.859 +	mov ar.lc=r2
   6.860 +	br.ret.sptk.many rp
   6.861 +END(ia64_delay_loop)
   6.862 +
   6.863 +/*
    6.864 + * Return a CPU-local timestamp in nanoseconds.  This timestamp is
    6.865 + * NOT synchronized across CPUs, so its return value must never be
    6.866 + * compared against the value returned on another CPU.  The usage in
   6.867 + * kernel/sched.c ensures that.
   6.868 + *
   6.869 + * The return-value of sched_clock() is NOT supposed to wrap-around.
   6.870 + * If it did, it would cause some scheduling hiccups (at the worst).
   6.871 + * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even
   6.872 + * that would happen only once every 5+ years.
   6.873 + *
   6.874 + * The code below basically calculates:
   6.875 + *
   6.876 + *   (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT
   6.877 + *
   6.878 + * except that the multiplication and the shift are done with 128-bit
   6.879 + * intermediate precision so that we can produce a full 64-bit result.
   6.880 + */
   6.881 +GLOBAL_ENTRY(sched_clock)
   6.882 +#ifdef XEN
   6.883 +	movl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET
   6.884 +#else
   6.885 +	addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
   6.886 +#endif
   6.887 +	mov.m r9=ar.itc		// fetch cycle-counter				(35 cyc)
   6.888 +	;;
   6.889 +	ldf8 f8=[r8]
   6.890 +	;;
   6.891 +	setf.sig f9=r9		// certain to stall, so issue it _after_ ldf8...
   6.892 +	;;
   6.893 +	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
   6.894 +	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
   6.895 +	;;
   6.896 +	getf.sig r8=f10		//						(5 cyc)
   6.897 +	getf.sig r9=f11
   6.898 +	;;
   6.899 +	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
   6.900 +	br.ret.sptk.many rp
   6.901 +END(sched_clock)
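The arithmetic that sched_clock performs above is easier to follow in C: a 64x64-bit
multiply with a 128-bit intermediate, followed by a right shift.  A minimal sketch,
assuming a compiler with GCC-style unsigned __int128 support; the shift constant here is
only a placeholder for the real IA64_NSEC_PER_CYC_SHIFT value:

	#include <stdint.h>

	#define NSEC_PER_CYC_SHIFT 30	/* assumed placeholder, not the kernel constant */

	static uint64_t sched_clock_sketch(uint64_t itc, uint64_t nsec_per_cyc)
	{
		/* xmpy.lu/xmpy.hu equivalent: full 128-bit product of the two 64-bit values */
		unsigned __int128 prod = (unsigned __int128)itc * nsec_per_cyc;

		/* shrp equivalent: take the 64 bits starting at NSEC_PER_CYC_SHIFT */
		return (uint64_t)(prod >> NSEC_PER_CYC_SHIFT);
	}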
   6.902 +
   6.903 +GLOBAL_ENTRY(start_kernel_thread)
   6.904 +	.prologue
   6.905 +	.save rp, r0				// this is the end of the call-chain
   6.906 +	.body
   6.907 +	alloc r2 = ar.pfs, 0, 0, 2, 0
   6.908 +	mov out0 = r9
   6.909 +	mov out1 = r11;;
   6.910 +	br.call.sptk.many rp = kernel_thread_helper;;
   6.911 +	mov out0 = r8
   6.912 +	br.call.sptk.many rp = sys_exit;;
   6.913 +1:	br.sptk.few 1b				// not reached
   6.914 +END(start_kernel_thread)
   6.915 +
   6.916 +#ifdef CONFIG_IA64_BRL_EMU
   6.917 +
   6.918 +/*
   6.919 + *  Assembly routines used by brl_emu.c to set preserved register state.
   6.920 + */
   6.921 +
   6.922 +#define SET_REG(reg)				\
   6.923 + GLOBAL_ENTRY(ia64_set_##reg);			\
   6.924 +	alloc r16=ar.pfs,1,0,0,0;		\
   6.925 +	mov reg=r32;				\
   6.926 +	;;					\
   6.927 +	br.ret.sptk.many rp;			\
   6.928 + END(ia64_set_##reg)
   6.929 +
   6.930 +SET_REG(b1);
   6.931 +SET_REG(b2);
   6.932 +SET_REG(b3);
   6.933 +SET_REG(b4);
   6.934 +SET_REG(b5);
   6.935 +
   6.936 +#endif /* CONFIG_IA64_BRL_EMU */
   6.937 +
   6.938 +#ifdef CONFIG_SMP
   6.939 +	/*
   6.940 +	 * This routine handles spinlock contention.  It uses a non-standard calling
   6.941 +	 * convention to avoid converting leaf routines into interior routines.  Because
   6.942 +	 * of this special convention, there are several restrictions:
   6.943 +	 *
    6.944 +	 * - do not use gp-relative variables; this code is called from the kernel
    6.945 +	 *   and from modules, so r1 is undefined.
   6.946 +	 * - do not use stacked registers, the caller owns them.
   6.947 +	 * - do not use the scratch stack space, the caller owns it.
   6.948 +	 * - do not use any registers other than the ones listed below
   6.949 +	 *
   6.950 +	 * Inputs:
   6.951 +	 *   ar.pfs - saved CFM of caller
   6.952 +	 *   ar.ccv - 0 (and available for use)
   6.953 +	 *   r27    - flags from spin_lock_irqsave or 0.  Must be preserved.
   6.954 +	 *   r28    - available for use.
   6.955 +	 *   r29    - available for use.
   6.956 +	 *   r30    - available for use.
   6.957 +	 *   r31    - address of lock, available for use.
   6.958 +	 *   b6     - return address
   6.959 +	 *   p14    - available for use.
   6.960 +	 *   p15    - used to track flag status.
   6.961 +	 *
   6.962 +	 * If you patch this code to use more registers, do not forget to update
   6.963 +	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
   6.964 +	 */
   6.965 +
   6.966 +#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
   6.967 +
   6.968 +GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4)
   6.969 +	.prologue
   6.970 +	.save ar.pfs, r0	// this code effectively has a zero frame size
   6.971 +	.save rp, r28
   6.972 +	.body
   6.973 +	nop 0
   6.974 +	tbit.nz p15,p0=r27,IA64_PSR_I_BIT
   6.975 +	.restore sp		// pop existing prologue after next insn
   6.976 +	mov b6 = r28
   6.977 +	.prologue
   6.978 +	.save ar.pfs, r0
   6.979 +	.altrp b6
   6.980 +	.body
   6.981 +	;;
   6.982 +(p15)	ssm psr.i		// reenable interrupts if they were on
   6.983 +				// DavidM says that srlz.d is slow and is not required in this case
   6.984 +.wait:
   6.985 +	// exponential backoff, kdb, lockmeter etc. go in here
   6.986 +	hint @pause
   6.987 +	ld4 r30=[r31]		// don't use ld4.bias; if it's contended, we won't write the word
   6.988 +	nop 0
   6.989 +	;;
   6.990 +	cmp4.ne p14,p0=r30,r0
   6.991 +(p14)	br.cond.sptk.few .wait
   6.992 +(p15)	rsm psr.i		// disable interrupts if we reenabled them
   6.993 +	br.cond.sptk.few b6	// lock is now free, try to acquire
   6.994 +	.global ia64_spinlock_contention_pre3_4_end	// for kernprof
   6.995 +ia64_spinlock_contention_pre3_4_end:
   6.996 +END(ia64_spinlock_contention_pre3_4)
   6.997 +
   6.998 +#else
   6.999 +
  6.1000 +GLOBAL_ENTRY(ia64_spinlock_contention)
  6.1001 +	.prologue
  6.1002 +	.altrp b6
  6.1003 +	.body
  6.1004 +	tbit.nz p15,p0=r27,IA64_PSR_I_BIT
  6.1005 +	;;
  6.1006 +.wait:
  6.1007 +(p15)	ssm psr.i		// reenable interrupts if they were on
  6.1008 +				// DavidM says that srlz.d is slow and is not required in this case
  6.1009 +.wait2:
  6.1010 +	// exponential backoff, kdb, lockmeter etc. go in here
  6.1011 +	hint @pause
  6.1012 +	ld4 r30=[r31]		// don't use ld4.bias; if it's contended, we won't write the word
  6.1013 +	;;
  6.1014 +	cmp4.ne p14,p0=r30,r0
  6.1015 +	mov r30 = 1
  6.1016 +(p14)	br.cond.sptk.few .wait2
  6.1017 +(p15)	rsm psr.i		// disable interrupts if we reenabled them
  6.1018 +	;;
  6.1019 +	cmpxchg4.acq r30=[r31], r30, ar.ccv
  6.1020 +	;;
  6.1021 +	cmp4.ne p14,p0=r0,r30
  6.1022 +(p14)	br.cond.sptk.few .wait
  6.1023 +
  6.1024 +	br.ret.sptk.many b6	// lock is now taken
  6.1025 +END(ia64_spinlock_contention)
  6.1026 +
  6.1027 +#endif
  6.1028 +
  6.1029 +#endif /* CONFIG_SMP */
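In C terms, the contention path in ia64_spinlock_contention above spins until the lock
word reads zero, then attempts an acquire compare-and-swap and starts over if it lost the
race.  A rough control-flow sketch using a GCC atomic builtin; this is only an
illustration, not the hypervisor's spinlock implementation, and it omits the psr.i
save/restore that p15 tracks above:

	#include <stdint.h>

	/* 0 = free, non-zero = held */
	static void spin_until_acquired(volatile uint32_t *lock)
	{
		for (;;) {
			while (*lock != 0)
				;	/* the hint @pause / ld4 wait loop */
			/* cmpxchg4.acq equivalent: returns the value seen before the swap */
			if (__sync_val_compare_and_swap(lock, 0, 1) == 0)
				return;	/* lock is now taken */
		}
	}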
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/xen/arch/ia64/irq_ia64.c	Tue Aug 02 15:59:09 2005 -0800
     7.3 @@ -0,0 +1,381 @@
     7.4 +/*
     7.5 + * linux/arch/ia64/kernel/irq.c
     7.6 + *
     7.7 + * Copyright (C) 1998-2001 Hewlett-Packard Co
     7.8 + *	Stephane Eranian <eranian@hpl.hp.com>
     7.9 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    7.10 + *
    7.11 + *  6/10/99: Updated to bring in sync with x86 version to facilitate
    7.12 + *	     support for SMP and different interrupt controllers.
    7.13 + *
    7.14 + * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector
    7.15 + *                      PCI to vector allocation routine.
    7.16 + * 04/14/2004 Ashok Raj <ashok.raj@intel.com>
    7.17 + *						Added CPU Hotplug handling for IPF.
    7.18 + */
    7.19 +
    7.20 +#include <linux/config.h>
    7.21 +#include <linux/module.h>
    7.22 +
    7.23 +#include <linux/jiffies.h>
    7.24 +#include <linux/errno.h>
    7.25 +#include <linux/init.h>
    7.26 +#include <linux/interrupt.h>
    7.27 +#include <linux/ioport.h>
    7.28 +#include <linux/kernel_stat.h>
    7.29 +#include <linux/slab.h>
    7.30 +#include <linux/ptrace.h>
    7.31 +#include <linux/random.h>	/* for rand_initialize_irq() */
    7.32 +#include <linux/signal.h>
    7.33 +#include <linux/smp.h>
    7.34 +#include <linux/smp_lock.h>
    7.35 +#include <linux/threads.h>
    7.36 +#include <linux/bitops.h>
    7.37 +
    7.38 +#include <asm/delay.h>
    7.39 +#include <asm/intrinsics.h>
    7.40 +#include <asm/io.h>
    7.41 +#include <asm/hw_irq.h>
    7.42 +#include <asm/machvec.h>
    7.43 +#include <asm/pgtable.h>
    7.44 +#include <asm/system.h>
    7.45 +
    7.46 +#ifdef CONFIG_PERFMON
    7.47 +# include <asm/perfmon.h>
    7.48 +#endif
    7.49 +
    7.50 +#define IRQ_DEBUG	0
    7.51 +
    7.52 +/* default base addr of IPI table */
    7.53 +void __iomem *ipi_base_addr = ((void __iomem *)
    7.54 +			       (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR));
    7.55 +
    7.56 +/*
    7.57 + * Legacy IRQ to IA-64 vector translation table.
    7.58 + */
    7.59 +__u8 isa_irq_to_vector_map[16] = {
    7.60 +	/* 8259 IRQ translation, first 16 entries */
    7.61 +	0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29,
    7.62 +	0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21
    7.63 +};
    7.64 +EXPORT_SYMBOL(isa_irq_to_vector_map);
    7.65 +
    7.66 +static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)];
    7.67 +
    7.68 +int
    7.69 +assign_irq_vector (int irq)
    7.70 +{
    7.71 +	int pos, vector;
    7.72 + again:
    7.73 +	pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
    7.74 +	vector = IA64_FIRST_DEVICE_VECTOR + pos;
    7.75 +	if (vector > IA64_LAST_DEVICE_VECTOR)
    7.76 +		/* XXX could look for sharable vectors instead of panic'ing... */
    7.77 +		panic("assign_irq_vector: out of interrupt vectors!");
    7.78 +	if (test_and_set_bit(pos, ia64_vector_mask))
    7.79 +		goto again;
    7.80 +	return vector;
    7.81 +}
    7.82 +
    7.83 +void
    7.84 +free_irq_vector (int vector)
    7.85 +{
    7.86 +	int pos;
    7.87 +
    7.88 +	if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR)
    7.89 +		return;
    7.90 +
    7.91 +	pos = vector - IA64_FIRST_DEVICE_VECTOR;
    7.92 +	if (!test_and_clear_bit(pos, ia64_vector_mask))
    7.93 +		printk(KERN_WARNING "%s: double free!\n", __FUNCTION__);
    7.94 +}
    7.95 +
    7.96 +#ifdef CONFIG_SMP
    7.97 +#	define IS_RESCHEDULE(vec)	(vec == IA64_IPI_RESCHEDULE)
    7.98 +#else
    7.99 +#	define IS_RESCHEDULE(vec)	(0)
   7.100 +#endif
   7.101 +/*
   7.102 + * That's where the IVT branches when we get an external
   7.103 + * interrupt. This branches to the correct hardware IRQ handler via
   7.104 + * function ptr.
   7.105 + */
   7.106 +void
   7.107 +ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
   7.108 +{
   7.109 +	unsigned long saved_tpr;
   7.110 +
   7.111 +#if IRQ_DEBUG
   7.112 +#ifdef XEN
   7.113 +	xen_debug_irq(vector, regs);
   7.114 +#endif
   7.115 +	{
   7.116 +		unsigned long bsp, sp;
   7.117 +
   7.118 +		/*
   7.119 +		 * Note: if the interrupt happened while executing in
   7.120 +		 * the context switch routine (ia64_switch_to), we may
   7.121 +		 * get a spurious stack overflow here.  This is
   7.122 +		 * because the register and the memory stack are not
   7.123 +		 * switched atomically.
   7.124 +		 */
   7.125 +		bsp = ia64_getreg(_IA64_REG_AR_BSP);
   7.126 +		sp = ia64_getreg(_IA64_REG_SP);
   7.127 +
   7.128 +		if ((sp - bsp) < 1024) {
   7.129 +			static unsigned char count;
   7.130 +			static long last_time;
   7.131 +
   7.132 +			if (jiffies - last_time > 5*HZ)
   7.133 +				count = 0;
   7.134 +			if (++count < 5) {
   7.135 +				last_time = jiffies;
   7.136 +				printk("ia64_handle_irq: DANGER: less than "
   7.137 +				       "1KB of free stack space!!\n"
   7.138 +				       "(bsp=0x%lx, sp=%lx)\n", bsp, sp);
   7.139 +			}
   7.140 +		}
   7.141 +	}
   7.142 +#endif /* IRQ_DEBUG */
   7.143 +
   7.144 +	/*
   7.145 +	 * Always set TPR to limit maximum interrupt nesting depth to
   7.146 +	 * 16 (without this, it would be ~240, which could easily lead
   7.147 +	 * to kernel stack overflows).
   7.148 +	 */
   7.149 +	irq_enter();
   7.150 +	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
   7.151 +	ia64_srlz_d();
   7.152 +	while (vector != IA64_SPURIOUS_INT_VECTOR) {
   7.153 +		if (!IS_RESCHEDULE(vector)) {
   7.154 +			ia64_setreg(_IA64_REG_CR_TPR, vector);
   7.155 +			ia64_srlz_d();
   7.156 +
   7.157 +#ifdef XEN
   7.158 +			if (!xen_do_IRQ(vector))
   7.159 +#endif
   7.160 +			__do_IRQ(local_vector_to_irq(vector), regs);
   7.161 +
   7.162 +			/*
   7.163 +			 * Disable interrupts and send EOI:
   7.164 +			 */
   7.165 +			local_irq_disable();
   7.166 +			ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
   7.167 +		}
   7.168 +		ia64_eoi();
   7.169 +		vector = ia64_get_ivr();
   7.170 +	}
   7.171 +	/*
   7.172 +	 * This must be done *after* the ia64_eoi().  For example, the keyboard softirq
   7.173 +	 * handler needs to be able to wait for further keyboard interrupts, which can't
   7.174 +	 * come through until ia64_eoi() has been done.
   7.175 +	 */
   7.176 +	irq_exit();
   7.177 +}
   7.178 +
   7.179 +#ifdef  CONFIG_VTI
   7.180 +#define vmx_irq_enter()		\
   7.181 +	add_preempt_count(HARDIRQ_OFFSET);
   7.182 +
    7.183 +/* Softirqs will now be checked when leaving the hypervisor; otherwise the
    7.184 + * scheduler irq would be executed too early.
   7.185 + */
   7.186 +#define vmx_irq_exit(void)	\
   7.187 +	sub_preempt_count(HARDIRQ_OFFSET);
   7.188 +/*
   7.189 + * That's where the IVT branches when we get an external
   7.190 + * interrupt. This branches to the correct hardware IRQ handler via
   7.191 + * function ptr.
   7.192 + */
   7.193 +void
   7.194 +vmx_ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
   7.195 +{
   7.196 +	unsigned long saved_tpr;
   7.197 +	int	wake_dom0 = 0;
   7.198 +
   7.199 +
   7.200 +#if IRQ_DEBUG
   7.201 +	{
   7.202 +		unsigned long bsp, sp;
   7.203 +
   7.204 +		/*
   7.205 +		 * Note: if the interrupt happened while executing in
   7.206 +		 * the context switch routine (ia64_switch_to), we may
   7.207 +		 * get a spurious stack overflow here.  This is
   7.208 +		 * because the register and the memory stack are not
   7.209 +		 * switched atomically.
   7.210 +		 */
   7.211 +		bsp = ia64_getreg(_IA64_REG_AR_BSP);
   7.212 +		sp = ia64_getreg(_IA64_REG_AR_SP);
   7.213 +
   7.214 +		if ((sp - bsp) < 1024) {
   7.215 +			static unsigned char count;
   7.216 +			static long last_time;
   7.217 +
   7.218 +			if (jiffies - last_time > 5*HZ)
   7.219 +				count = 0;
   7.220 +			if (++count < 5) {
   7.221 +				last_time = jiffies;
   7.222 +				printk("ia64_handle_irq: DANGER: less than "
   7.223 +				       "1KB of free stack space!!\n"
   7.224 +				       "(bsp=0x%lx, sp=%lx)\n", bsp, sp);
   7.225 +			}
   7.226 +		}
   7.227 +	}
   7.228 +#endif /* IRQ_DEBUG */
   7.229 +
   7.230 +	/*
   7.231 +	 * Always set TPR to limit maximum interrupt nesting depth to
   7.232 +	 * 16 (without this, it would be ~240, which could easily lead
   7.233 +	 * to kernel stack overflows).
   7.234 +	 */
   7.235 +	vmx_irq_enter();
   7.236 +	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
   7.237 +	ia64_srlz_d();
   7.238 +	while (vector != IA64_SPURIOUS_INT_VECTOR) {
   7.239 +	    if (!IS_RESCHEDULE(vector)) {
   7.240 +		ia64_setreg(_IA64_REG_CR_TPR, vector);
   7.241 +		ia64_srlz_d();
   7.242 +
   7.243 +		if (vector != IA64_TIMER_VECTOR) {
   7.244 +			/* FIXME: Leave IRQ re-route later */
   7.245 +			vmx_vcpu_pend_interrupt(dom0->vcpu[0],vector);
   7.246 +			wake_dom0 = 1;
   7.247 +		}
   7.248 +		else {	// FIXME: Handle Timer only now
   7.249 +			__do_IRQ(local_vector_to_irq(vector), regs);
   7.250 +		}
   7.251 +		
   7.252 +		/*
   7.253 +		 * Disable interrupts and send EOI:
   7.254 +		 */
   7.255 +		local_irq_disable();
   7.256 +		ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
   7.257 +	    }
   7.258 +	    else {
   7.259 +                printf("Oops: RESCHEDULE IPI absorbed by HV\n");
   7.260 +            }
   7.261 +	    ia64_eoi();
   7.262 +	    vector = ia64_get_ivr();
   7.263 +	}
   7.264 +	/*
   7.265 +	 * This must be done *after* the ia64_eoi().  For example, the keyboard softirq
   7.266 +	 * handler needs to be able to wait for further keyboard interrupts, which can't
   7.267 +	 * come through until ia64_eoi() has been done.
   7.268 +	 */
   7.269 +	vmx_irq_exit();
   7.270 +	if ( wake_dom0 && current != dom0 ) 
   7.271 +		domain_wake(dom0->vcpu[0]);
   7.272 +}
   7.273 +#endif
   7.274 +
   7.275 +
   7.276 +#ifdef CONFIG_HOTPLUG_CPU
   7.277 +/*
    7.278 + * This function emulates interrupt processing when a CPU is about to be
   7.279 + * brought down.
   7.280 + */
   7.281 +void ia64_process_pending_intr(void)
   7.282 +{
   7.283 +	ia64_vector vector;
   7.284 +	unsigned long saved_tpr;
   7.285 +	extern unsigned int vectors_in_migration[NR_IRQS];
   7.286 +
   7.287 +	vector = ia64_get_ivr();
   7.288 +
   7.289 +	 irq_enter();
   7.290 +	 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
   7.291 +	 ia64_srlz_d();
   7.292 +
   7.293 +	 /*
   7.294 +	  * Perform normal interrupt style processing
   7.295 +	  */
   7.296 +	while (vector != IA64_SPURIOUS_INT_VECTOR) {
   7.297 +		if (!IS_RESCHEDULE(vector)) {
   7.298 +			ia64_setreg(_IA64_REG_CR_TPR, vector);
   7.299 +			ia64_srlz_d();
   7.300 +
   7.301 +			/*
    7.302 +			 * Now try calling the normal ia64_handle_irq path, as it would
    7.303 +			 * have been called from a real interrupt handler.  Passing NULL
    7.304 +			 * for pt_regs should work here.
    7.305 +			 * This could probably share code with ia64_handle_irq.
   7.306 +			 */
   7.307 +			vectors_in_migration[local_vector_to_irq(vector)]=0;
   7.308 +			__do_IRQ(local_vector_to_irq(vector), NULL);
   7.309 +
   7.310 +			/*
   7.311 +			 * Disable interrupts and send EOI
   7.312 +			 */
   7.313 +			local_irq_disable();
   7.314 +			ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
   7.315 +		}
   7.316 +		ia64_eoi();
   7.317 +		vector = ia64_get_ivr();
   7.318 +	}
   7.319 +	irq_exit();
   7.320 +}
   7.321 +#endif
   7.322 +
   7.323 +
   7.324 +#ifdef CONFIG_SMP
   7.325 +extern irqreturn_t handle_IPI (int irq, void *dev_id, struct pt_regs *regs);
   7.326 +
   7.327 +static struct irqaction ipi_irqaction = {
   7.328 +	.handler =	handle_IPI,
   7.329 +	.flags =	SA_INTERRUPT,
   7.330 +	.name =		"IPI"
   7.331 +};
   7.332 +#endif
   7.333 +
   7.334 +void
   7.335 +register_percpu_irq (ia64_vector vec, struct irqaction *action)
   7.336 +{
   7.337 +	irq_desc_t *desc;
   7.338 +	unsigned int irq;
   7.339 +
   7.340 +	for (irq = 0; irq < NR_IRQS; ++irq)
   7.341 +		if (irq_to_vector(irq) == vec) {
   7.342 +			desc = irq_descp(irq);
   7.343 +			desc->status |= IRQ_PER_CPU;
   7.344 +			desc->handler = &irq_type_ia64_lsapic;
   7.345 +			if (action)
   7.346 +				setup_irq(irq, action);
   7.347 +		}
   7.348 +}
   7.349 +
   7.350 +void __init
   7.351 +init_IRQ (void)
   7.352 +{
   7.353 +	register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
   7.354 +#ifdef CONFIG_SMP
   7.355 +	register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
   7.356 +#endif
   7.357 +#ifdef CONFIG_PERFMON
   7.358 +	pfm_init_percpu();
   7.359 +#endif
   7.360 +	platform_irq_init();
   7.361 +}
   7.362 +
   7.363 +void
   7.364 +ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect)
   7.365 +{
   7.366 +	void __iomem *ipi_addr;
   7.367 +	unsigned long ipi_data;
   7.368 +	unsigned long phys_cpu_id;
   7.369 +
   7.370 +#ifdef CONFIG_SMP
   7.371 +	phys_cpu_id = cpu_physical_id(cpu);
   7.372 +#else
   7.373 +	phys_cpu_id = (ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff;
   7.374 +#endif
   7.375 +
   7.376 +	/*
    7.377 +	 * The CPU number is encoded as an 8-bit ID and an 8-bit EID.
   7.378 +	 */
   7.379 +
   7.380 +	ipi_data = (delivery_mode << 8) | (vector & 0xff);
   7.381 +	ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3));
   7.382 +
   7.383 +	writeq(ipi_data, ipi_addr);
   7.384 +}
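For illustration, the offset/data composition done by ia64_send_ipi above can be written
out on its own.  A small sketch with a made-up physical CPU id; it only restates the two
expressions above and is not a separate definition of the hardware register layout:

	#include <stdint.h>

	static void ipi_compose(uint64_t phys_cpu_id, int vector, int delivery_mode,
	                        int redirect, uint64_t *offset, uint64_t *data)
	{
		*data   = ((uint64_t)delivery_mode << 8) | (vector & 0xff);
		*offset = (phys_cpu_id << 4) | ((uint64_t)(redirect & 1) << 3);
		/* e.g. phys_cpu_id 0x0102 (ID=1, EID=2), vector 0xef, no redirect:
		 * offset = 0x1020, data = 0x00ef, written to ipi_base_addr + offset */
	}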
     8.1 --- a/xen/arch/ia64/lib/Makefile	Tue Aug 02 10:20:46 2005 -0700
     8.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.3 @@ -1,44 +0,0 @@
     8.4 -#
     8.5 -# Makefile for ia64-specific library routines..
     8.6 -#
     8.7 -
     8.8 -include $(BASEDIR)/Rules.mk
     8.9 -
    8.10 -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
    8.11 -	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
    8.12 -	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
    8.13 -	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
    8.14 -	flush.o ip_fast_csum.o do_csum.o copy_user.o			\
    8.15 -	memset.o strlen.o memcpy.o 
    8.16 -
    8.17 -default: $(OBJS)
    8.18 -	$(LD) -r -o ia64lib.o $(OBJS)
    8.19 -
    8.20 -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
    8.21 -
    8.22 -__divdi3.o: idiv64.S
    8.23 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    8.24 -
    8.25 -__udivdi3.o: idiv64.S
    8.26 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    8.27 -
    8.28 -__moddi3.o: idiv64.S
    8.29 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    8.30 -
    8.31 -__umoddi3.o: idiv64.S
    8.32 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    8.33 -
    8.34 -__divsi3.o: idiv32.S
    8.35 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
    8.36 -
    8.37 -__udivsi3.o: idiv32.S
    8.38 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
    8.39 -
    8.40 -__modsi3.o: idiv32.S
    8.41 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
    8.42 -
    8.43 -__umodsi3.o: idiv32.S
    8.44 -	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
    8.45 -
    8.46 -clean:
    8.47 -	rm -f *.o *~
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/xen/arch/ia64/linux/cmdline.c	Tue Aug 02 15:59:09 2005 -0800
     9.3 @@ -0,0 +1,120 @@
     9.4 +/*
     9.5 + * linux/lib/cmdline.c
     9.6 + * Helper functions generally used for parsing kernel command line
     9.7 + * and module options.
     9.8 + *
     9.9 + * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c.
    9.10 + *
    9.11 + * This source code is licensed under the GNU General Public License,
    9.12 + * Version 2.  See the file COPYING for more details.
    9.13 + *
    9.14 + * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs
    9.15 + *
    9.16 + */
    9.17 +
    9.18 +#include <linux/module.h>
    9.19 +#include <linux/kernel.h>
    9.20 +#include <linux/string.h>
    9.21 +
    9.22 +
    9.23 +/**
    9.24 + *	get_option - Parse integer from an option string
    9.25 + *	@str: option string
    9.26 + *	@pint: (output) integer value parsed from @str
    9.27 + *
    9.28 + *	Read an int from an option string; if available accept a subsequent
    9.29 + *	comma as well.
    9.30 + *
    9.31 + *	Return values:
    9.32 + *	0 : no int in string
    9.33 + *	1 : int found, no subsequent comma
    9.34 + *	2 : int found including a subsequent comma
    9.35 + */
    9.36 +
    9.37 +int get_option (char **str, int *pint)
    9.38 +{
    9.39 +	char *cur = *str;
    9.40 +
    9.41 +	if (!cur || !(*cur))
    9.42 +		return 0;
    9.43 +	*pint = simple_strtol (cur, str, 0);
    9.44 +	if (cur == *str)
    9.45 +		return 0;
    9.46 +	if (**str == ',') {
    9.47 +		(*str)++;
    9.48 +		return 2;
    9.49 +	}
    9.50 +
    9.51 +	return 1;
    9.52 +}
    9.53 +
    9.54 +/**
    9.55 + *	get_options - Parse a string into a list of integers
    9.56 + *	@str: String to be parsed
    9.57 + *	@nints: size of integer array
    9.58 + *	@ints: integer array
    9.59 + *
    9.60 + *	This function parses a string containing a comma-separated
    9.61 + *	list of integers.  The parse halts when the array is
    9.62 + *	full, or when no more numbers can be retrieved from the
    9.63 + *	string.
    9.64 + *
    9.65 + *	Return value is the character in the string which caused
    9.66 + *	the parse to end (typically a null terminator, if @str is
    9.67 + *	completely parseable).
    9.68 + */
    9.69 + 
    9.70 +char *get_options(const char *str, int nints, int *ints)
    9.71 +{
    9.72 +	int res, i = 1;
    9.73 +
    9.74 +	while (i < nints) {
    9.75 +		res = get_option ((char **)&str, ints + i);
    9.76 +		if (res == 0)
    9.77 +			break;
    9.78 +		i++;
    9.79 +		if (res == 1)
    9.80 +			break;
    9.81 +	}
    9.82 +	ints[0] = i - 1;
    9.83 +	return (char *)str;
    9.84 +}
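A short usage sketch for the two parsers above (hypothetical option strings; the
declarations of get_option and get_options are assumed to be in scope):

	#include <stdio.h>

	static void parse_example(void)
	{
		char buf[] = "1,2,3";
		char *s = buf;
		int val, ints[5];	/* ints[0] receives the count of parsed values */

		get_option(&s, &val);			/* val == 1, returns 2: a comma follows */
		get_options("10,20,30", 5, ints);	/* ints == {3, 10, 20, 30, ...} */
		printf("%d %d\n", val, ints[0]);	/* prints "1 3" */
	}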
    9.85 +
    9.86 +/**
    9.87 + *	memparse - parse a string with mem suffixes into a number
    9.88 + *	@ptr: Where parse begins
    9.89 + *	@retptr: (output) Pointer to next char after parse completes
    9.90 + *
    9.91 + *	Parses a string into a number.  The number stored at @ptr is
    9.92 + *	potentially suffixed with %K (for kilobytes, or 1024 bytes),
    9.93 + *	%M (for megabytes, or 1048576 bytes), or %G (for gigabytes, or
    9.94 + *	1073741824).  If the number is suffixed with K, M, or G, then
    9.95 + *	the return value is the number multiplied by one kilobyte, one
    9.96 + *	megabyte, or one gigabyte, respectively.
    9.97 + */
    9.98 +
    9.99 +unsigned long long memparse (char *ptr, char **retptr)
   9.100 +{
   9.101 +	unsigned long long ret = simple_strtoull (ptr, retptr, 0);
   9.102 +
   9.103 +	switch (**retptr) {
   9.104 +	case 'G':
   9.105 +	case 'g':
   9.106 +		ret <<= 10;
   9.107 +	case 'M':
   9.108 +	case 'm':
   9.109 +		ret <<= 10;
   9.110 +	case 'K':
   9.111 +	case 'k':
   9.112 +		ret <<= 10;
   9.113 +		(*retptr)++;
   9.114 +	default:
   9.115 +		break;
   9.116 +	}
   9.117 +	return ret;
   9.118 +}
   9.119 +
   9.120 +
   9.121 +EXPORT_SYMBOL(memparse);
   9.122 +EXPORT_SYMBOL(get_option);
   9.123 +EXPORT_SYMBOL(get_options);
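memparse above accepts K, M and G suffixes by cascading the shift cases; a small
illustration (hypothetical inputs, memparse assumed in scope):

	#include <stdio.h>

	static void memparse_example(void)
	{
		char kbuf[] = "64K", mbuf[] = "2M", plain[] = "512";
		char *end;

		printf("%llu\n", memparse(kbuf, &end));		/* 64 << 10 == 65536 */
		printf("%llu\n", memparse(mbuf, &end));		/* 2 << 20 == 2097152 */
		printf("%llu\n", memparse(plain, &end));	/* no suffix: 512 */
	}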
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/xen/arch/ia64/linux/efi_stub.S	Tue Aug 02 15:59:09 2005 -0800
    10.3 @@ -0,0 +1,86 @@
    10.4 +/*
    10.5 + * EFI call stub.
    10.6 + *
    10.7 + * Copyright (C) 1999-2001 Hewlett-Packard Co
    10.8 + *	David Mosberger <davidm@hpl.hp.com>
    10.9 + *
   10.10 + * This stub allows us to make EFI calls in physical mode with interrupts
   10.11 + * turned off.  We need this because we can't call SetVirtualMap() until
   10.12 + * the kernel has booted far enough to allow allocation of struct vma_struct
   10.13 + * entries (which we would need to map stuff with memory attributes other
   10.14 + * than uncached or writeback...).  Since the GetTime() service gets called
   10.15 + * earlier than that, we need to be able to make physical mode EFI calls from
   10.16 + * the kernel.
   10.17 + */
   10.18 +
   10.19 +/*
   10.20 + * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
   10.21 + * Abstraction Layer Specification", revision 2.6e).  Note that
   10.22 + * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
   10.23 + * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
   10.24 + * (the br.ia instruction fails unless psr.dfl and psr.dfh are
   10.25 + * cleared).  Fortunately, SAL promises not to touch the floating
   10.26 + * point regs, so at least we don't have to save f2-f127.
   10.27 + */
   10.28 +#define PSR_BITS_TO_CLEAR						\
   10.29 +	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT |		\
   10.30 +	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	\
   10.31 +	 IA64_PSR_DFL | IA64_PSR_DFH)
   10.32 +
   10.33 +#define PSR_BITS_TO_SET							\
   10.34 +	(IA64_PSR_BN)
   10.35 +
   10.36 +#include <asm/processor.h>
   10.37 +#include <asm/asmmacro.h>
   10.38 +
   10.39 +/*
   10.40 + * Inputs:
   10.41 + *	in0 = address of function descriptor of EFI routine to call
   10.42 + *	in1..in7 = arguments to routine
   10.43 + *
   10.44 + * Outputs:
   10.45 + *	r8 = EFI_STATUS returned by called function
   10.46 + */
   10.47 +
   10.48 +GLOBAL_ENTRY(efi_call_phys)
   10.49 +	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
   10.50 +	alloc loc1=ar.pfs,8,7,7,0
   10.51 +	ld8 r2=[in0],8			// load EFI function's entry point
   10.52 +	mov loc0=rp
   10.53 +	.body
   10.54 +	;;
   10.55 +	mov loc2=gp			// save global pointer
   10.56 +	mov loc4=ar.rsc			// save RSE configuration
   10.57 +	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
   10.58 +	;;
   10.59 +	ld8 gp=[in0]			// load EFI function's global pointer
   10.60 +	movl r16=PSR_BITS_TO_CLEAR
   10.61 +	mov loc3=psr			// save processor status word
   10.62 +	movl r17=PSR_BITS_TO_SET
   10.63 +	;;
   10.64 +	or loc3=loc3,r17
   10.65 +	mov b6=r2
   10.66 +	;;
   10.67 +	andcm r16=loc3,r16		// get psr with IT, DT, and RT bits cleared
   10.68 +	br.call.sptk.many rp=ia64_switch_mode_phys
   10.69 +.ret0:	mov out4=in5
   10.70 +	mov out0=in1
   10.71 +	mov out1=in2
   10.72 +	mov out2=in3
   10.73 +	mov out3=in4
   10.74 +	mov out5=in6
   10.75 +	mov out6=in7
   10.76 +	mov loc5=r19
   10.77 +	mov loc6=r20
   10.78 +	br.call.sptk.many rp=b6		// call the EFI function
   10.79 +.ret1:	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
   10.80 +	mov r16=loc3
   10.81 +	mov r19=loc5
   10.82 +	mov r20=loc6
   10.83 +	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
   10.84 +.ret2:	mov ar.rsc=loc4			// restore RSE configuration
   10.85 +	mov ar.pfs=loc1
   10.86 +	mov rp=loc0
   10.87 +	mov gp=loc2
   10.88 +	br.ret.sptk.many rp
   10.89 +END(efi_call_phys)
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/xen/arch/ia64/linux/extable.c	Tue Aug 02 15:59:09 2005 -0800
    11.3 @@ -0,0 +1,93 @@
    11.4 +/*
    11.5 + * Kernel exception handling table support.  Derived from arch/alpha/mm/extable.c.
    11.6 + *
    11.7 + * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
    11.8 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    11.9 + */
   11.10 +
   11.11 +#include <linux/config.h>
   11.12 +
   11.13 +#include <asm/uaccess.h>
   11.14 +#include <asm/module.h>
   11.15 +
   11.16 +static inline int
   11.17 +compare_entries (struct exception_table_entry *l, struct exception_table_entry *r)
   11.18 +{
   11.19 +	u64 lip = (u64) &l->addr + l->addr;
   11.20 +	u64 rip = (u64) &r->addr + r->addr;
   11.21 +
   11.22 +	if (lip < rip)
   11.23 +		return -1;
   11.24 +	if (lip == rip)
   11.25 +		return 0;
   11.26 +	else
   11.27 +		return 1;
   11.28 +}
   11.29 +
   11.30 +static inline void
   11.31 +swap_entries (struct exception_table_entry *l, struct exception_table_entry *r)
   11.32 +{
   11.33 +	u64 delta = (u64) r - (u64) l;
   11.34 +	struct exception_table_entry tmp;
   11.35 +
   11.36 +	tmp = *l;
   11.37 +	l->addr = r->addr + delta;
   11.38 +	l->cont = r->cont + delta;
   11.39 +	r->addr = tmp.addr - delta;
   11.40 +	r->cont = tmp.cont - delta;
   11.41 +}
   11.42 +
   11.43 +/*
   11.44 + * Sort the exception table.  It's usually already sorted, but there may be unordered
   11.45 + * entries due to multiple text sections (such as the .init text section).  Note that the
   11.46 + * exception-table-entries contain location-relative addresses, which requires a bit of
   11.47 + * care during sorting to avoid overflows in the offset members (e.g., it would not be
   11.48 + * safe to make a temporary copy of an exception-table entry on the stack, because the
   11.49 + * stack may be more than 2GB away from the exception-table).
   11.50 + */
   11.51 +void
   11.52 +sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish)
   11.53 +{
   11.54 +	struct exception_table_entry *p, *q;
   11.55 +
   11.56 + 	/* insertion sort */
   11.57 +	for (p = start + 1; p < finish; ++p)
    11.58 +		/* start .. p-1 is sorted; push p down to its proper place */
   11.59 +		for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q)
   11.60 +			swap_entries(&q[0], &q[-1]);
   11.61 +}
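The entries handled here store location-relative (self-relative) offsets rather than
absolute pointers, which is why compare_entries above and search_extable below compute
"(u64) &field + field".  A minimal decoding sketch; the struct layout shown is an
assumption for illustration, not a definition taken from the headers:

	#include <stdint.h>

	struct extable_entry_sketch {
		int32_t addr;	/* offset from &addr to the faulting instruction */
		int32_t cont;	/* offset from &cont to the continuation point */
	};

	static uint64_t entry_ip(const struct extable_entry_sketch *e)
	{
		/* location-relative: the stored offset is relative to the field itself */
		return (uint64_t)(uintptr_t)&e->addr + (int64_t)e->addr;
	}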
   11.62 +
   11.63 +const struct exception_table_entry *
   11.64 +search_extable (const struct exception_table_entry *first,
   11.65 +		const struct exception_table_entry *last,
   11.66 +		unsigned long ip)
   11.67 +{
   11.68 +	const struct exception_table_entry *mid;
   11.69 +	unsigned long mid_ip;
   11.70 +	long diff;
   11.71 +
   11.72 +        while (first <= last) {
   11.73 +		mid = &first[(last - first)/2];
   11.74 +		mid_ip = (u64) &mid->addr + mid->addr;
   11.75 +		diff = mid_ip - ip;
   11.76 +                if (diff == 0)
   11.77 +                        return mid;
   11.78 +                else if (diff < 0)
   11.79 +                        first = mid + 1;
   11.80 +                else
   11.81 +                        last = mid - 1;
   11.82 +        }
   11.83 +        return NULL;
   11.84 +}
   11.85 +
   11.86 +void
   11.87 +ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
   11.88 +{
   11.89 +	long fix = (u64) &e->cont + e->cont;
   11.90 +
   11.91 +	regs->r8 = -EFAULT;
   11.92 +	if (fix & 4)
   11.93 +		regs->r9 = 0;
   11.94 +	regs->cr_iip = fix & ~0xf;
   11.95 +	ia64_psr(regs)->ri = fix & 0x3;		/* set continuation slot number */
   11.96 +}
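Editor's note: the exception-table code above stores location-relative (self-relative) offsets and recovers the effective IP as (u64)&entry->addr + entry->addr, both when sorting and when binary-searching (which is also why an entry must never be copied far away, e.g. onto a distant stack). A minimal standalone C sketch of the same idea follows; the struct layout, function names and demo values are illustrative only, not the kernel definitions.

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch only: a self-relative "exception table" entry.  The real kernel
     * structure also carries a continuation offset (->cont). */
    struct entry { int32_t addr; };

    static uint64_t entry_ip(const struct entry *e)
    {
            return (uint64_t)&e->addr + e->addr;    /* self-relative -> absolute */
    }

    /* Binary search over a table sorted by entry_ip(), like search_extable(). */
    static const struct entry *find(const struct entry *first,
                                    const struct entry *last, uint64_t ip)
    {
            while (first <= last) {
                    const struct entry *mid = first + (last - first) / 2;
                    uint64_t mid_ip = entry_ip(mid);
                    if (mid_ip == ip)
                            return mid;
                    if (mid_ip < ip)
                            first = mid + 1;
                    else
                            last = mid - 1;
            }
            return NULL;
    }

    int main(void)
    {
            static struct entry tab[3];
            /* Fabricate three "faulting IPs" close to the table itself so the
             * 32-bit self-relative offsets cannot overflow. */
            for (int i = 0; i < 3; i++)
                    tab[i].addr = 0x40 * (i + 1);
            const struct entry *hit = find(&tab[0], &tab[2], entry_ip(&tab[1]));
            printf("hit index: %d\n", hit ? (int)(hit - tab) : -1);  /* prints 1 */
            return 0;
    }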
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/xen/arch/ia64/linux/hpsim.S	Tue Aug 02 15:59:09 2005 -0800
    12.3 @@ -0,0 +1,10 @@
    12.4 +#include <asm/asmmacro.h>
    12.5 +
    12.6 +/*
    12.7 + * Simulator system call.
    12.8 + */
    12.9 +GLOBAL_ENTRY(ia64_ssc)
   12.10 +	mov r15=r36
   12.11 +	break 0x80001
   12.12 +	br.ret.sptk.many rp
   12.13 +END(ia64_ssc)
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/xen/arch/ia64/linux/ia64_ksyms.c	Tue Aug 02 15:59:09 2005 -0800
    13.3 @@ -0,0 +1,127 @@
    13.4 +/*
    13.5 + * Architecture-specific kernel symbols
    13.6 + *
     13.7 + * Don't put any exports here unless the symbol is defined in an assembler file.
    13.8 + * All other exports should be put directly after the definition.
    13.9 + */
   13.10 +
   13.11 +#include <linux/config.h>
   13.12 +#include <linux/module.h>
   13.13 +
   13.14 +#include <linux/string.h>
   13.15 +EXPORT_SYMBOL(memset);
   13.16 +EXPORT_SYMBOL(memchr);
   13.17 +EXPORT_SYMBOL(memcmp);
   13.18 +EXPORT_SYMBOL(memcpy);
   13.19 +EXPORT_SYMBOL(memmove);
   13.20 +EXPORT_SYMBOL(memscan);
   13.21 +EXPORT_SYMBOL(strcat);
   13.22 +EXPORT_SYMBOL(strchr);
   13.23 +EXPORT_SYMBOL(strcmp);
   13.24 +EXPORT_SYMBOL(strcpy);
   13.25 +EXPORT_SYMBOL(strlen);
   13.26 +EXPORT_SYMBOL(strncat);
   13.27 +EXPORT_SYMBOL(strncmp);
   13.28 +EXPORT_SYMBOL(strncpy);
   13.29 +EXPORT_SYMBOL(strnlen);
   13.30 +EXPORT_SYMBOL(strrchr);
   13.31 +EXPORT_SYMBOL(strstr);
   13.32 +EXPORT_SYMBOL(strpbrk);
   13.33 +
   13.34 +#include <asm/checksum.h>
   13.35 +EXPORT_SYMBOL(ip_fast_csum);		/* hand-coded assembly */
   13.36 +
   13.37 +#include <asm/semaphore.h>
   13.38 +EXPORT_SYMBOL(__down);
   13.39 +EXPORT_SYMBOL(__down_interruptible);
   13.40 +EXPORT_SYMBOL(__down_trylock);
   13.41 +EXPORT_SYMBOL(__up);
   13.42 +
   13.43 +#include <asm/page.h>
   13.44 +EXPORT_SYMBOL(clear_page);
   13.45 +
   13.46 +#ifdef CONFIG_VIRTUAL_MEM_MAP
   13.47 +#include <linux/bootmem.h>
   13.48 +EXPORT_SYMBOL(max_low_pfn);	/* defined by bootmem.c, but not exported by generic code */
   13.49 +#endif
   13.50 +
   13.51 +#include <asm/processor.h>
   13.52 +EXPORT_SYMBOL(per_cpu__cpu_info);
   13.53 +#ifdef CONFIG_SMP
   13.54 +EXPORT_SYMBOL(per_cpu__local_per_cpu_offset);
   13.55 +#endif
   13.56 +
   13.57 +#include <asm/uaccess.h>
   13.58 +EXPORT_SYMBOL(__copy_user);
   13.59 +EXPORT_SYMBOL(__do_clear_user);
   13.60 +EXPORT_SYMBOL(__strlen_user);
   13.61 +EXPORT_SYMBOL(__strncpy_from_user);
   13.62 +EXPORT_SYMBOL(__strnlen_user);
   13.63 +
   13.64 +#include <asm/unistd.h>
   13.65 +EXPORT_SYMBOL(__ia64_syscall);
   13.66 +
   13.67 +/* from arch/ia64/lib */
   13.68 +extern void __divsi3(void);
   13.69 +extern void __udivsi3(void);
   13.70 +extern void __modsi3(void);
   13.71 +extern void __umodsi3(void);
   13.72 +extern void __divdi3(void);
   13.73 +extern void __udivdi3(void);
   13.74 +extern void __moddi3(void);
   13.75 +extern void __umoddi3(void);
   13.76 +
   13.77 +EXPORT_SYMBOL(__divsi3);
   13.78 +EXPORT_SYMBOL(__udivsi3);
   13.79 +EXPORT_SYMBOL(__modsi3);
   13.80 +EXPORT_SYMBOL(__umodsi3);
   13.81 +EXPORT_SYMBOL(__divdi3);
   13.82 +EXPORT_SYMBOL(__udivdi3);
   13.83 +EXPORT_SYMBOL(__moddi3);
   13.84 +EXPORT_SYMBOL(__umoddi3);
   13.85 +
   13.86 +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
   13.87 +extern void xor_ia64_2(void);
   13.88 +extern void xor_ia64_3(void);
   13.89 +extern void xor_ia64_4(void);
   13.90 +extern void xor_ia64_5(void);
   13.91 +
   13.92 +EXPORT_SYMBOL(xor_ia64_2);
   13.93 +EXPORT_SYMBOL(xor_ia64_3);
   13.94 +EXPORT_SYMBOL(xor_ia64_4);
   13.95 +EXPORT_SYMBOL(xor_ia64_5);
   13.96 +#endif
   13.97 +
   13.98 +#include <asm/pal.h>
   13.99 +EXPORT_SYMBOL(ia64_pal_call_phys_stacked);
  13.100 +EXPORT_SYMBOL(ia64_pal_call_phys_static);
  13.101 +EXPORT_SYMBOL(ia64_pal_call_stacked);
  13.102 +EXPORT_SYMBOL(ia64_pal_call_static);
  13.103 +EXPORT_SYMBOL(ia64_load_scratch_fpregs);
  13.104 +EXPORT_SYMBOL(ia64_save_scratch_fpregs);
  13.105 +
  13.106 +#include <asm/unwind.h>
  13.107 +EXPORT_SYMBOL(unw_init_running);
  13.108 +
  13.109 +#ifdef ASM_SUPPORTED
  13.110 +# ifdef CONFIG_SMP
  13.111 +#  if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
  13.112 +/*
  13.113 + * This is not a normal routine and we don't want a function descriptor for it, so we use
  13.114 + * a fake declaration here.
  13.115 + */
  13.116 +extern char ia64_spinlock_contention_pre3_4;
  13.117 +EXPORT_SYMBOL(ia64_spinlock_contention_pre3_4);
  13.118 +#  else
  13.119 +/*
  13.120 + * This is not a normal routine and we don't want a function descriptor for it, so we use
  13.121 + * a fake declaration here.
  13.122 + */
  13.123 +extern char ia64_spinlock_contention;
  13.124 +EXPORT_SYMBOL(ia64_spinlock_contention);
  13.125 +#  endif
  13.126 +# endif
  13.127 +#endif
  13.128 +
  13.129 +extern char ia64_ivt[];
  13.130 +EXPORT_SYMBOL(ia64_ivt);
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/xen/arch/ia64/linux/irq_lsapic.c	Tue Aug 02 15:59:09 2005 -0800
    14.3 @@ -0,0 +1,37 @@
    14.4 +/*
    14.5 + * LSAPIC Interrupt Controller
    14.6 + *
    14.7 + * This takes care of interrupts that are generated by the CPU's
    14.8 + * internal Streamlined Advanced Programmable Interrupt Controller
    14.9 + * (LSAPIC), such as the ITC and IPI interrupts.
    14.10 + *
   14.11 + * Copyright (C) 1999 VA Linux Systems
   14.12 + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
   14.13 + * Copyright (C) 2000 Hewlett-Packard Co
   14.14 + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
   14.15 + */
   14.16 +
   14.17 +#include <linux/sched.h>
   14.18 +#include <linux/irq.h>
   14.19 +
   14.20 +static unsigned int
   14.21 +lsapic_noop_startup (unsigned int irq)
   14.22 +{
   14.23 +	return 0;
   14.24 +}
   14.25 +
   14.26 +static void
   14.27 +lsapic_noop (unsigned int irq)
   14.28 +{
    14.29 +	/* nothing to do... */
   14.30 +}
   14.31 +
   14.32 +struct hw_interrupt_type irq_type_ia64_lsapic = {
   14.33 +	.typename =	"LSAPIC",
   14.34 +	.startup =	lsapic_noop_startup,
   14.35 +	.shutdown =	lsapic_noop,
   14.36 +	.enable =	lsapic_noop,
   14.37 +	.disable =	lsapic_noop,
   14.38 +	.ack =		lsapic_noop,
   14.39 +	.end =		lsapic_noop
   14.40 +};
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/xen/arch/ia64/linux/lib/Makefile	Tue Aug 02 15:59:09 2005 -0800
    15.3 @@ -0,0 +1,44 @@
    15.4 +#
    15.5 +# Makefile for ia64-specific library routines..
    15.6 +#
    15.7 +
    15.8 +include $(BASEDIR)/Rules.mk
    15.9 +
   15.10 +OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
   15.11 +	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
   15.12 +	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
   15.13 +	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
   15.14 +	flush.o ip_fast_csum.o do_csum.o copy_user.o			\
   15.15 +	memset.o strlen.o memcpy.o 
   15.16 +
   15.17 +default: $(OBJS)
   15.18 +	$(LD) -r -o ia64lib.o $(OBJS)
   15.19 +
   15.20 +AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
   15.21 +
   15.22 +__divdi3.o: idiv64.S
   15.23 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
   15.24 +
   15.25 +__udivdi3.o: idiv64.S
   15.26 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
   15.27 +
   15.28 +__moddi3.o: idiv64.S
   15.29 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
   15.30 +
   15.31 +__umoddi3.o: idiv64.S
   15.32 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
   15.33 +
   15.34 +__divsi3.o: idiv32.S
   15.35 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
   15.36 +
   15.37 +__udivsi3.o: idiv32.S
   15.38 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
   15.39 +
   15.40 +__modsi3.o: idiv32.S
   15.41 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
   15.42 +
   15.43 +__umodsi3.o: idiv32.S
   15.44 +	$(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
   15.45 +
   15.46 +clean:
   15.47 +	rm -f *.o *~
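Editor's note: the rules above build four objects from each of idiv32.S and idiv64.S purely through preprocessor defines (-DMODULO, -DUNSIGNED). A hedged C illustration of the same single-source/multiple-objects pattern follows; the file name op.c and the function name int_op are placeholders (in the real idiv sources the exported entry-point name itself also changes with the defines, so the objects can be linked together).

    /* op.c -- compile this one file four times with different defines, e.g.:
     *   cc -c op.c                      -o div_signed.o
     *   cc -c op.c -DUNSIGNED           -o div_unsigned.o
     *   cc -c op.c -DMODULO             -o mod_signed.o
     *   cc -c op.c -DMODULO -DUNSIGNED  -o mod_unsigned.o
     */
    #ifdef UNSIGNED
    typedef unsigned int op_t;
    #else
    typedef int op_t;
    #endif

    op_t int_op(op_t a, op_t b)
    {
    #ifdef MODULO
            return a % b;           /* __modsi3 / __umodsi3 flavour */
    #else
            return a / b;           /* __divsi3 / __udivsi3 flavour */
    #endif
    }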
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/xen/arch/ia64/linux/lib/bitop.c	Tue Aug 02 15:59:09 2005 -0800
    16.3 @@ -0,0 +1,88 @@
    16.4 +#include <linux/compiler.h>
    16.5 +#include <linux/types.h>
    16.6 +#include <asm/intrinsics.h>
    16.7 +#include <linux/module.h>
    16.8 +#include <linux/bitops.h>
    16.9 +
   16.10 +/*
   16.11 + * Find next zero bit in a bitmap reasonably efficiently..
   16.12 + */
   16.13 +
   16.14 +int __find_next_zero_bit (const void *addr, unsigned long size, unsigned long offset)
   16.15 +{
   16.16 +	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
   16.17 +	unsigned long result = offset & ~63UL;
   16.18 +	unsigned long tmp;
   16.19 +
   16.20 +	if (offset >= size)
   16.21 +		return size;
   16.22 +	size -= result;
   16.23 +	offset &= 63UL;
   16.24 +	if (offset) {
   16.25 +		tmp = *(p++);
   16.26 +		tmp |= ~0UL >> (64-offset);
   16.27 +		if (size < 64)
   16.28 +			goto found_first;
   16.29 +		if (~tmp)
   16.30 +			goto found_middle;
   16.31 +		size -= 64;
   16.32 +		result += 64;
   16.33 +	}
   16.34 +	while (size & ~63UL) {
   16.35 +		if (~(tmp = *(p++)))
   16.36 +			goto found_middle;
   16.37 +		result += 64;
   16.38 +		size -= 64;
   16.39 +	}
   16.40 +	if (!size)
   16.41 +		return result;
   16.42 +	tmp = *p;
   16.43 +found_first:
   16.44 +	tmp |= ~0UL << size;
   16.45 +	if (tmp == ~0UL)		/* any bits zero? */
   16.46 +		return result + size;	/* nope */
   16.47 +found_middle:
   16.48 +	return result + ffz(tmp);
   16.49 +}
   16.50 +EXPORT_SYMBOL(__find_next_zero_bit);
   16.51 +
   16.52 +/*
   16.53 + * Find next bit in a bitmap reasonably efficiently..
   16.54 + */
   16.55 +int __find_next_bit(const void *addr, unsigned long size, unsigned long offset)
   16.56 +{
   16.57 +	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
   16.58 +	unsigned long result = offset & ~63UL;
   16.59 +	unsigned long tmp;
   16.60 +
   16.61 +	if (offset >= size)
   16.62 +		return size;
   16.63 +	size -= result;
   16.64 +	offset &= 63UL;
   16.65 +	if (offset) {
   16.66 +		tmp = *(p++);
   16.67 +		tmp &= ~0UL << offset;
   16.68 +		if (size < 64)
   16.69 +			goto found_first;
   16.70 +		if (tmp)
   16.71 +			goto found_middle;
   16.72 +		size -= 64;
   16.73 +		result += 64;
   16.74 +	}
   16.75 +	while (size & ~63UL) {
   16.76 +		if ((tmp = *(p++)))
   16.77 +			goto found_middle;
   16.78 +		result += 64;
   16.79 +		size -= 64;
   16.80 +	}
   16.81 +	if (!size)
   16.82 +		return result;
   16.83 +	tmp = *p;
   16.84 +  found_first:
   16.85 +	tmp &= ~0UL >> (64-size);
   16.86 +	if (tmp == 0UL)		/* Are any bits set? */
   16.87 +		return result + size; /* Nope. */
   16.88 +  found_middle:
   16.89 +	return result + __ffs(tmp);
   16.90 +}
   16.91 +EXPORT_SYMBOL(__find_next_bit);
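Editor's note: for reference, here is an unoptimized C rendering of the next-zero-bit scan above, checking one bit at a time instead of one 64-bit word at a time. It is a sketch, assuming a 64-bit unsigned long like the kernel code, and is not the kernel interface.

    #include <stdio.h>

    static unsigned long find_next_zero_bit_slow(const unsigned long *map,
                                                 unsigned long size,
                                                 unsigned long offset)
    {
            /* Bit i lives in word i/64, at position i%64 (LSB first). */
            for (unsigned long i = offset; i < size; i++)
                    if (!((map[i / 64] >> (i % 64)) & 1UL))
                            return i;
            return size;                    /* no zero bit found */
    }

    int main(void)
    {
            unsigned long map[2] = { ~0UL, ~0UL };
            map[1] &= ~(1UL << 5);          /* clear bit 69 of the bitmap */
            printf("%lu\n", find_next_zero_bit_slow(map, 128, 0));  /* prints 69 */
            return 0;
    }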
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/xen/arch/ia64/linux/lib/carta_random.S	Tue Aug 02 15:59:09 2005 -0800
    17.3 @@ -0,0 +1,54 @@
    17.4 +/*
    17.5 + * Fast, simple, yet decent quality random number generator based on
    17.6 + * a paper by David G. Carta ("Two Fast Implementations of the
    17.7 + * `Minimal Standard' Random Number Generator," Communications of the
    17.8 + * ACM, January, 1990).
    17.9 + *
   17.10 + * Copyright (C) 2002 Hewlett-Packard Co
   17.11 + *	David Mosberger-Tang <davidm@hpl.hp.com>
   17.12 + */
   17.13 +
   17.14 +#include <asm/asmmacro.h>
   17.15 +
   17.16 +#define a	r2
   17.17 +#define m	r3
   17.18 +#define lo	r8
   17.19 +#define hi	r9
   17.20 +#define t0	r16
   17.21 +#define t1	r17
   17.22 +#define	seed	r32
   17.23 +
   17.24 +GLOBAL_ENTRY(carta_random32)
   17.25 +	movl	a = (16807 << 16) | 16807
   17.26 +	;;
   17.27 +	pmpyshr2.u t0 = a, seed, 0
   17.28 +	pmpyshr2.u t1 = a, seed, 16
   17.29 +	;;
   17.30 +	unpack2.l t0 = t1, t0
   17.31 +	dep	m = -1, r0, 0, 31
   17.32 +	;;
   17.33 +	zxt4	lo = t0
   17.34 +	shr.u	hi = t0, 32
   17.35 +	;;
   17.36 +	dep	t0 = 0, hi, 15, 49	// t0 = (hi & 0x7fff)
   17.37 +	;;
   17.38 +	shl	t0 = t0, 16		// t0 = (hi & 0x7fff) << 16
   17.39 +	shr	t1 = hi, 15		// t1 = (hi >> 15)
   17.40 +	;;
   17.41 +	add	lo = lo, t0
   17.42 +	;;
   17.43 +	cmp.gtu	p6, p0 = lo, m
   17.44 +	;;
   17.45 +(p6)	and	lo = lo, m
   17.46 +	;;
   17.47 +(p6)	add	lo = 1, lo
   17.48 +	;;
   17.49 +	add	lo = lo, t1
   17.50 +	;;
   17.51 +	cmp.gtu p6, p0 = lo, m
   17.52 +	;;
   17.53 +(p6)	and	lo = lo, m
   17.54 +	;;
   17.55 +(p6)	add	lo = 1, lo
   17.56 +	br.ret.sptk.many rp
   17.57 +END(carta_random32)
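Editor's note: carta_random32 implements the "minimal standard" generator (a = 16807, m = 2^31 - 1) using Carta's trick of folding the high part of the product back in instead of dividing, since 2^31 == 1 (mod m). The following C sketch performs the same computation with a plain 64-bit multiply in place of the parallel 16-bit multiplies; for seeds in the usual range 1 .. 2^31-2 it produces the same sequence.

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t carta_random32_c(uint32_t seed)
    {
            const uint64_t m = 0x7fffffffULL;       /* 2^31 - 1 */
            uint64_t q = 16807ULL * seed;           /* at most a 47-bit product */
            /* Carta: fold the bits above bit 30 back in instead of dividing. */
            uint64_t r = (q & m) + (q >> 31);
            if (r > m)
                    r -= m;                 /* same effect as the (lo & m) + 1 step */
            return (uint32_t)r;
    }

    int main(void)
    {
            uint32_t s = 1;
            for (int i = 0; i < 3; i++) {
                    s = carta_random32_c(s);
                    printf("%u\n", s);      /* 16807, 282475249, 1622650073 */
            }
            return 0;
    }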
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/xen/arch/ia64/linux/lib/checksum.c	Tue Aug 02 15:59:09 2005 -0800
    18.3 @@ -0,0 +1,102 @@
    18.4 +/*
    18.5 + * Network checksum routines
    18.6 + *
    18.7 + * Copyright (C) 1999, 2003 Hewlett-Packard Co
    18.8 + *	Stephane Eranian <eranian@hpl.hp.com>
    18.9 + *
   18.10 + * Most of the code coming from arch/alpha/lib/checksum.c
   18.11 + *
   18.12 + * This file contains network checksum routines that are better done
   18.13 + * in an architecture-specific manner due to speed..
   18.14 + */
   18.15 +
   18.16 +#include <linux/module.h>
   18.17 +#include <linux/string.h>
   18.18 +
   18.19 +#include <asm/byteorder.h>
   18.20 +
   18.21 +static inline unsigned short
   18.22 +from64to16 (unsigned long x)
   18.23 +{
   18.24 +	/* add up 32-bit words for 33 bits */
   18.25 +	x = (x & 0xffffffff) + (x >> 32);
   18.26 +	/* add up 16-bit and 17-bit words for 17+c bits */
   18.27 +	x = (x & 0xffff) + (x >> 16);
   18.28 +	/* add up 16-bit and 2-bit for 16+c bit */
   18.29 +	x = (x & 0xffff) + (x >> 16);
   18.30 +	/* add up carry.. */
   18.31 +	x = (x & 0xffff) + (x >> 16);
   18.32 +	return x;
   18.33 +}
   18.34 +
   18.35 +/*
   18.36 + * computes the checksum of the TCP/UDP pseudo-header
   18.37 + * returns a 16-bit checksum, already complemented.
   18.38 + */
   18.39 +unsigned short int
   18.40 +csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
   18.41 +		   unsigned short proto, unsigned int sum)
   18.42 +{
   18.43 +	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
   18.44 +			   ((unsigned long) proto << 8));
   18.45 +}
   18.46 +
   18.47 +EXPORT_SYMBOL(csum_tcpudp_magic);
   18.48 +
   18.49 +unsigned int
   18.50 +csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
   18.51 +		    unsigned short proto, unsigned int sum)
   18.52 +{
   18.53 +	unsigned long result;
   18.54 +
   18.55 +	result = (saddr + daddr + sum +
   18.56 +		  ((unsigned long) ntohs(len) << 16) +
   18.57 +		  ((unsigned long) proto << 8));
   18.58 +
   18.59 +	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
   18.60 +	/* 64 to 33 */
   18.61 +	result = (result & 0xffffffff) + (result >> 32);
   18.62 +	/* 33 to 32 */
   18.63 +	result = (result & 0xffffffff) + (result >> 32);
   18.64 +	return result;
   18.65 +}
   18.66 +
   18.67 +extern unsigned long do_csum (const unsigned char *, long);
   18.68 +
   18.69 +/*
   18.70 + * computes the checksum of a memory block at buff, length len,
   18.71 + * and adds in "sum" (32-bit)
   18.72 + *
   18.73 + * returns a 32-bit number suitable for feeding into itself
   18.74 + * or csum_tcpudp_magic
   18.75 + *
   18.76 + * this function must be called with even lengths, except
   18.77 + * for the last fragment, which may be odd
   18.78 + *
   18.79 + * it's best to have buff aligned on a 32-bit boundary
   18.80 + */
   18.81 +unsigned int
   18.82 +csum_partial (const unsigned char * buff, int len, unsigned int sum)
   18.83 +{
   18.84 +	unsigned long result = do_csum(buff, len);
   18.85 +
   18.86 +	/* add in old sum, and carry.. */
   18.87 +	result += sum;
   18.88 +	/* 32+c bits -> 32 bits */
   18.89 +	result = (result & 0xffffffff) + (result >> 32);
   18.90 +	return result;
   18.91 +}
   18.92 +
   18.93 +EXPORT_SYMBOL(csum_partial);
   18.94 +
   18.95 +/*
   18.96 + * this routine is used for miscellaneous IP-like checksums, mainly
   18.97 + * in icmp.c
   18.98 + */
   18.99 +unsigned short
  18.100 +ip_compute_csum (unsigned char * buff, int len)
  18.101 +{
  18.102 +	return ~do_csum(buff,len);
  18.103 +}
  18.104 +
  18.105 +EXPORT_SYMBOL(ip_compute_csum);
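Editor's note: the heart of these routines is folding a wide one's-complement sum down to 16 bits with end-around carries, as from64to16() above does. A small standalone C illustration of that fold (not the kernel API):

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t fold64to16(uint64_t x)
    {
            x = (x & 0xffffffffULL) + (x >> 32);    /* 64 -> at most 33 bits */
            x = (x & 0xffff) + (x >> 16);           /* keep folding the carries ... */
            x = (x & 0xffff) + (x >> 16);
            x = (x & 0xffff) + (x >> 16);           /* ... until they are all absorbed */
            return (uint16_t)x;
    }

    int main(void)
    {
            /* 0xffff + 0x0002 overflows 16 bits; the carry wraps around end-around. */
            printf("0x%04x\n", fold64to16(0xffffULL + 0x0002ULL));  /* prints 0x0002 */
            return 0;
    }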
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/xen/arch/ia64/linux/lib/clear_page.S	Tue Aug 02 15:59:09 2005 -0800
    19.3 @@ -0,0 +1,77 @@
    19.4 +/*
    19.5 + * Copyright (C) 1999-2002 Hewlett-Packard Co
    19.6 + *	Stephane Eranian <eranian@hpl.hp.com>
    19.7 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    19.8 + * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
    19.9 + *
   19.10 + * 1/06/01 davidm	Tuned for Itanium.
   19.11 + * 2/12/02 kchen	Tuned for both Itanium and McKinley
   19.12 + * 3/08/02 davidm	Some more tweaking
   19.13 + */
   19.14 +#include <linux/config.h>
   19.15 +
   19.16 +#include <asm/asmmacro.h>
   19.17 +#include <asm/page.h>
   19.18 +
   19.19 +#ifdef CONFIG_ITANIUM
   19.20 +# define L3_LINE_SIZE	64	// Itanium L3 line size
   19.21 +# define PREFETCH_LINES	9	// magic number
   19.22 +#else
   19.23 +# define L3_LINE_SIZE	128	// McKinley L3 line size
   19.24 +# define PREFETCH_LINES	12	// magic number
   19.25 +#endif
   19.26 +
   19.27 +#define saved_lc	r2
   19.28 +#define dst_fetch	r3
   19.29 +#define dst1		r8
   19.30 +#define dst2		r9
   19.31 +#define dst3		r10
   19.32 +#define dst4		r11
   19.33 +
   19.34 +#define dst_last	r31
   19.35 +
   19.36 +GLOBAL_ENTRY(clear_page)
   19.37 +	.prologue
   19.38 +	.regstk 1,0,0,0
   19.39 +	mov r16 = PAGE_SIZE/L3_LINE_SIZE-1	// main loop count, -1=repeat/until
   19.40 +	.save ar.lc, saved_lc
   19.41 +	mov saved_lc = ar.lc
   19.42 +
   19.43 +	.body
   19.44 +	mov ar.lc = (PREFETCH_LINES - 1)
   19.45 +	mov dst_fetch = in0
   19.46 +	adds dst1 = 16, in0
   19.47 +	adds dst2 = 32, in0
   19.48 +	;;
   19.49 +.fetch:	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
   19.50 +	adds dst3 = 48, in0		// executing this multiple times is harmless
   19.51 +	br.cloop.sptk.few .fetch
   19.52 +	;;
   19.53 +	addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
   19.54 +	mov ar.lc = r16			// one L3 line per iteration
   19.55 +	adds dst4 = 64, in0
   19.56 +	;;
   19.57 +#ifdef CONFIG_ITANIUM
   19.58 +	// Optimized for Itanium
   19.59 +1:	stf.spill.nta [dst1] = f0, 64
   19.60 +	stf.spill.nta [dst2] = f0, 64
   19.61 +	cmp.lt p8,p0=dst_fetch, dst_last
   19.62 +	;;
   19.63 +#else
   19.64 +	// Optimized for McKinley
   19.65 +1:	stf.spill.nta [dst1] = f0, 64
   19.66 +	stf.spill.nta [dst2] = f0, 64
   19.67 +	stf.spill.nta [dst3] = f0, 64
   19.68 +	stf.spill.nta [dst4] = f0, 128
   19.69 +	cmp.lt p8,p0=dst_fetch, dst_last
   19.70 +	;;
   19.71 +	stf.spill.nta [dst1] = f0, 64
   19.72 +	stf.spill.nta [dst2] = f0, 64
   19.73 +#endif
   19.74 +	stf.spill.nta [dst3] = f0, 64
   19.75 +(p8)	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
   19.76 +	br.cloop.sptk.few 1b
   19.77 +	;;
   19.78 +	mov ar.lc = saved_lc		// restore lc
   19.79 +	br.ret.sptk.many rp
   19.80 +END(clear_page)
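Editor's note: the structure of clear_page above is: prime a window of PREFETCH_LINES cache lines, then clear one L3 line per iteration while the prefetch pointer keeps running ahead towards the end of the page. The following is a loose C sketch of that rolling-window idea only; it uses the GCC __builtin_prefetch builtin, memset, and an assumed 16 KB page size, and does not reproduce the exact stf.spill store pattern of the assembly.

    #include <string.h>

    #define PAGE_SIZE       16384           /* assumed; IA64 supports several sizes */
    #define L3_LINE_SIZE    64              /* Itanium value from the file above */
    #define PREFETCH_LINES  9

    static void clear_page_sketch(void *page)
    {
            char *dst;
            char *fetch = page;
            char *end = (char *)page + PAGE_SIZE;
            int i;

            /* Prime the window: touch the first PREFETCH_LINES lines. */
            for (i = 0; i < PREFETCH_LINES; i++, fetch += L3_LINE_SIZE)
                    __builtin_prefetch(fetch, 1);

            /* Clear one line per iteration; keep the prefetch pointer running
             * PREFETCH_LINES lines ahead until it falls off the end of the page. */
            for (dst = page; dst < end; dst += L3_LINE_SIZE) {
                    memset(dst, 0, L3_LINE_SIZE);
                    if (fetch < end) {
                            __builtin_prefetch(fetch, 1);
                            fetch += L3_LINE_SIZE;
                    }
            }
    }

    int main(void)
    {
            static char page[PAGE_SIZE];
            clear_page_sketch(page);
            return page[PAGE_SIZE - 1];     /* 0 */
    }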
    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/xen/arch/ia64/linux/lib/clear_user.S	Tue Aug 02 15:59:09 2005 -0800
    20.3 @@ -0,0 +1,209 @@
    20.4 +/*
    20.5 + * This routine clears to zero a linear memory buffer in user space.
    20.6 + *
    20.7 + * Inputs:
    20.8 + *	in0:	address of buffer
    20.9 + *	in1:	length of buffer in bytes
   20.10 + * Outputs:
   20.11 + *	r8:	number of bytes that didn't get cleared due to a fault
   20.12 + *
   20.13 + * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
   20.14 + *	Stephane Eranian <eranian@hpl.hp.com>
   20.15 + */
   20.16 +
   20.17 +#include <asm/asmmacro.h>
   20.18 +
   20.19 +//
   20.20 +// arguments
   20.21 +//
   20.22 +#define buf		r32
   20.23 +#define len		r33
   20.24 +
   20.25 +//
   20.26 +// local registers
   20.27 +//
   20.28 +#define cnt		r16
   20.29 +#define buf2		r17
   20.30 +#define saved_lc	r18
   20.31 +#define saved_pfs	r19
   20.32 +#define tmp		r20
   20.33 +#define len2		r21
   20.34 +#define len3		r22
   20.35 +
   20.36 +//
   20.37 +// Theory of operations:
   20.38 +//	- we check whether or not the buffer is small, i.e., less than 17
   20.39 +//	  in which case we do the byte by byte loop.
   20.40 +//
   20.41 +//	- Otherwise we go progressively from 1 byte store to 8byte store in
    20.42 +//	  the head part, the body is a 16byte store loop and we finish with the
   20.43 +//	  tail for the last 15 bytes.
   20.44 +//	  The good point about this breakdown is that the long buffer handling
   20.45 +//	  contains only 2 branches.
   20.46 +//
   20.47 +//	The reason for not using shifting & masking for both the head and the
   20.48 +//	tail is to stay semantically correct. This routine is not supposed
   20.49 +//	to write bytes outside of the buffer. While most of the time this would
   20.50 +//	be ok, we can't tolerate a mistake. A classical example is the case
    20.51 +//	of multithreaded code where the extra bytes touched are actually owned
   20.52 +//	by another thread which runs concurrently to ours. Another, less likely,
   20.53 +//	example is with device drivers where reading an I/O mapped location may
   20.54 +//	have side effects (same thing for writing).
   20.55 +//
   20.56 +
   20.57 +GLOBAL_ENTRY(__do_clear_user)
   20.58 +	.prologue
   20.59 +	.save ar.pfs, saved_pfs
   20.60 +	alloc	saved_pfs=ar.pfs,2,0,0,0
   20.61 +	cmp.eq p6,p0=r0,len		// check for zero length
   20.62 +	.save ar.lc, saved_lc
   20.63 +	mov saved_lc=ar.lc		// preserve ar.lc (slow)
   20.64 +	.body
   20.65 +	;;				// avoid WAW on CFM
   20.66 +	adds tmp=-1,len			// br.ctop is repeat/until
   20.67 +	mov ret0=len			// return value is length at this point
   20.68 +(p6)	br.ret.spnt.many rp
   20.69 +	;;
   20.70 +	cmp.lt p6,p0=16,len		// if len > 16 then long memset
   20.71 +	mov ar.lc=tmp			// initialize lc for small count
   20.72 +(p6)	br.cond.dptk .long_do_clear
   20.73 +	;;				// WAR on ar.lc
   20.74 +	//
   20.75 +	// worst case 16 iterations, avg 8 iterations
   20.76 +	//
   20.77 +	// We could have played with the predicates to use the extra
    20.78 +	// M slot for 2 stores/iteration but the cost of initializing
    20.79 +	// the various counters, compared to how long the loop is supposed
    20.80 +	// to last on average, does not make this solution viable.
   20.81 +	//
   20.82 +1:
   20.83 +	EX( .Lexit1, st1 [buf]=r0,1 )
   20.84 +	adds len=-1,len			// countdown length using len
   20.85 +	br.cloop.dptk 1b
   20.86 +	;;				// avoid RAW on ar.lc
   20.87 +	//
    20.88 +	// .Lexit1: comes from byte by byte loop
   20.89 +	//	    len contains bytes left
   20.90 +.Lexit1:
   20.91 +	mov ret0=len			// faster than using ar.lc
   20.92 +	mov ar.lc=saved_lc
   20.93 +	br.ret.sptk.many rp		// end of short clear_user
   20.94 +
   20.95 +
   20.96 +	//
   20.97 +	// At this point we know we have more than 16 bytes to copy
   20.98 +	// so we focus on alignment (no branches required)
   20.99 +	//
  20.100 +	// The use of len/len2 for countdown of the number of bytes left
  20.101 +	// instead of ret0 is due to the fact that the exception code
  20.102 +	// changes the values of r8.
  20.103 +	//
  20.104 +.long_do_clear:
  20.105 +	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
  20.106 +	;;
  20.107 +	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
  20.108 +(p6)	adds len=-1,len;;		// sync because buf is modified
  20.109 +	tbit.nz p6,p0=buf,1
  20.110 +	;;
  20.111 +	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
  20.112 +(p6)	adds len=-2,len;;
  20.113 +	tbit.nz p6,p0=buf,2
  20.114 +	;;
  20.115 +	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
  20.116 +(p6)	adds len=-4,len;;
  20.117 +	tbit.nz p6,p0=buf,3
  20.118 +	;;
  20.119 +	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
  20.120 +(p6)	adds len=-8,len;;
  20.121 +	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
  20.122 +	;;
  20.123 +	cmp.eq p6,p0=r0,cnt
  20.124 +	adds tmp=-1,cnt
  20.125 +(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
  20.126 +	;;
  20.127 +	adds buf2=8,buf			// setup second base pointer
  20.128 +	mov ar.lc=tmp
  20.129 +	;;
  20.130 +
  20.131 +	//
  20.132 +	// 16bytes/iteration core loop
  20.133 +	//
  20.134 +	// The second store can never generate a fault because
  20.135 +	// we come into the loop only when we are 16-byte aligned.
  20.136 +	// This means that if we cross a page then it will always be
  20.137 +	// in the first store and never in the second.
  20.138 +	//
  20.139 +	//
  20.140 +	// We need to keep track of the remaining length. A possible (optimistic)
   20.141 +	// way would be to use ar.lc and derive how many bytes were left by
   20.142 +	// doing: left = 16*ar.lc + 16.  This would avoid the addition at
  20.143 +	// every iteration.
  20.144 +	// However we need to keep the synchronization point. A template
  20.145 +	// M;;MB does not exist and thus we can keep the addition at no
  20.146 +	// extra cycle cost (use a nop slot anyway). It also simplifies the
  20.147 +	// (unlikely)  error recovery code
  20.148 +	//
  20.149 +
  20.150 +2:	EX(.Lexit3, st8 [buf]=r0,16 )
  20.151 +	;;				// needed to get len correct when error
  20.152 +	st8 [buf2]=r0,16
  20.153 +	adds len=-16,len
  20.154 +	br.cloop.dptk 2b
  20.155 +	;;
  20.156 +	mov ar.lc=saved_lc
  20.157 +	//
  20.158 +	// tail correction based on len only
  20.159 +	//
  20.160 +	// We alternate the use of len3,len2 to allow parallelism and correct
  20.161 +	// error handling. We also reuse p6/p7 to return correct value.
  20.162 +	// The addition of len2/len3 does not cost anything more compared to
  20.163 +	// the regular memset as we had empty slots.
  20.164 +	//
  20.165 +.dotail:
  20.166 +	mov len2=len			// for parallelization of error handling
  20.167 +	mov len3=len
  20.168 +	tbit.nz p6,p0=len,3
  20.169 +	;;
  20.170 +	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
  20.171 +(p6)	adds len3=-8,len2
  20.172 +	tbit.nz p7,p6=len,2
  20.173 +	;;
  20.174 +	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
  20.175 +(p7)	adds len2=-4,len3
  20.176 +	tbit.nz p6,p7=len,1
  20.177 +	;;
  20.178 +	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
  20.179 +(p6)	adds len3=-2,len2
  20.180 +	tbit.nz p7,p6=len,0
  20.181 +	;;
  20.182 +	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
  20.183 +	mov ret0=r0				// success
  20.184 +	br.ret.sptk.many rp			// end of most likely path
  20.185 +
  20.186 +	//
  20.187 +	// Outlined error handling code
  20.188 +	//
  20.189 +
  20.190 +	//
  20.191 +	// .Lexit3: comes from core loop, need restore pr/lc
  20.192 +	//	    len contains bytes left
  20.193 +	//
  20.194 +	//
  20.195 +	// .Lexit2:
  20.196 +	//	if p6 -> coming from st8 or st2 : len2 contains what's left
  20.197 +	//	if p7 -> coming from st4 or st1 : len3 contains what's left
   20.198 +	// We must restore lc/pr even though they might not have been used.
  20.199 +.Lexit2:
  20.200 +	.pred.rel "mutex", p6, p7
  20.201 +(p6)	mov len=len2
  20.202 +(p7)	mov len=len3
  20.203 +	;;
  20.204 +	//
   20.205 +	// .Lexit3: comes from head, need not restore pr/lc
  20.206 +	//	    len contains bytes left
  20.207 +	//
  20.208 +.Lexit3:
  20.209 +	mov ret0=len
  20.210 +	mov ar.lc=saved_lc
  20.211 +	br.ret.sptk.many rp
  20.212 +END(__do_clear_user)
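Editor's note: stripped of the EX()-based fault handling, the head/body/tail structure described in the theory-of-operations comment of __do_clear_user looks roughly like the standalone C below. It is an illustration of the control structure only (real callers would simply use memset); clear_buffer is a made-up name.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static void clear_buffer(void *buf, size_t len)
    {
            uint8_t *p = buf;

            if (len <= 16) {                        /* short case: byte-by-byte loop */
                    while (len--)
                            *p++ = 0;
                    return;
            }
            /* head: 1/2/4/8-byte stores until p is 8-byte (hence 16-byte body) aligned */
            if ((uintptr_t)p & 1) { *p = 0;          p += 1; len -= 1; }
            if ((uintptr_t)p & 2) { memset(p, 0, 2); p += 2; len -= 2; }
            if ((uintptr_t)p & 4) { memset(p, 0, 4); p += 4; len -= 4; }
            if ((uintptr_t)p & 8) { memset(p, 0, 8); p += 8; len -= 8; }
            /* body: 16 bytes per iteration (two 8-byte stores in the assembly) */
            while (len >= 16) { memset(p, 0, 16); p += 16; len -= 16; }
            /* tail: at most 15 bytes left, stepping back down 8/4/2/1 */
            if (len & 8) { memset(p, 0, 8); p += 8; }
            if (len & 4) { memset(p, 0, 4); p += 4; }
            if (len & 2) { memset(p, 0, 2); p += 2; }
            if (len & 1) { *p = 0; }
    }

    int main(void)
    {
            char buf[64];
            memset(buf, 0xaa, sizeof buf);
            clear_buffer(buf + 3, 37);              /* misaligned start, odd length */
            printf("%d %d\n", buf[2] & 0xff, buf[3]);  /* 170 0: byte before kept, first byte cleared */
            return 0;
    }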
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/xen/arch/ia64/linux/lib/copy_page.S	Tue Aug 02 15:59:09 2005 -0800
    21.3 @@ -0,0 +1,98 @@
    21.4 +/*
    21.5 + *
    21.6 + * Optimized version of the standard copy_page() function
    21.7 + *
    21.8 + * Inputs:
    21.9 + *	in0:	address of target page
   21.10 + *	in1:	address of source page
   21.11 + * Output:
   21.12 + *	no return value
   21.13 + *
   21.14 + * Copyright (C) 1999, 2001 Hewlett-Packard Co
   21.15 + *	Stephane Eranian <eranian@hpl.hp.com>
   21.16 + *	David Mosberger <davidm@hpl.hp.com>
   21.17 + *
   21.18 + * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
   21.19 + */
   21.20 +#include <asm/asmmacro.h>
   21.21 +#include <asm/page.h>
   21.22 +
   21.23 +#define PIPE_DEPTH	3
   21.24 +#define EPI		p[PIPE_DEPTH-1]
   21.25 +
   21.26 +#define lcount		r16
   21.27 +#define saved_pr	r17
   21.28 +#define saved_lc	r18
   21.29 +#define saved_pfs	r19
   21.30 +#define src1		r20
   21.31 +#define src2		r21
   21.32 +#define tgt1		r22
   21.33 +#define tgt2		r23
   21.34 +#define srcf		r24
   21.35 +#define tgtf		r25
   21.36 +#define tgt_last	r26
   21.37 +
   21.38 +#define Nrot		((8*PIPE_DEPTH+7)&~7)
   21.39 +
   21.40 +GLOBAL_ENTRY(copy_page)
   21.41 +	.prologue
   21.42 +	.save ar.pfs, saved_pfs
   21.43 +	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
   21.44 +
   21.45 +	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
   21.46 +	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
   21.47 +	.rotp p[PIPE_DEPTH]
   21.48 +
   21.49 +	.save ar.lc, saved_lc
   21.50 +	mov saved_lc=ar.lc
   21.51 +	mov ar.ec=PIPE_DEPTH
   21.52 +
   21.53 +	mov lcount=PAGE_SIZE/64-1
   21.54 +	.save pr, saved_pr
   21.55 +	mov saved_pr=pr
   21.56 +	mov pr.rot=1<<16
   21.57 +
   21.58 +	.body
   21.59 +
   21.60 +	mov src1=in1
   21.61 +	adds src2=8,in1
   21.62 +	mov tgt_last = PAGE_SIZE
   21.63 +	;;
   21.64 +	adds tgt2=8,in0
   21.65 +	add srcf=512,in1
   21.66 +	mov ar.lc=lcount
   21.67 +	mov tgt1=in0
   21.68 +	add tgtf=512,in0
   21.69 +	add tgt_last = tgt_last, in0
   21.70 +	;;
   21.71 +1:
   21.72 +(p[0])	ld8 t1[0]=[src1],16
   21.73 +(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
   21.74 +(p[0])	ld8 t2[0]=[src2],16
   21.75 +(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
   21.76 +	cmp.ltu p6,p0 = tgtf, tgt_last
   21.77 +	;;
   21.78 +(p[0])	ld8 t3[0]=[src1],16
   21.79 +(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
   21.80 +(p[0])	ld8 t4[0]=[src2],16
   21.81 +(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
   21.82 +	;;
   21.83 +(p[0])	ld8 t5[0]=[src1],16
   21.84 +(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
   21.85 +(p[0])	ld8 t6[0]=[src2],16
   21.86 +(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
   21.87 +	;;
   21.88 +(p[0])	ld8 t7[0]=[src1],16
   21.89 +(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
   21.90 +(p[0])	ld8 t8[0]=[src2],16
   21.91 +(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
   21.92 +
   21.93 +(p6)	lfetch [srcf], 64
   21.94 +(p6)	lfetch [tgtf], 64
   21.95 +	br.ctop.sptk.few 1b
   21.96 +	;;
   21.97 +	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
   21.98 +	mov ar.pfs=saved_pfs
   21.99 +	mov ar.lc=saved_lc
  21.100 +	br.ret.sptk.many rp
  21.101 +END(copy_page)
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/xen/arch/ia64/linux/lib/copy_page_mck.S	Tue Aug 02 15:59:09 2005 -0800
    22.3 @@ -0,0 +1,185 @@
    22.4 +/*
    22.5 + * McKinley-optimized version of copy_page().
    22.6 + *
    22.7 + * Copyright (C) 2002 Hewlett-Packard Co
    22.8 + *	David Mosberger <davidm@hpl.hp.com>
    22.9 + *
   22.10 + * Inputs:
   22.11 + *	in0:	address of target page
   22.12 + *	in1:	address of source page
   22.13 + * Output:
   22.14 + *	no return value
   22.15 + *
   22.16 + * General idea:
   22.17 + *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
   22.18 + *	  lfetches => good for in-cache performance
   22.19 + *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
   22.20 + *	  cycle
   22.21 + *
   22.22 + * Principle of operation:
   22.23 + *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
   22.24 + *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
   22.25 + *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
   22.26 + *	source line is in L1, we start copying the remaining words.  The second half of the
   22.27 + *	source line is prefetched in an earlier iteration, so that by the time we start
   22.28 + *	accessing it, it's also present in the L1.
   22.29 + *
   22.30 + *	We use a software-pipelined loop to control the overall operation.  The pipeline
   22.31 + *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
   22.32 + *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
   22.33 + *	cache-lines, the last K stages are used to copy the cache-line words not copied by
    22.34 + *	the prefetches.  The four relevant points in the pipelined loop are called A, B, C, D:
   22.35 + *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
   22.36 + *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
   22.37 + *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
   22.38 + *
   22.39 + *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
   22.40 + *	the resulting code is very regular and quite easy to follow (once you get the idea).
   22.41 + *
   22.42 + *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
   22.43 + *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
   22.44 + *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
   22.45 + *	so that each loop iteration is faster (again, good for cached case).
   22.46 + *
   22.47 + *	When reading the code, it helps to keep the following picture in mind:
   22.48 + *
   22.49 + *	       word 0 word 1
   22.50 + *            +------+------+---
   22.51 + *	      |	v[x] | 	t1  | ^
   22.52 + *	      |	t2   |	t3  | |
   22.53 + *	      |	t4   |	t5  | |
   22.54 + *	      |	t6   |	t7  | | 128 bytes
   22.55 + *     	      |	n[y] | 	t9  | |	(L2 cache line)
   22.56 + *	      |	t10  | 	t11 | |
   22.57 + *	      |	t12  | 	t13 | |
   22.58 + *	      |	t14  | 	t15 | v
   22.59 + *	      +------+------+---
   22.60 + *
   22.61 + *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
   22.62 + *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
   22.63 + *	an order that avoids bank conflicts.
   22.64 + */
   22.65 +#include <asm/asmmacro.h>
   22.66 +#include <asm/page.h>
   22.67 +
   22.68 +#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
   22.69 +
   22.70 +#define src0		r2
   22.71 +#define src1		r3
   22.72 +#define dst0		r9
   22.73 +#define dst1		r10
   22.74 +#define src_pre_mem	r11
   22.75 +#define dst_pre_mem	r14
   22.76 +#define src_pre_l2	r15
   22.77 +#define dst_pre_l2	r16
   22.78 +#define t1		r17
   22.79 +#define t2		r18
   22.80 +#define t3		r19
   22.81 +#define t4		r20
   22.82 +#define t5		t1	// alias!
   22.83 +#define t6		t2	// alias!
   22.84 +#define t7		t3	// alias!
   22.85 +#define t9		t5	// alias!
   22.86 +#define t10		t4	// alias!
   22.87 +#define t11		t7	// alias!
   22.88 +#define t12		t6	// alias!
   22.89 +#define t14		t10	// alias!
   22.90 +#define t13		r21
   22.91 +#define t15		r22
   22.92 +
   22.93 +#define saved_lc	r23
   22.94 +#define saved_pr	r24
   22.95 +
   22.96 +#define	A	0
   22.97 +#define B	(PREFETCH_DIST)
   22.98 +#define C	(B + PREFETCH_DIST)
   22.99 +#define D	(C + 3)
  22.100 +#define N	(D + 1)
  22.101 +#define Nrot	((N + 7) & ~7)
  22.102 +
  22.103 +GLOBAL_ENTRY(copy_page)
  22.104 +	.prologue
  22.105 +	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
  22.106 +
  22.107 +	.rotr v[2*PREFETCH_DIST], n[D-C+1]
  22.108 +	.rotp p[N]
  22.109 +
  22.110 +	.save ar.lc, saved_lc
  22.111 +	mov saved_lc = ar.lc
  22.112 +	.save pr, saved_pr
  22.113 +	mov saved_pr = pr
  22.114 +	.body
  22.115 +
  22.116 +	mov src_pre_mem = in1
  22.117 +	mov pr.rot = 0x10000
  22.118 +	mov ar.ec = 1				// special unrolled loop
  22.119 +
  22.120 +	mov dst_pre_mem = in0
  22.121 +	mov ar.lc = 2*PREFETCH_DIST - 1
  22.122 +
  22.123 +	add src_pre_l2 = 8*8, in1
  22.124 +	add dst_pre_l2 = 8*8, in0
  22.125 +	add src0 = 8, in1			// first t1 src
  22.126 +	add src1 = 3*8, in1			// first t3 src
  22.127 +	add dst0 = 8, in0			// first t1 dst
  22.128 +	add dst1 = 3*8, in0			// first t3 dst
  22.129 +	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
  22.130 +	nop.m 0
  22.131 +	nop.i 0
  22.132 +	;;
  22.133 +	// same as .line_copy loop, but with all predicated-off instructions removed:
  22.134 +.prefetch_loop:
  22.135 +(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
  22.136 +(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
  22.137 +	br.ctop.sptk .prefetch_loop
  22.138 +	;;
  22.139 +	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
  22.140 +	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
  22.141 +	mov ar.ec = N				// # of stages in pipeline
  22.142 +	;;
  22.143 +.line_copy:
  22.144 +(p[D])	ld8 t2 = [src0], 3*8			// M0
  22.145 +(p[D])	ld8 t4 = [src1], 3*8			// M1
  22.146 +(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
  22.147 +(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
  22.148 +	;;
  22.149 +(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
  22.150 +(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
  22.151 +(p[D])	st8 [dst0] =  t1, 8			// M2
  22.152 +(p[D])	st8 [dst1] =  t3, 8			// M3
  22.153 +	;;
  22.154 +(p[D])	ld8  t5 = [src0], 8
  22.155 +(p[D])	ld8  t7 = [src1], 3*8
  22.156 +(p[D])	st8 [dst0] =  t2, 3*8
  22.157 +(p[D])	st8 [dst1] =  t4, 3*8
  22.158 +	;;
  22.159 +(p[D])	ld8  t6 = [src0], 3*8
  22.160 +(p[D])	ld8 t10 = [src1], 8
  22.161 +(p[D])	st8 [dst0] =  t5, 8
  22.162 +(p[D])	st8 [dst1] =  t7, 3*8
  22.163 +	;;
  22.164 +(p[D])	ld8  t9 = [src0], 3*8
  22.165 +(p[D])	ld8 t11 = [src1], 3*8
  22.166 +(p[D])	st8 [dst0] =  t6, 3*8
  22.167 +(p[D])	st8 [dst1] = t10, 8
  22.168 +	;;
  22.169 +(p[D])	ld8 t12 = [src0], 8
  22.170 +(p[D])	ld8 t14 = [src1], 8
  22.171 +(p[D])	st8 [dst0] =  t9, 3*8
  22.172 +(p[D])	st8 [dst1] = t11, 3*8
  22.173 +	;;
  22.174 +(p[D])	ld8 t13 = [src0], 4*8
  22.175 +(p[D])	ld8 t15 = [src1], 4*8
  22.176 +(p[D])	st8 [dst0] = t12, 8
  22.177 +(p[D])	st8 [dst1] = t14, 8
  22.178 +	;;
  22.179 +(p[D-1])ld8  t1 = [src0], 8
  22.180 +(p[D-1])ld8  t3 = [src1], 8
  22.181 +(p[D])	st8 [dst0] = t13, 4*8
  22.182 +(p[D])	st8 [dst1] = t15, 4*8
  22.183 +	br.ctop.sptk .line_copy
  22.184 +	;;
  22.185 +	mov ar.lc = saved_lc
  22.186 +	mov pr = saved_pr, -1
  22.187 +	br.ret.sptk.many rp
  22.188 +END(copy_page)
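Editor's note: the stage points A, B, C, D above boil down to a schedule in which the source prefetch runs PREFETCH_DIST lines ahead of the destination prefetch, which in turn runs ahead of the actual copy. The scalar C sketch below mirrors only that scheduling idea; it collapses stages C and D into one memcpy per 128-byte line (the assembly's D stage lags A by 2*PREFETCH_DIST+3 and also pulls the second half of each L2 line into L1), uses the GCC __builtin_prefetch builtin, and assumes a 16 KB page.

    #include <string.h>
    #include <stdio.h>

    #define LINE           128                      /* L2 line size */
    #define PAGE_SIZE      16384                    /* assumed page size */
    #define LINES          (PAGE_SIZE / LINE)
    #define PREFETCH_DIST  8                        /* as in the assembly above */

    static void copy_page_sketch(void *dst, const void *src)
    {
            const char *s = src;
            char *d = dst;
            int lag = 2 * PREFETCH_DIST;            /* copy lags the source prefetch */

            /* The extra `lag` iterations at the end drain the pipeline. */
            for (int i = 0; i < LINES + lag; i++) {
                    if (i < LINES)                   /* stage A: prefetch a source line */
                            __builtin_prefetch(s + i * LINE, 0);
                    if (i >= PREFETCH_DIST && i - PREFETCH_DIST < LINES)
                            __builtin_prefetch(d + (i - PREFETCH_DIST) * LINE, 1);  /* stage B */
                    if (i >= lag)                    /* stages C/D: copy a line prefetched earlier */
                            memcpy(d + (i - lag) * LINE, s + (i - lag) * LINE, LINE);
            }
    }

    int main(void)
    {
            static char src[PAGE_SIZE], dst[PAGE_SIZE];
            memset(src, 0x5a, sizeof src);
            copy_page_sketch(dst, src);
            printf("%d\n", memcmp(src, dst, PAGE_SIZE));    /* prints 0 */
            return 0;
    }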
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/xen/arch/ia64/linux/lib/copy_user.S	Tue Aug 02 15:59:09 2005 -0800
    23.3 @@ -0,0 +1,610 @@
    23.4 +/*
    23.5 + *
    23.6 + * Optimized version of the copy_user() routine.
     23.7 + * It is used to copy data across the kernel/user boundary.
    23.8 + *
     23.9 + * The source and destination are always on opposite sides of
   23.10 + * the boundary. When reading from user space we must catch
   23.11 + * faults on loads. When writing to user space we must catch
   23.12 + * errors on stores. Note that because of the nature of the copy
   23.13 + * we don't need to worry about overlapping regions.
   23.14 + *
   23.15 + *
   23.16 + * Inputs:
   23.17 + *	in0	address of source buffer
   23.18 + *	in1	address of destination buffer
   23.19 + *	in2	number of bytes to copy
   23.20 + *
   23.21 + * Outputs:
   23.22 + *	ret0	0 in case of success. The number of bytes NOT copied in
   23.23 + *		case of error.
   23.24 + *
   23.25 + * Copyright (C) 2000-2001 Hewlett-Packard Co
   23.26 + *	Stephane Eranian <eranian@hpl.hp.com>
   23.27 + *
   23.28 + * Fixme:
    23.29 + *	- handle the case where we have more than 16 bytes and the alignments
   23.30 + *	  are different.
   23.31 + *	- more benchmarking
   23.32 + *	- fix extraneous stop bit introduced by the EX() macro.
   23.33 + */
   23.34 +
   23.35 +#include <asm/asmmacro.h>
   23.36 +
   23.37 +//
   23.38 +// Tuneable parameters
   23.39 +//
   23.40 +#define COPY_BREAK	16	// we do byte copy below (must be >=16)
   23.41 +#define PIPE_DEPTH	21	// pipe depth
   23.42 +
   23.43 +#define EPI		p[PIPE_DEPTH-1]
   23.44 +
   23.45 +//
   23.46 +// arguments
   23.47 +//
   23.48 +#define dst		in0
   23.49 +#define src		in1
   23.50 +#define len		in2
   23.51 +
   23.52 +//
   23.53 +// local registers
   23.54 +//
   23.55 +#define t1		r2	// rshift in bytes
   23.56 +#define t2		r3	// lshift in bytes
   23.57 +#define rshift		r14	// right shift in bits
   23.58 +#define lshift		r15	// left shift in bits
   23.59 +#define word1		r16
   23.60 +#define word2		r17
   23.61 +#define cnt		r18
   23.62 +#define len2		r19
   23.63 +#define saved_lc	r20
   23.64 +#define saved_pr	r21
   23.65 +#define tmp		r22
   23.66 +#define val		r23
   23.67 +#define src1		r24
   23.68 +#define dst1		r25
   23.69 +#define src2		r26
   23.70 +#define dst2		r27
   23.71 +#define len1		r28
   23.72 +#define enddst		r29
   23.73 +#define endsrc		r30
   23.74 +#define saved_pfs	r31
   23.75 +
   23.76 +GLOBAL_ENTRY(__copy_user)
   23.77 +	.prologue
   23.78 +	.save ar.pfs, saved_pfs
   23.79 +	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
   23.80 +
   23.81 +	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
   23.82 +	.rotp p[PIPE_DEPTH]
   23.83 +
   23.84 +	adds len2=-1,len	// br.ctop is repeat/until
   23.85 +	mov ret0=r0
   23.86 +
   23.87 +	;;			// RAW of cfm when len=0
   23.88 +	cmp.eq p8,p0=r0,len	// check for zero length
   23.89 +	.save ar.lc, saved_lc
   23.90 +	mov saved_lc=ar.lc	// preserve ar.lc (slow)
    23.91 +(p8)	br.ret.spnt.many rp	// empty memcpy()
   23.92 +	;;
    23.93 +	add enddst=dst,len	// first byte after end of destination
    23.94 +	add endsrc=src,len	// first byte after end of source
   23.95 +	.save pr, saved_pr
   23.96 +	mov saved_pr=pr		// preserve predicates
   23.97 +
   23.98 +	.body
   23.99 +
  23.100 +	mov dst1=dst		// copy because of rotation
  23.101 +	mov ar.ec=PIPE_DEPTH
  23.102 +	mov pr.rot=1<<16	// p16=true all others are false
  23.103 +
  23.104 +	mov src1=src		// copy because of rotation
  23.105 +	mov ar.lc=len2		// initialize lc for small count
  23.106 +	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
  23.107 +
  23.108 +	xor tmp=src,dst		// same alignment test prepare
  23.109 +(p10)	br.cond.dptk .long_copy_user
  23.110 +	;;			// RAW pr.rot/p16 ?
  23.111 +	//
  23.112 +	// Now we do the byte by byte loop with software pipeline
  23.113 +	//
  23.114 +	// p7 is necessarily false by now
  23.115 +1:
  23.116 +	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  23.117 +	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  23.118 +	br.ctop.dptk.few 1b
  23.119 +	;;
  23.120 +	mov ar.lc=saved_lc
  23.121 +	mov pr=saved_pr,0xffffffffffff0000
  23.122 +	mov ar.pfs=saved_pfs		// restore ar.ec
  23.123 +	br.ret.sptk.many rp		// end of short memcpy
  23.124 +
  23.125 +	//
  23.126 +	// Not 8-byte aligned
  23.127 +	//
  23.128 +.diff_align_copy_user:
  23.129 +	// At this point we know we have more than 16 bytes to copy
  23.130 +	// and also that src and dest do _not_ have the same alignment.
  23.131 +	and src2=0x7,src1				// src offset
  23.132 +	and dst2=0x7,dst1				// dst offset
  23.133 +	;;
  23.134 +	// The basic idea is that we copy byte-by-byte at the head so
  23.135 +	// that we can reach 8-byte alignment for both src1 and dst1.
  23.136 +	// Then copy the body using software pipelined 8-byte copy,
  23.137 +	// shifting the two back-to-back words right and left, then copy
  23.138 +	// the tail by copying byte-by-byte.
  23.139 +	//
  23.140 +	// Fault handling. If the byte-by-byte at the head fails on the
   23.141 +	// load, then restart and finish the pipeline by copying zeros
  23.142 +	// to the dst1. Then copy zeros for the rest of dst1.
  23.143 +	// If 8-byte software pipeline fails on the load, do the same as
  23.144 +	// failure_in3 does. If the byte-by-byte at the tail fails, it is
  23.145 +	// handled simply by failure_in_pipe1.
  23.146 +	//
   23.147 +	// The case p14 means the source has more bytes in the first
   23.148 +	// word (by the shifted part), whereas p15 needs to copy some
   23.149 +	// bytes from the 2nd word of the source to fill the tail of
   23.150 +	// the 1st word of the destination.
  23.151 +	//
  23.152 +
  23.153 +	//
  23.154 +	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
  23.155 +	// to copy the head to dst1, to start 8-byte copy software pipeline.
  23.156 +	// We know src1 is not 8-byte aligned in this case.
  23.157 +	//
  23.158 +	cmp.eq p14,p15=r0,dst2
  23.159 +(p15)	br.cond.spnt 1f
  23.160 +	;;
  23.161 +	sub t1=8,src2
  23.162 +	mov t2=src2
  23.163 +	;;
  23.164 +	shl rshift=t2,3
  23.165 +	sub len1=len,t1					// set len1
  23.166 +	;;
  23.167 +	sub lshift=64,rshift
  23.168 +	;;
  23.169 +	br.cond.spnt .word_copy_user
  23.170 +	;;
  23.171 +1:
  23.172 +	cmp.leu	p14,p15=src2,dst2
  23.173 +	sub t1=dst2,src2
  23.174 +	;;
  23.175 +	.pred.rel "mutex", p14, p15
  23.176 +(p14)	sub word1=8,src2				// (8 - src offset)
  23.177 +(p15)	sub t1=r0,t1					// absolute value
  23.178 +(p15)	sub word1=8,dst2				// (8 - dst offset)
  23.179 +	;;
  23.180 +	// For the case p14, we don't need to copy the shifted part to
  23.181 +	// the 1st word of destination.
  23.182 +	sub t2=8,t1
  23.183 +(p14)	sub word1=word1,t1
  23.184 +	;;
  23.185 +	sub len1=len,word1				// resulting len
  23.186 +(p15)	shl rshift=t1,3					// in bits
  23.187 +(p14)	shl rshift=t2,3
  23.188 +	;;
  23.189 +(p14)	sub len1=len1,t1
  23.190 +	adds cnt=-1,word1
  23.191 +	;;
  23.192 +	sub lshift=64,rshift
  23.193 +	mov ar.ec=PIPE_DEPTH
  23.194 +	mov pr.rot=1<<16	// p16=true all others are false
  23.195 +	mov ar.lc=cnt
  23.196 +	;;
  23.197 +2:
  23.198 +	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
  23.199 +	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  23.200 +	br.ctop.dptk.few 2b
  23.201 +	;;
  23.202 +	clrrrb
  23.203 +	;;
  23.204 +.word_copy_user:
  23.205 +	cmp.gtu p9,p0=16,len1
  23.206 +(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
  23.207 +	;;
  23.208 +	shr.u cnt=len1,3		// number of 64-bit words
  23.209 +	;;
  23.210 +	adds cnt=-1,cnt
  23.211 +	;;
  23.212 +	.pred.rel "mutex", p14, p15
  23.213 +(p14)	sub src1=src1,t2
  23.214 +(p15)	sub src1=src1,t1
  23.215 +	//
  23.216 +	// Now both src1 and dst1 point to an 8-byte aligned address. And
  23.217 +	// we have more than 8 bytes to copy.
  23.218 +	//
  23.219 +	mov ar.lc=cnt
  23.220 +	mov ar.ec=PIPE_DEPTH
  23.221 +	mov pr.rot=1<<16	// p16=true all others are false
  23.222 +	;;
  23.223 +3:
  23.224 +	//
   23.225 +	// The pipeline consists of 3 stages:
  23.226 +	// 1 (p16):	Load a word from src1
  23.227 +	// 2 (EPI_1):	Shift right pair, saving to tmp
  23.228 +	// 3 (EPI):	Store tmp to dst1
  23.229 +	//
  23.230 +	// To make it simple, use at least 2 (p16) loops to set up val1[n]
  23.231 +	// because we need 2 back-to-back val1[] to get tmp.
  23.232 +	// Note that this implies EPI_2 must be p18 or greater.
  23.233 +	//
  23.234 +
  23.235 +#define EPI_1		p[PIPE_DEPTH-2]
  23.236 +#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
  23.237 +#define CASE(pred, shift)	\
  23.238 +	(pred)	br.cond.spnt .copy_user_bit##shift
  23.239 +#define BODY(rshift)						\
  23.240 +.copy_user_bit##rshift:						\
  23.241 +1:								\
  23.242 +	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
  23.243 +(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
  23.244 +	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
  23.245 +(p16)	mov val1[0]=r0;						\
  23.246 +	br.ctop.dptk 1b;					\
  23.247 +	;;							\
  23.248 +	br.cond.sptk.many .diff_align_do_tail;			\
  23.249 +2:								\
  23.250 +(EPI)	st8 [dst1]=tmp,8;					\
  23.251 +(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
  23.252 +3:								\
  23.253 +(p16)	mov val1[1]=r0;						\
  23.254 +(p16)	mov val1[0]=r0;						\
  23.255 +	br.ctop.dptk 2b;					\
  23.256 +	;;							\
  23.257 +	br.cond.sptk.many .failure_in2
  23.258 +
  23.259 +	//
   23.260 +	// Since the instruction 'shrp' takes its shift count as an
   23.261 +	// immediate (it cannot come from a register), we need to provide
   23.262 +	// 7 cases below.
  23.263 +	//
  23.264 +	SWITCH(p6, 8)
  23.265 +	SWITCH(p7, 16)
  23.266 +	SWITCH(p8, 24)
  23.267 +	SWITCH(p9, 32)
  23.268 +	SWITCH(p10, 40)
  23.269 +	SWITCH(p11, 48)
  23.270 +	SWITCH(p12, 56)
  23.271 +	;;
  23.272 +	CASE(p6, 8)
  23.273 +	CASE(p7, 16)
  23.274 +	CASE(p8, 24)
  23.275 +	CASE(p9, 32)
  23.276 +	CASE(p10, 40)
  23.277 +	CASE(p11, 48)
  23.278 +	CASE(p12, 56)
  23.279 +	;;
  23.280 +	BODY(8)
  23.281 +	BODY(16)
  23.282 +	BODY(24)
  23.283 +	BODY(32)
  23.284 +	BODY(40)
  23.285 +	BODY(48)
  23.286 +	BODY(56)
  23.287 +	;;
  23.288 +.diff_align_do_tail:
  23.289 +	.pred.rel "mutex", p14, p15
  23.290 +(p14)	sub src1=src1,t1
  23.291 +(p14)	adds dst1=-8,dst1
  23.292 +(p15)	sub dst1=dst1,t1
  23.293 +	;;
  23.294 +4:
  23.295 +	// Tail correction.
  23.296 +	//
   23.297 +	// The problem with this pipelined loop is that the last word is not
   23.298 +	// loaded and thus part of the last word written is not correct.
  23.299 +	// To fix that, we simply copy the tail byte by byte.
  23.300 +
  23.301 +	sub len1=endsrc,src1,1
  23.302 +	clrrrb
  23.303 +	;;
  23.304 +	mov ar.ec=PIPE_DEPTH
  23.305 +	mov pr.rot=1<<16	// p16=true all others are false
  23.306 +	mov ar.lc=len1
  23.307 +	;;
  23.308 +5:
  23.309 +	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  23.310 +	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  23.311 +	br.ctop.dptk.few 5b
  23.312 +	;;
  23.313 +	mov ar.lc=saved_lc
  23.314 +	mov pr=saved_pr,0xffffffffffff0000
  23.315 +	mov ar.pfs=saved_pfs
  23.316 +	br.ret.sptk.many rp
  23.317 +
  23.318 +	//
   23.319 +	// Beginning of long memcpy (i.e. > 16 bytes)
  23.320 +	//
  23.321 +.long_copy_user:
  23.322 +	tbit.nz p6,p7=src1,0	// odd alignment
  23.323 +	and tmp=7,tmp
  23.324 +	;;
  23.325 +	cmp.eq p10,p8=r0,tmp
  23.326 +	mov len1=len		// copy because of rotation
  23.327 +(p8)	br.cond.dpnt .diff_align_copy_user
  23.328 +	;;
  23.329 +	// At this point we know we have more than 16 bytes to copy
  23.330 +	// and also that both src and dest have the same alignment
  23.331 +	// which may not be the one we want. So for now we must move
  23.332 +	// forward slowly until we reach 16byte alignment: no need to
  23.333 +	// worry about reaching the end of buffer.
  23.334 +	//
  23.335 +	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
  23.336 +(p6)	adds len1=-1,len1;;
  23.337 +	tbit.nz p7,p0=src1,1
  23.338 +	;;
  23.339 +	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
  23.340 +(p7)	adds len1=-2,len1;;
  23.341 +	tbit.nz p8,p0=src1,2
  23.342 +	;;
  23.343 +	//
  23.344 +	// Stop bit not required after ld4 because if we fail on ld4
  23.345 +	// we have never executed the ld1, therefore st1 is not executed.
  23.346 +	//
  23.347 +	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
  23.348 +	;;
  23.349 +	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
  23.350 +	tbit.nz p9,p0=src1,3
  23.351 +	;;
  23.352 +	//
  23.353 +	// Stop bit not required after ld8 because if we fail on ld8
  23.354 +	// we have never executed the ld2, therefore st2 is not executed.
  23.355 +	//
  23.356 +	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
  23.357 +	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
  23.358 +(p8)	adds len1=-4,len1
  23.359 +	;;
  23.360 +	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
  23.361 +(p9)	adds len1=-8,len1;;
  23.362 +	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
  23.363 +	;;
  23.364 +	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
  23.365 +	tbit.nz p6,p0=len1,3
  23.366 +	cmp.eq p7,p0=r0,cnt
  23.367 +	adds tmp=-1,cnt			// br.ctop is repeat/until
  23.368 +(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
  23.369 +	;;
  23.370 +	adds src2=8,src1
  23.371 +	adds dst2=8,dst1
  23.372 +	mov ar.lc=tmp
  23.373 +	;;
  23.374 +	//
  23.375 +	// 16bytes/iteration
  23.376 +	//
  23.377 +2:
  23.378 +	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
  23.379 +(p16)	ld8 val2[0]=[src2],16
  23.380 +
  23.381 +	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
  23.382 +(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
  23.383 +	br.ctop.dptk 2b
  23.384 +	;;			// RAW on src1 when fall through from loop
  23.385 +	//
  23.386 +	// Tail correction based on len only
  23.387 +	//
  23.388 +	// No matter where we come from (loop or test) the src1 pointer
  23.389 +	// is 16 byte aligned AND we have less than 16 bytes to copy.
  23.390 +	//
  23.391 +.dotail:
  23.392 +	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
  23.393 +	tbit.nz p7,p0=len1,2
  23.394 +	;;
  23.395 +	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
  23.396 +	tbit.nz p8,p0=len1,1
  23.397 +	;;
  23.398 +	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
  23.399 +	tbit.nz p9,p0=len1,0
  23.400 +	;;
  23.401 +	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
  23.402 +	;;
  23.403 +	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
  23.404 +	mov ar.lc=saved_lc
  23.405 +	;;
  23.406 +	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
  23.407 +	mov pr=saved_pr,0xffffffffffff0000
  23.408 +	;;
  23.409 +	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
  23.410 +	mov ar.pfs=saved_pfs
  23.411 +	;;
  23.412 +	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
  23.413 +	br.ret.sptk.many rp
  23.414 +
  23.415 +
  23.416 +	//
  23.417 +	// Here we handle the case where the byte by byte copy fails
  23.418 +	// on the load.
  23.419 +	// Several factors make the zeroing of the rest of the buffer kind of
  23.420 +	// tricky:
  23.421 +	//	- the pipeline: loads/stores are not in sync (pipeline)
  23.422 +	//
  23.423 +	//	  In the same loop iteration, the dst1 pointer does not directly
  23.424 +	//	  reflect where the faulty load was.
  23.425 +	//
  23.426 +	//	- pipeline effect
  23.427 +	//	  When you get a fault on load, you may have valid data from
   23.428 +	//	  previous loads not yet stored, still in transit. Such data must be
   23.429 +	//	  stored normally before moving on to zeroing the rest.
  23.430 +	//
  23.431 +	//	- single/multi dispersal independence.
  23.432 +	//
  23.433 +	// solution:
  23.434 +	//	- we don't disrupt the pipeline, i.e. data in transit in
   23.435 +	//	  the software pipeline will eventually be moved to memory.
  23.436 +	//	  We simply replace the load with a simple mov and keep the
  23.437 +	//	  pipeline going. We can't really do this inline because
  23.438 +	//	  p16 is always reset to 1 when lc > 0.
  23.439 +	//
  23.440 +.failure_in_pipe1:
  23.441 +	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  23.442 +1:
  23.443 +(p16)	mov val1[0]=r0
  23.444 +(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
  23.445 +	br.ctop.dptk 1b
  23.446 +	;;
  23.447 +	mov pr=saved_pr,0xffffffffffff0000
  23.448 +	mov ar.lc=saved_lc
  23.449 +	mov ar.pfs=saved_pfs
  23.450 +	br.ret.sptk.many rp
  23.451 +
  23.452 +	//
  23.453 +	// This is the case where the byte by byte copy fails on the load
  23.454 +	// when we copy the head. We need to finish the pipeline and copy
  23.455 +	// zeros for the rest of the destination. Since this happens
  23.456 +	// at the top we still need to fill the body and tail.
  23.457 +.failure_in_pipe2:
  23.458 +	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  23.459 +2:
  23.460 +(p16)	mov val1[0]=r0
  23.461 +(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
  23.462 +	br.ctop.dptk 2b
  23.463 +	;;
  23.464 +	sub len=enddst,dst1,1		// precompute len
  23.465 +	br.cond.dptk.many .failure_in1bis
  23.466 +	;;
  23.467 +
  23.468 +	//
  23.469 +	// Here we handle the head & tail part when we check for alignment.
  23.470 +	// The following code handles only the load failures. The
  23.471 +	// main difficulty comes from the fact that loads/stores are
  23.472 +	// scheduled. So when you fail on a load, the stores corresponding
  23.473 +	// to previous successful loads must be executed.
  23.474 +	//
  23.475 +	// However some simplifications are possible given the way
  23.476 +	// things work.
  23.477 +	//
  23.478 +	// 1) HEAD
  23.479 +	// Theory of operation:
  23.480 +	//
  23.481 +	//  Page A   | Page B
  23.482 +	//  ---------|-----
  23.483 +	//          1|8 x
  23.484 +	//	  1 2|8 x
  23.485 +	//	    4|8 x
  23.486 +	//	  1 4|8 x
  23.487 +	//        2 4|8 x
  23.488 +	//      1 2 4|8 x
  23.489 +	//	     |1
  23.490 +	//	     |2 x
  23.491 +	//	     |4 x
  23.492 +	//
  23.493 +	// page_size >= 4k (2^12).  (x means 4, 2, 1)
  23.494 +	// Here we suppose Page A exists and Page B does not.
  23.495 +	//
  23.496 +	// As we move towards eight byte alignment we may encounter faults.
  23.497 +	// The numbers on each page show the size of the load (current alignment).
  23.498 +	//
  23.499 +	// Key point:
  23.500 +	//	- if you fail on 1, 2, 4 then you have never executed any smaller
  23.501 +	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
  23.502 +	//	  before.
  23.503 +	//
  23.504 +	// This allows us to simplify the cleanup code, because basically you
  23.505 +	// only have to worry about "pending" stores in the case of a failing
  23.506 +	// ld8(). Given the way the code is written today, this means we only
  23.507 +	// have to worry about st2 and st4. There we can use the information
  23.508 +	// encapsulated in the predicates.
  23.509 +	//
  23.510 +	// Other key point:
  23.511 +	//	- if you fail on the ld8 in the head, it means you went straight
  23.512 +	//	  to it, i.e. 8-byte alignment within a nonexistent page.
  23.513 +	// Again this comes from the fact that if you crossed just for the ld8 then
  23.514 +	// you are 8-byte aligned but also 16-byte aligned, therefore you would
  23.515 +	// either go for the 16byte copy loop OR the ld8 in the tail part.
  23.516 +	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
  23.517 +	// because it would mean you had 15 bytes to copy, in which case you
  23.518 +	// would have defaulted to the byte by byte copy.
  23.519 +	//
  23.520 +	//
  23.521 +	// 2) TAIL
  23.522 +	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
  23.523 +	// aligned.
  23.524 +	//
  23.525 +	// Key point:
  23.526 +	// This means that we either:
  23.527 +	//		- are right on a page boundary
  23.528 +	//	OR
  23.529 +	//		- are at more than 16 bytes from a page boundary with
  23.530 +	//		  at most 15 bytes to copy: no chance of crossing.
  23.531 +	//
  23.532 +	// This allows us to assume that if we fail on a load we cannot possibly have
  23.533 +	// executed any of the previous (tail) ones, so we don't need to do
  23.534 +	// any stores. For instance, if we fail on ld2, this means we had
  23.535 +	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
  23.536 +	//
  23.537 +	// This means that we are in a situation similar to a fault in the
  23.538 +	// head part. That's nice!
  23.539 +	//
  23.540 +.failure_in1:
  23.541 +	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  23.542 +	sub len=endsrc,src1,1
  23.543 +	//
  23.544 +	// we know that ret0 can never be zero at this point
  23.545 +	// because we failed while trying to do a load, i.e. there is still
  23.546 +	// some work to do.
  23.547 +	// The failure_in1bis and length problem is taken care of at the
  23.548 +	// calling side.
  23.549 +	//
  23.550 +	;;
  23.551 +.failure_in1bis:		// from (.failure_in3)
  23.552 +	mov ar.lc=len		// Continue with a stupid byte store.
  23.553 +	;;
  23.554 +5:
  23.555 +	st1 [dst1]=r0,1
  23.556 +	br.cloop.dptk 5b
  23.557 +	;;
  23.558 +	mov pr=saved_pr,0xffffffffffff0000
  23.559 +	mov ar.lc=saved_lc
  23.560 +	mov ar.pfs=saved_pfs
  23.561 +	br.ret.sptk.many rp
  23.562 +
  23.563 +	//
  23.564 +	// Here we simply restart the loop but instead
  23.565 +	// of doing loads we fill the pipeline with zeroes
  23.566 +	// We can't simply store r0 because we may have valid
  23.567 +	// data in transit in the pipeline.
  23.568 +	// ar.lc and ar.ec are setup correctly at this point
  23.569 +	//
  23.570 +	// we MUST use src1/endsrc here and not dst1/enddst because
  23.571 +	// of the pipeline effect.
  23.572 +	//
  23.573 +.failure_in3:
  23.574 +	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
  23.575 +	;;
  23.576 +2:
  23.577 +(p16)	mov val1[0]=r0
  23.578 +(p16)	mov val2[0]=r0
  23.579 +(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
  23.580 +(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
  23.581 +	br.ctop.dptk 2b
  23.582 +	;;
  23.583 +	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
  23.584 +	sub len=enddst,dst1,1		// precompute len
  23.585 +(p6)	br.cond.dptk .failure_in1bis
  23.586 +	;;
  23.587 +	mov pr=saved_pr,0xffffffffffff0000
  23.588 +	mov ar.lc=saved_lc
  23.589 +	mov ar.pfs=saved_pfs
  23.590 +	br.ret.sptk.many rp
  23.591 +
  23.592 +.failure_in2:
  23.593 +	sub ret0=endsrc,src1
  23.594 +	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
  23.595 +	sub len=enddst,dst1,1		// precompute len
  23.596 +(p6)	br.cond.dptk .failure_in1bis
  23.597 +	;;
  23.598 +	mov pr=saved_pr,0xffffffffffff0000
  23.599 +	mov ar.lc=saved_lc
  23.600 +	mov ar.pfs=saved_pfs
  23.601 +	br.ret.sptk.many rp
  23.602 +
  23.603 +	//
  23.604 +	// handling of failures on stores: that's the easy part
  23.605 +	//
  23.606 +.failure_out:
  23.607 +	sub ret0=enddst,dst1
  23.608 +	mov pr=saved_pr,0xffffffffffff0000
  23.609 +	mov ar.lc=saved_lc
  23.610 +
  23.611 +	mov ar.pfs=saved_pfs
  23.612 +	br.ret.sptk.many rp
  23.613 +END(__copy_user)
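
All of the recovery paths above implement the same contract: __copy_user returns in ret0 the number of bytes that were NOT copied, and when the fault is on the source side (a load), the not-yet-copied tail of the destination is zero-filled before returning. A minimal C sketch of that contract, assuming a hypothetical probe_read_ok() helper standing in for the EX() fault-fixup machinery (an illustration only, not the actual implementation):

    #include <stddef.h>
    #include <string.h>

    /* Hypothetical helper standing in for the EX() fixup machinery:
     * returns nonzero if one byte at 'p' can be read without faulting. */
    extern int probe_read_ok(const void *p);

    /* Reference model of the __copy_user contract: copy up to 'len' bytes,
     * zero-fill whatever part of 'dst' could not be filled because of a
     * source fault, and return the number of bytes NOT copied. */
    static size_t copy_user_model(void *dst, const void *src, size_t len)
    {
            size_t done = 0;

            while (done < len && probe_read_ok((const char *)src + done)) {
                    ((char *)dst)[done] = ((const char *)src)[done];
                    done++;
            }
            memset((char *)dst + done, 0, len - done);      /* zero the tail */
            return len - done;
    }

Store faults (.failure_out) are simpler: nothing is zeroed, only the count of uncopied destination bytes is returned.
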
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/xen/arch/ia64/linux/lib/csum_partial_copy.c	Tue Aug 02 15:59:09 2005 -0800
    24.3 @@ -0,0 +1,151 @@
    24.4 +/*
    24.5 + * Network Checksum & Copy routine
    24.6 + *
    24.7 + * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
    24.8 + *	Stephane Eranian <eranian@hpl.hp.com>
    24.9 + *
   24.10 + * Most of the code has been imported from Linux/Alpha
   24.11 + */
   24.12 +
   24.13 +#include <linux/module.h>
   24.14 +#include <linux/types.h>
   24.15 +#include <linux/string.h>
   24.16 +
   24.17 +#include <asm/uaccess.h>
   24.18 +
   24.19 +/*
   24.20 + * XXX Fixme: those 2 inlines are meant for debugging and will go away
   24.21 + */
   24.22 +static inline unsigned short
   24.23 +from64to16(unsigned long x)
   24.24 +{
   24.25 +	/* add up 32-bit words for 33 bits */
   24.26 +	x = (x & 0xffffffff) + (x >> 32);
   24.27 +	/* add up 16-bit and 17-bit words for 17+c bits */
   24.28 +	x = (x & 0xffff) + (x >> 16);
   24.29 +	/* add up 16-bit and 2-bit for 16+c bit */
   24.30 +	x = (x & 0xffff) + (x >> 16);
   24.31 +	/* add up carry.. */
   24.32 +	x = (x & 0xffff) + (x >> 16);
   24.33 +	return x;
   24.34 +}
   24.35 +
   24.36 +static inline
   24.37 +unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
   24.38 +{
   24.39 +	int odd, count;
   24.40 +	unsigned long result = (unsigned long)psum;
   24.41 +
   24.42 +	if (len <= 0)
   24.43 +		goto out;
   24.44 +	odd = 1 & (unsigned long) buff;
   24.45 +	if (odd) {
   24.46 +		result = *buff << 8;
   24.47 +		len--;
   24.48 +		buff++;
   24.49 +	}
   24.50 +	count = len >> 1;		/* nr of 16-bit words.. */
   24.51 +	if (count) {
   24.52 +		if (2 & (unsigned long) buff) {
   24.53 +			result += *(unsigned short *) buff;
   24.54 +			count--;
   24.55 +			len -= 2;
   24.56 +			buff += 2;
   24.57 +		}
   24.58 +		count >>= 1;		/* nr of 32-bit words.. */
   24.59 +		if (count) {
   24.60 +			if (4 & (unsigned long) buff) {
   24.61 +				result += *(unsigned int *) buff;
   24.62 +				count--;
   24.63 +				len -= 4;
   24.64 +				buff += 4;
   24.65 +			}
   24.66 +			count >>= 1;	/* nr of 64-bit words.. */
   24.67 +			if (count) {
   24.68 +				unsigned long carry = 0;
   24.69 +				do {
   24.70 +					unsigned long w = *(unsigned long *) buff;
   24.71 +					count--;
   24.72 +					buff += 8;
   24.73 +					result += carry;
   24.74 +					result += w;
   24.75 +					carry = (w > result);
   24.76 +				} while (count);
   24.77 +				result += carry;
   24.78 +				result = (result & 0xffffffff) + (result >> 32);
   24.79 +			}
   24.80 +			if (len & 4) {
   24.81 +				result += *(unsigned int *) buff;
   24.82 +				buff += 4;
   24.83 +			}
   24.84 +		}
   24.85 +		if (len & 2) {
   24.86 +			result += *(unsigned short *) buff;
   24.87 +			buff += 2;
   24.88 +		}
   24.89 +	}
   24.90 +	if (len & 1)
   24.91 +		result += *buff;
   24.92 +
   24.93 +	result = from64to16(result);
   24.94 +
   24.95 +	if (odd)
   24.96 +		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
   24.97 +
   24.98 +out:
   24.99 +	return result;
  24.100 +}
  24.101 +
  24.102 +/*
  24.103 + * XXX Fixme
  24.104 + *
  24.105 + * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
  24.106 + * But it's very tricky to get right even in C.
  24.107 + */
  24.108 +extern unsigned long do_csum(const unsigned char *, long);
  24.109 +
  24.110 +static unsigned int
  24.111 +do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
  24.112 +				int len, unsigned int psum, int *errp)
  24.113 +{
  24.114 +	unsigned long result;
  24.115 +
  24.116 +	/* XXX Fixme
  24.117 +	 * for now we separate the copy from checksum for obvious
  24.118 +	 * alignment difficulties. Look at the Alpha code and you'll be
  24.119 +	 * scared.
  24.120 +	 */
  24.121 +
  24.122 +	if (__copy_from_user(dst, src, len) != 0 && errp)
  24.123 +		*errp = -EFAULT;
  24.124 +
  24.125 +	result = do_csum(dst, len);
  24.126 +
  24.127 +	/* add in old sum, and carry.. */
  24.128 +	result += psum;
  24.129 +	/* 32+c bits -> 32 bits */
  24.130 +	result = (result & 0xffffffff) + (result >> 32);
  24.131 +	return result;
  24.132 +}
  24.133 +
  24.134 +unsigned int
  24.135 +csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
  24.136 +			     int len, unsigned int sum, int *errp)
  24.137 +{
  24.138 +	if (!access_ok(VERIFY_READ, src, len)) {
  24.139 +		*errp = -EFAULT;
  24.140 +		memset(dst, 0, len);
  24.141 +		return sum;
  24.142 +	}
  24.143 +
  24.144 +	return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
  24.145 +}
  24.146 +
  24.147 +unsigned int
  24.148 +csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
  24.149 +			  int len, unsigned int sum)
  24.150 +{
  24.151 +	return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
  24.152 +}
  24.153 +
  24.154 +EXPORT_SYMBOL(csum_partial_copy_nocheck);
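
The helpers above all lean on the same property of the Internet checksum: it is a 16-bit one's-complement sum, so a wider accumulator can be collapsed by repeatedly adding the high half back into the low half, exactly as from64to16() does. A small standalone illustration of that folding (plain C, no kernel headers; the sample value is made up):

    #include <stdio.h>

    /* Same folding steps as from64to16() above. */
    static unsigned short fold64(unsigned long long x)
    {
            x = (x & 0xffffffffULL) + (x >> 32);    /* 64 -> 33 bits         */
            x = (x & 0xffff) + (x >> 16);           /* 33 -> 17 bits + carry */
            x = (x & 0xffff) + (x >> 16);           /* 17 -> 16 bits + carry */
            x = (x & 0xffff) + (x >> 16);           /* absorb the last carry */
            return (unsigned short)x;
    }

    int main(void)
    {
            /* 0xffff + 0x0001 overflows 16 bits; the carry wraps around,
             * giving 0x0001 as the folded one's-complement sum. */
            printf("%#llx -> %#x\n", 0xffffULL + 0x0001ULL,
                   fold64(0xffffULL + 0x0001ULL));
            return 0;
    }
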
    25.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.2 +++ b/xen/arch/ia64/linux/lib/dec_and_lock.c	Tue Aug 02 15:59:09 2005 -0800
    25.3 @@ -0,0 +1,42 @@
    25.4 +/*
    25.5 + * Copyright (C) 2003 Jerome Marchand, Bull S.A.
    25.6 + *	Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com>
    25.7 + *
    25.8 + * This file is released under the GPLv2, or at your option any later version.
    25.9 + *
   25.10 + * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction.  This
   25.11 + * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
   25.12 + */
   25.13 +
   25.14 +#include <linux/compiler.h>
   25.15 +#include <linux/module.h>
   25.16 +#include <linux/spinlock.h>
   25.17 +#include <asm/atomic.h>
   25.18 +
   25.19 +/*
   25.20 + * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock.  Both of these
   25.21 + * operations have to be done atomically, so that the count doesn't drop to zero without
   25.22 + * acquiring the spinlock first.
   25.23 + */
   25.24 +int
   25.25 +_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
   25.26 +{
   25.27 +	int old, new;
   25.28 +
   25.29 +	do {
   25.30 +		old = atomic_read(refcount);
   25.31 +		new = old - 1;
   25.32 +
   25.33 +		if (unlikely (old == 1)) {
   25.34 +			/* oops, we may be decrementing to zero, do it the slow way... */
   25.35 +			spin_lock(lock);
   25.36 +			if (atomic_dec_and_test(refcount))
   25.37 +				return 1;
   25.38 +			spin_unlock(lock);
   25.39 +			return 0;
   25.40 +		}
   25.41 +	} while (cmpxchg(&refcount->counter, old, new) != old);
   25.42 +	return 0;
   25.43 +}
   25.44 +
   25.45 +EXPORT_SYMBOL(_atomic_dec_and_lock);
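
A typical caller of _atomic_dec_and_lock() (via the atomic_dec_and_lock() wrapper) is a reference-release path that only needs the lock when the count is about to hit zero. A hedged usage sketch; the object type and the teardown steps are made up for illustration:

    #include <linux/spinlock.h>
    #include <asm/atomic.h>

    /* Hypothetical refcounted object, only to show the calling convention. */
    struct obj {
            atomic_t   refcnt;
            spinlock_t *list_lock;  /* protects the list the object is on */
    };

    static void obj_put(struct obj *o)
    {
            /* Returns 1 with *list_lock held only when refcnt reached zero. */
            if (atomic_dec_and_lock(&o->refcnt, o->list_lock)) {
                    /* ... unlink the object while holding the lock ... */
                    spin_unlock(o->list_lock);
                    /* ... free the object ... */
            }
    }
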
    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/xen/arch/ia64/linux/lib/do_csum.S	Tue Aug 02 15:59:09 2005 -0800
    26.3 @@ -0,0 +1,323 @@
    26.4 +/*
    26.5 + *
    26.6 + * Optimized version of the standard do_csum() function
    26.7 + *
    26.8 + * Return: a 64bit quantity containing the 16bit Internet checksum
    26.9 + *
   26.10 + * Inputs:
   26.11 + *	in0: address of buffer to checksum (char *)
   26.12 + *	in1: length of the buffer (int)
   26.13 + *
   26.14 + * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
   26.15 + *	Stephane Eranian <eranian@hpl.hp.com>
   26.16 + *
   26.17 + * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
   26.18 + *		Data locality study on the checksum buffer.
   26.19 + *		More optimization cleanup - remove excessive stop bits.
   26.20 + * 02/04/08	David Mosberger <davidm@hpl.hp.com>
   26.21 + *		More cleanup and tuning.
   26.22 + * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
   26.23 + *		Clean up and optimize the software pipeline, loading two
   26.24 + *		back-to-back 8-byte words per loop. Clean up the initialization
   26.25 + *		for the loop. Support the cases where load latency = 1 or 2.
   26.26 + *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
   26.27 + */
   26.28 +
   26.29 +#include <asm/asmmacro.h>
   26.30 +
   26.31 +//
   26.32 +// Theory of operations:
   26.33 +//	The goal is to go as quickly as possible to the point where
   26.34 +//	we can checksum 16 bytes/loop. Before reaching that point we must
   26.35 +//	take care of incorrect alignment of first byte.
   26.36 +//
   26.37 +//	The code hereafter also takes care of the "tail" part of the buffer
   26.38 +//	before entering the core loop, if any. The checksum is a sum so it
   26.39 +//	allows us to commute operations. So we do the "head" and "tail"
   26.40 +//	first to finish at full speed in the body. Once we get the head and
   26.41 +//	tail values, we feed them into the pipeline, very handy initialization.
   26.42 +//
   26.43 +//	Of course we deal with the special case where the whole buffer fits
   26.44 +//	into one 8 byte word. In this case we have only one entry in the pipeline.
   26.45 +//
   26.46 +//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
   26.47 +//	possible load latency and also to accommodate the head and tail.
   26.48 +//
   26.49 +//	The end of the function deals with folding the checksum from 64bits
   26.50 +//	down to 16bits taking care of the carry.
   26.51 +//
   26.52 +//	This version avoids synchronization in the core loop by also using a
   26.53 +//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
   26.54 +//
   26.55 +//	 wordx[] (x=1,2)
   26.56 +//	|---|
   26.57 +//      |   | 0			: new value loaded in pipeline
   26.58 +//	|---|
   26.59 +//      |   | -			: in transit data
   26.60 +//	|---|
   26.61 +//      |   | LOAD_LATENCY	: current value to add to checksum
   26.62 +//	|---|
   26.63 +//      |   | LOAD_LATENCY+1	: previous value added to checksum
   26.64 +//      |---|			(previous iteration)
   26.65 +//
   26.66 +//	resultx[] (x=1,2)
   26.67 +//	|---|
   26.68 +//      |   | 0			: initial value
   26.69 +//	|---|
   26.70 +//      |   | LOAD_LATENCY-1	: new checksum
   26.71 +//	|---|
   26.72 +//      |   | LOAD_LATENCY	: previous value of checksum
   26.73 +//	|---|
   26.74 +//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
   26.75 +//      |---|
   26.76 +//
   26.77 +//
   26.78 +//	See RFC1071 "Computing the Internet Checksum" for various techniques for
   26.79 +//	calculating the Internet checksum.
   26.80 +//
   26.81 +// NOT YET DONE:
   26.82 +//	- Maybe another algorithm which would take care of the folding at the
   26.83 +//	  end in a different manner
   26.84 +//	- Work with people more knowledgeable than me on the network stack
   26.85 +//	  to figure out if we could not split the function depending on the
   26.86 +//	  type of packet or alignment we get. Like the ip_fast_csum() routine
   26.87 +//	  where we know we have at least 20bytes worth of data to checksum.
   26.88 +//	- Do a better job of handling small packets.
   26.89 +//	- Note on prefetching: it was found that under various loads, e.g. ftp read/write,
   26.90 +//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
   26.91 +//	  on the data that buffer points to (partly because the checksum is often preceded by
   26.92 +//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
   26.93 +//	  the data is already in the cache.
   26.94 +//
   26.95 +
   26.96 +#define saved_pfs	r11
   26.97 +#define hmask		r16
   26.98 +#define tmask		r17
   26.99 +#define first1		r18
  26.100 +#define firstval	r19
  26.101 +#define firstoff	r20
  26.102 +#define last		r21
  26.103 +#define lastval		r22
  26.104 +#define lastoff		r23
  26.105 +#define saved_lc	r24
  26.106 +#define saved_pr	r25
  26.107 +#define tmp1		r26
  26.108 +#define tmp2		r27
  26.109 +#define tmp3		r28
  26.110 +#define carry1		r29
  26.111 +#define carry2		r30
  26.112 +#define first2		r31
  26.113 +
  26.114 +#define buf		in0
  26.115 +#define len		in1
  26.116 +
  26.117 +#define LOAD_LATENCY	2	// XXX fix me
  26.118 +
  26.119 +#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
  26.120 +# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
  26.121 +#endif
  26.122 +
  26.123 +#define PIPE_DEPTH			(LOAD_LATENCY+2)
  26.124 +#define ELD	p[LOAD_LATENCY]		// end of load
  26.125 +#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
  26.126 +
  26.127 +// unsigned long do_csum(unsigned char *buf,long len)
  26.128 +
  26.129 +GLOBAL_ENTRY(do_csum)
  26.130 +	.prologue
  26.131 +	.save ar.pfs, saved_pfs
  26.132 +	alloc saved_pfs=ar.pfs,2,16,0,16
  26.133 +	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
  26.134 +	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
  26.135 +	mov ret0=r0		// in case we have zero length
  26.136 +	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
  26.137 +	;;
  26.138 +	add tmp1=buf,len	// last byte's address
  26.139 +	.save pr, saved_pr
  26.140 +	mov saved_pr=pr		// preserve predicates (rotation)
  26.141 +(p6)	br.ret.spnt.many rp	// return if zero or negative length
  26.142 +
  26.143 +	mov hmask=-1		// initialize head mask
  26.144 +	tbit.nz p15,p0=buf,0	// is buf an odd address?
  26.145 +	and first1=-8,buf	// 8-byte align down address of first1 element
  26.146 +
  26.147 +	and firstoff=7,buf	// how many bytes off for first1 element
  26.148 +	mov tmask=-1		// initialize tail mask
  26.149 +
  26.150 +	;;
  26.151 +	adds tmp2=-1,tmp1	// last-1
  26.152 +	and lastoff=7,tmp1	// how many bytes off for last element
  26.153 +	;;
  26.154 +	sub tmp1=8,lastoff	// complement to lastoff
  26.155 +	and last=-8,tmp2	// address of word containing last byte
  26.156 +	;;
  26.157 +	sub tmp3=last,first1	// tmp3=distance from first1 to last
  26.158 +	.save ar.lc, saved_lc
  26.159 +	mov saved_lc=ar.lc	// save lc
  26.160 +	cmp.eq p8,p9=last,first1	// everything fits in one word ?
  26.161 +
  26.162 +	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
  26.163 +	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
  26.164 +	shl tmp2=firstoff,3	// number of bits
  26.165 +	;;
  26.166 +(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
  26.167 +	shl tmp1=tmp1,3		// number of bits
  26.168 +(p9)	adds tmp3=-8,tmp3	// effectively loaded
  26.169 +	;;
  26.170 +(p8)	mov lastval=r0		// we don't need lastval if first1==last
  26.171 +	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
  26.172 +	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
  26.173 +	;;
  26.174 +	.body
  26.175 +#define count tmp3
  26.176 +
  26.177 +(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
  26.178 +(p9)	and word2[0]=lastval,tmask	// mask last word as appropriate
  26.179 +	shr.u count=count,3	// how many 8-byte?
  26.180 +	;;
  26.181 +	// If count is odd, finish this 8-byte word so that we can
  26.182 +	// load two back-to-back 8-byte words per loop thereafter.
  26.183 +	and word1[0]=firstval,hmask	// and mask it as appropriate
  26.184 +	tbit.nz p10,p11=count,0		// if (count is odd)
  26.185 +	;;
  26.186 +(p8)	mov result1[0]=word1[0]
  26.187 +(p9)	add result1[0]=word1[0],word2[0]
  26.188 +	;;
  26.189 +	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
  26.190 +	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
  26.191 +	;;
  26.192 +(p6)	adds result1[0]=1,result1[0]
  26.193 +(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
  26.194 +(p11)	br.cond.dptk .do_csum16		// if (count is even)
  26.195 +
  26.196 +	// Here count is odd.
  26.197 +	ld8 word1[1]=[first1],8		// load an 8-byte word
  26.198 +	cmp.eq p9,p10=1,count		// if (count == 1)
  26.199 +	adds count=-1,count		// loaded an 8-byte word
  26.200 +	;;
  26.201 +	add result1[0]=result1[0],word1[1]
  26.202 +	;;
  26.203 +	cmp.ltu p6,p0=result1[0],word1[1]
  26.204 +	;;
  26.205 +(p6)	adds result1[0]=1,result1[0]
  26.206 +(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
  26.207 +	// Fall through to calculate the checksum, feeding result1[0] as
  26.208 +	// the initial value in result1[0].
  26.209 +	//
  26.210 +	// Calculate the checksum loading two 8-byte words per loop.
  26.211 +	//
  26.212 +.do_csum16:
  26.213 +	add first2=8,first1
  26.214 +	shr.u count=count,1	// we do 16 bytes per loop
  26.215 +	;;
  26.216 +	adds count=-1,count
  26.217 +	mov carry1=r0
  26.218 +	mov carry2=r0
  26.219 +	brp.loop.imp 1f,2f
  26.220 +	;;
  26.221 +	mov ar.ec=PIPE_DEPTH
  26.222 +	mov ar.lc=count	// set lc
  26.223 +	mov pr.rot=1<<16
  26.224 +	// result1[0] must be initialized in advance.
  26.225 +	mov result2[0]=r0
  26.226 +	;;
  26.227 +	.align 32
  26.228 +1:
  26.229 +(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
  26.230 +(pC1[1])adds carry1=1,carry1
  26.231 +(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
  26.232 +(pC2[1])adds carry2=1,carry2
  26.233 +(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
  26.234 +(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
  26.235 +2:
  26.236 +(p[0])	ld8 word1[0]=[first1],16
  26.237 +(p[0])	ld8 word2[0]=[first2],16
  26.238 +	br.ctop.sptk 1b
  26.239 +	;;
  26.240 +	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
  26.241 +(pC1[1])adds carry1=1,carry1	// since we miss the last one
  26.242 +(pC2[1])adds carry2=1,carry2
  26.243 +	;;
  26.244 +	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
  26.245 +	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
  26.246 +	;;
  26.247 +	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
  26.248 +	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
  26.249 +	;;
  26.250 +(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
  26.251 +(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
  26.252 +	;;
  26.253 +	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
  26.254 +	;;
  26.255 +	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
  26.256 +	;;
  26.257 +(p6)	adds result1[0]=1,result1[0]
  26.258 +	;;
  26.259 +.do_csum_exit:
  26.260 +	//
  26.261 +	// now fold 64 into 16 bits taking care of carry
  26.262 +	// that's not very good because it has lots of sequentiality
  26.263 +	//
  26.264 +	mov tmp3=0xffff
  26.265 +	zxt4 tmp1=result1[0]
  26.266 +	shr.u tmp2=result1[0],32
  26.267 +	;;
  26.268 +	add result1[0]=tmp1,tmp2
  26.269 +	;;
  26.270 +	and tmp1=result1[0],tmp3
  26.271 +	shr.u tmp2=result1[0],16
  26.272 +	;;
  26.273 +	add result1[0]=tmp1,tmp2
  26.274 +	;;
  26.275 +	and tmp1=result1[0],tmp3
  26.276 +	shr.u tmp2=result1[0],16
  26.277 +	;;
  26.278 +	add result1[0]=tmp1,tmp2
  26.279 +	;;
  26.280 +	and tmp1=result1[0],tmp3
  26.281 +	shr.u tmp2=result1[0],16
  26.282 +	;;
  26.283 +	add ret0=tmp1,tmp2
  26.284 +	mov pr=saved_pr,0xffffffffffff0000
  26.285 +	;;
  26.286 +	// if buf was odd then swap bytes
  26.287 +	mov ar.pfs=saved_pfs		// restore ar.ec
  26.288 +(p15)	mux1 ret0=ret0,@rev		// reverse word
  26.289 +	;;
  26.290 +	mov ar.lc=saved_lc
  26.291 +(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
  26.292 +	br.ret.sptk.many rp
  26.293 +
  26.294 +//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
  26.295 +//	not much better than the original. So keep the original there so that
  26.296 +//	someone else can challenge.
  26.297 +//
  26.298 +//	shr.u word1[0]=result1[0],32
  26.299 +//	zxt4 result1[0]=result1[0]
  26.300 +//	;;
  26.301 +//	add result1[0]=result1[0],word1[0]
  26.302 +//	;;
  26.303 +//	zxt2 result2[0]=result1[0]
  26.304 +//	extr.u word1[0]=result1[0],16,16
  26.305 +//	shr.u carry1=result1[0],32
  26.306 +//	;;
  26.307 +//	add result2[0]=result2[0],word1[0]
  26.308 +//	;;
  26.309 +//	add result2[0]=result2[0],carry1
  26.310 +//	;;
  26.311 +//	extr.u ret0=result2[0],16,16
  26.312 +//	;;
  26.313 +//	add ret0=ret0,result2[0]
  26.314 +//	;;
  26.315 +//	zxt2 ret0=ret0
  26.316 +//	mov ar.pfs=saved_pfs		 // restore ar.ec
  26.317 +//	mov pr=saved_pr,0xffffffffffff0000
  26.318 +//	;;
  26.319 +//	// if buf was odd then swap bytes
  26.320 +//	mov ar.lc=saved_lc
  26.321 +//(p15)	mux1 ret0=ret0,@rev		// reverse word
  26.322 +//	;;
  26.323 +//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
  26.324 +//	br.ret.sptk.many rp
  26.325 +
  26.326 +END(do_csum)
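
Two ideas carry most of the weight in do_csum: the head/tail masks that trim the partially used first and last 8-byte words, and the cmp.ltu-based carry detection used by the two accumulation pipelines. A hedged little-endian C model of both (illustrative only, not the kernel interface):

    #include <stdint.h>

    /* Little-endian model of the hmask/tmask construction: drop the
     * (buf & 7) leading bytes of the first word and keep only the
     * ((buf + len) & 7) trailing bytes of the last word (0 meaning the
     * whole last word is used). */
    static void csum_masks(uintptr_t buf, long len,
                           uint64_t *hmask, uint64_t *tmask)
    {
            unsigned int firstoff = buf & 7;
            unsigned int lastoff  = (buf + len) & 7;

            *hmask = ~0ULL << (8 * firstoff);
            *tmask = ~0ULL >> (8 * ((8 - lastoff) & 7));
    }

    /* Carry detection as in the main loop: an unsigned add overflowed
     * exactly when the result is smaller than the word just added. */
    static void add_with_carry(uint64_t *sum, uint64_t *carry, uint64_t w)
    {
            *sum += w;
            if (*sum < w)
                    (*carry)++;
    }
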
    27.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.2 +++ b/xen/arch/ia64/linux/lib/flush.S	Tue Aug 02 15:59:09 2005 -0800
    27.3 @@ -0,0 +1,39 @@
    27.4 +/*
    27.5 + * Cache flushing routines.
    27.6 + *
    27.7 + * Copyright (C) 1999-2001 Hewlett-Packard Co
    27.8 + * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com>
    27.9 + */
   27.10 +#include <asm/asmmacro.h>
   27.11 +#include <asm/page.h>
   27.12 +
   27.13 +	/*
   27.14 +	 * flush_icache_range(start,end)
   27.15 +	 *	Must flush range from start to end-1 but nothing else (need to
   27.16 +	 *	be careful not to touch addresses that may be unmapped).
   27.17 +	 */
   27.18 +GLOBAL_ENTRY(flush_icache_range)
   27.19 +	.prologue
   27.20 +	alloc r2=ar.pfs,2,0,0,0
   27.21 +	sub r8=in1,in0,1
   27.22 +	;;
   27.23 +	shr.u r8=r8,5			// we flush 32 bytes per iteration
   27.24 +	.save ar.lc, r3
   27.25 +	mov r3=ar.lc			// save ar.lc
   27.26 +	;;
   27.27 +
   27.28 +	.body
   27.29 +
   27.30 +	mov ar.lc=r8
   27.31 +	;;
   27.32 +.Loop:	fc in0				// issuable on M0 only
   27.33 +	add in0=32,in0
   27.34 +	br.cloop.sptk.few .Loop
   27.35 +	;;
   27.36 +	sync.i
   27.37 +	;;
   27.38 +	srlz.i
   27.39 +	;;
   27.40 +	mov ar.lc=r3			// restore ar.lc
   27.41 +	br.ret.sptk.many rp
   27.42 +END(flush_icache_range)
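
The loop above touches one 32-byte line per iteration; since ar.lc means "repeat N+1 times", the count is set to ((end - start - 1) >> 5). A hedged C model of the loop shape, with a hypothetical flush_line() standing in for the fc instruction (the real routine also needs the sync.i/srlz.i serialization afterwards):

    /* Hypothetical stand-in for the ia64 "fc" instruction. */
    extern void flush_line(unsigned long addr);

    /* Model of the flush_icache_range() loop (assumes end > start). */
    static void flush_icache_range_model(unsigned long start, unsigned long end)
    {
            unsigned long iters = ((end - start - 1) >> 5) + 1;     /* lc + 1 */
            unsigned long addr = start;

            while (iters--) {
                    flush_line(addr);
                    addr += 32;                     /* 32 bytes per iteration */
            }
    }
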
    28.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.2 +++ b/xen/arch/ia64/linux/lib/idiv32.S	Tue Aug 02 15:59:09 2005 -0800
    28.3 @@ -0,0 +1,83 @@
    28.4 +/*
    28.5 + * Copyright (C) 2000 Hewlett-Packard Co
    28.6 + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
    28.7 + *
    28.8 + * 32-bit integer division.
    28.9 + *
   28.10 + * This code is based on the application note entitled "Divide, Square Root
   28.11 + * and Remainder Algorithms for the IA-64 Architecture".  This document
   28.12 + * is available as Intel document number 248725-002 or via the web at
   28.13 + * http://developer.intel.com/software/opensource/numerics/
   28.14 + *
   28.15 + * For more details on the theory behind these algorithms, see "IA-64
   28.16 + * and Elementary Functions" by Peter Markstein; HP Professional Books
   28.17 + * (http://www.hp.com/go/retailbooks/)
   28.18 + */
   28.19 +
   28.20 +#include <asm/asmmacro.h>
   28.21 +
   28.22 +#ifdef MODULO
   28.23 +# define OP	mod
   28.24 +#else
   28.25 +# define OP	div
   28.26 +#endif
   28.27 +
   28.28 +#ifdef UNSIGNED
   28.29 +# define SGN	u
   28.30 +# define EXTEND	zxt4
   28.31 +# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
   28.32 +# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
   28.33 +#else
   28.34 +# define SGN
   28.35 +# define EXTEND	sxt4
   28.36 +# define INT_TO_FP(a,b)	fcvt.xf a=b
   28.37 +# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
   28.38 +#endif
   28.39 +
   28.40 +#define PASTE1(a,b)	a##b
   28.41 +#define PASTE(a,b)	PASTE1(a,b)
   28.42 +#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,si3))
   28.43 +
   28.44 +GLOBAL_ENTRY(NAME)
   28.45 +	.regstk 2,0,0,0
   28.46 +	// Transfer inputs to FP registers.
   28.47 +	mov r2 = 0xffdd			// r2 = -34 + 65535 (fp reg format bias)
   28.48 +	EXTEND in0 = in0		// in0 = a
   28.49 +	EXTEND in1 = in1		// in1 = b
   28.50 +	;;
   28.51 +	setf.sig f8 = in0
   28.52 +	setf.sig f9 = in1
   28.53 +#ifdef MODULO
   28.54 +	sub in1 = r0, in1		// in1 = -b
   28.55 +#endif
   28.56 +	;;
   28.57 +	// Convert the inputs to FP, to avoid FP software-assist faults.
   28.58 +	INT_TO_FP(f8, f8)
   28.59 +	INT_TO_FP(f9, f9)
   28.60 +	;;
   28.61 +	setf.exp f7 = r2		// f7 = 2^-34
   28.62 +	frcpa.s1 f6, p6 = f8, f9	// y0 = frcpa(b)
   28.63 +	;;
   28.64 +(p6)	fmpy.s1 f8 = f8, f6		// q0 = a*y0
   28.65 +(p6)	fnma.s1 f6 = f9, f6, f1		// e0 = -b*y0 + 1 
   28.66 +	;;
   28.67 +#ifdef MODULO
   28.68 +	setf.sig f9 = in1		// f9 = -b
   28.69 +#endif
   28.70 +(p6)	fma.s1 f8 = f6, f8, f8		// q1 = e0*q0 + q0
   28.71 +(p6)	fma.s1 f6 = f6, f6, f7		// e1 = e0*e0 + 2^-34
   28.72 +	;;
   28.73 +#ifdef MODULO
   28.74 +	setf.sig f7 = in0
   28.75 +#endif
   28.76 +(p6)	fma.s1 f6 = f6, f8, f8		// q2 = e1*q1 + q1
   28.77 +	;;
   28.78 +	FP_TO_INT(f6, f6)		// q = trunc(q2)
   28.79 +	;;
   28.80 +#ifdef MODULO
   28.81 +	xma.l f6 = f6, f9, f7		// r = q*(-b) + a
   28.82 +	;;
   28.83 +#endif
   28.84 +	getf.sig r8 = f6		// transfer result to result register
   28.85 +	br.ret.sptk.many rp
   28.86 +END(NAME)
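
The whole division happens in floating point: frcpa supplies an initial reciprocal y0 (roughly 8 bits accurate), two Newton-Raphson-style correction steps refine the quotient, and the 2^-34 term biases the rounding so that truncation lands on the exact integer quotient. A hedged C sketch of the same refinement sequence using doubles; y0 is faked with a C division here, and this only illustrates the algebra, it is not a replacement for the assembly:

    #include <math.h>

    /* Illustrative model of the refinement steps in idiv32.S. */
    static long div32_model(long a, long b)
    {
            double fa = (double)a, fb = (double)b;
            double y0 = 1.0 / fb;                   /* stand-in for frcpa   */
            double q0 = fa * y0;                    /* q0 = a*y0            */
            double e0 = 1.0 - fb * y0;              /* e0 = -b*y0 + 1       */
            double q1 = e0 * q0 + q0;               /* q1 = e0*q0 + q0      */
            double e1 = e0 * e0 + ldexp(1.0, -34);  /* e1 = e0*e0 + 2^-34   */
            double q2 = e1 * q1 + q1;               /* q2 = e1*q1 + q1      */
            return (long)trunc(q2);                 /* q  = trunc(q2)       */
    }

The MODULO variant then recovers the remainder as r = a - q*b, which is what the final xma.l computes using the negated divisor.
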
    29.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.2 +++ b/xen/arch/ia64/linux/lib/idiv64.S	Tue Aug 02 15:59:09 2005 -0800
    29.3 @@ -0,0 +1,80 @@
    29.4 +/*
    29.5 + * Copyright (C) 1999-2000 Hewlett-Packard Co
    29.6 + * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
    29.7 + *
    29.8 + * 64-bit integer division.
    29.9 + *
   29.10 + * This code is based on the application note entitled "Divide, Square Root
   29.11 + * and Remainder Algorithms for the IA-64 Architecture".  This document
   29.12 + * is available as Intel document number 248725-002 or via the web at
   29.13 + * http://developer.intel.com/software/opensource/numerics/
   29.14 + *
   29.15 + * For more details on the theory behind these algorithms, see "IA-64
   29.16 + * and Elementary Functions" by Peter Markstein; HP Professional Books
   29.17 + * (http://www.hp.com/go/retailbooks/)
   29.18 + */
   29.19 +
   29.20 +#include <asm/asmmacro.h>
   29.21 +
   29.22 +#ifdef MODULO
   29.23 +# define OP	mod
   29.24 +#else
   29.25 +# define OP	div
   29.26 +#endif
   29.27 +
   29.28 +#ifdef UNSIGNED
   29.29 +# define SGN	u
   29.30 +# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
   29.31 +# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
   29.32 +#else
   29.33 +# define SGN
   29.34 +# define INT_TO_FP(a,b)	fcvt.xf a=b
   29.35 +# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
   29.36 +#endif
   29.37 +
   29.38 +#define PASTE1(a,b)	a##b
   29.39 +#define PASTE(a,b)	PASTE1(a,b)
   29.40 +#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,di3))
   29.41 +
   29.42 +GLOBAL_ENTRY(NAME)
   29.43 +	.regstk 2,0,0,0
   29.44 +	// Transfer inputs to FP registers.
   29.45 +	setf.sig f8 = in0
   29.46 +	setf.sig f9 = in1
   29.47 +	;;
   29.48 +	// Convert the inputs to FP, to avoid FP software-assist faults.
   29.49 +	INT_TO_FP(f8, f8)
   29.50 +	INT_TO_FP(f9, f9)
   29.51 +	;;
   29.52 +	frcpa.s1 f11, p6 = f8, f9	// y0 = frcpa(b)
   29.53 +	;;
   29.54 +(p6)	fmpy.s1 f7 = f8, f11		// q0 = a*y0
   29.55 +(p6)	fnma.s1 f6 = f9, f11, f1	// e0 = -b*y0 + 1
   29.56 +	;;
   29.57 +(p6)	fma.s1 f10 = f7, f6, f7		// q1 = q0*e0 + q0
   29.58 +(p6)	fmpy.s1 f7 = f6, f6		// e1 = e0*e0
   29.59 +	;;
   29.60 +#ifdef MODULO
   29.61 +	sub in1 = r0, in1		// in1 = -b
   29.62 +#endif
   29.63 +(p6)	fma.s1 f10 = f10, f7, f10	// q2 = q1*e1 + q1
   29.64 +(p6)	fma.s1 f6 = f11, f6, f11	// y1 = y0*e0 + y0
   29.65 +	;;
   29.66 +(p6)	fma.s1 f6 = f6, f7, f6		// y2 = y1*e1 + y1
   29.67 +(p6)	fnma.s1 f7 = f9, f10, f8	// r = -b*q2 + a
   29.68 +	;;
   29.69 +#ifdef MODULO
   29.70 +	setf.sig f8 = in0		// f8 = a
   29.71 +	setf.sig f9 = in1		// f9 = -b
   29.72 +#endif
   29.73 +(p6)	fma.s1 f11 = f7, f6, f10	// q3 = r*y2 + q2
   29.74 +	;;
   29.75 +	FP_TO_INT(f11, f11)		// q = trunc(q3)
   29.76 +	;;
   29.77 +#ifdef MODULO
   29.78 +	xma.l f11 = f11, f9, f8		// r = q*(-b) + a
   29.79 +	;;
   29.80 +#endif
   29.81 +	getf.sig r8 = f11		// transfer result to result register
   29.82 +	br.ret.sptk.many rp
   29.83 +END(NAME)
    30.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.2 +++ b/xen/arch/ia64/linux/lib/io.c	Tue Aug 02 15:59:09 2005 -0800
    30.3 @@ -0,0 +1,165 @@
    30.4 +#include <linux/config.h>
    30.5 +#include <linux/module.h>
    30.6 +#include <linux/types.h>
    30.7 +
    30.8 +#include <asm/io.h>
    30.9 +
   30.10 +/*
   30.11 + * Copy data from IO memory space to "real" memory space.
   30.12 + * This needs to be optimized.
   30.13 + */
   30.14 +void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
   30.15 +{
   30.16 +	char *dst = to;
   30.17 +
   30.18 +	while (count) {
   30.19 +		count--;
   30.20 +		*dst++ = readb(from++);
   30.21 +	}
   30.22 +}
   30.23 +EXPORT_SYMBOL(memcpy_fromio);
   30.24 +
   30.25 +/*
   30.26 + * Copy data from "real" memory space to IO memory space.
   30.27 + * This needs to be optimized.
   30.28 + */
   30.29 +void memcpy_toio(volatile void __iomem *to, const void *from, long count)
   30.30 +{
   30.31 +	const char *src = from;
   30.32 +
   30.33 +	while (count) {
   30.34 +		count--;
   30.35 +		writeb(*src++, to++);
   30.36 +	}
   30.37 +}
   30.38 +EXPORT_SYMBOL(memcpy_toio);
   30.39 +
   30.40 +/*
   30.41 + * "memset" on IO memory space.
   30.42 + * This needs to be optimized.
   30.43 + */
   30.44 +void memset_io(volatile void __iomem *dst, int c, long count)
   30.45 +{
   30.46 +	unsigned char ch = (char)(c & 0xff);
   30.47 +
   30.48 +	while (count) {
   30.49 +		count--;
   30.50 +		writeb(ch, dst);
   30.51 +		dst++;
   30.52 +	}
   30.53 +}
   30.54 +EXPORT_SYMBOL(memset_io);
   30.55 +
   30.56 +#ifdef CONFIG_IA64_GENERIC
   30.57 +
   30.58 +#undef __ia64_inb
   30.59 +#undef __ia64_inw
   30.60 +#undef __ia64_inl
   30.61 +#undef __ia64_outb
   30.62 +#undef __ia64_outw
   30.63 +#undef __ia64_outl
   30.64 +#undef __ia64_readb
   30.65 +#undef __ia64_readw
   30.66 +#undef __ia64_readl
   30.67 +#undef __ia64_readq
   30.68 +#undef __ia64_readb_relaxed
   30.69 +#undef __ia64_readw_relaxed
   30.70 +#undef __ia64_readl_relaxed
   30.71 +#undef __ia64_readq_relaxed
   30.72 +#undef __ia64_writeb
   30.73 +#undef __ia64_writew
   30.74 +#undef __ia64_writel
   30.75 +#undef __ia64_writeq
   30.76 +#undef __ia64_mmiowb
   30.77 +
   30.78 +unsigned int
   30.79 +__ia64_inb (unsigned long port)
   30.80 +{
   30.81 +	return ___ia64_inb(port);
   30.82 +}
   30.83 +
   30.84 +unsigned int
   30.85 +__ia64_inw (unsigned long port)
   30.86 +{
   30.87 +	return ___ia64_inw(port);
   30.88 +}
   30.89 +
   30.90 +unsigned int
   30.91 +__ia64_inl (unsigned long port)
   30.92 +{
   30.93 +	return ___ia64_inl(port);
   30.94 +}
   30.95 +
   30.96 +void
   30.97 +__ia64_outb (unsigned char val, unsigned long port)
   30.98 +{
   30.99 +	___ia64_outb(val, port);
  30.100 +}
  30.101 +
  30.102 +void
  30.103 +__ia64_outw (unsigned short val, unsigned long port)
  30.104 +{
  30.105 +	___ia64_outw(val, port);
  30.106 +}
  30.107 +
  30.108 +void
  30.109 +__ia64_outl (unsigned int val, unsigned long port)
  30.110 +{
  30.111 +	___ia64_outl(val, port);
  30.112 +}
  30.113 +
  30.114 +unsigned char
  30.115 +__ia64_readb (void __iomem *addr)
  30.116 +{
  30.117 +	return ___ia64_readb (addr);
  30.118 +}
  30.119 +
  30.120 +unsigned short
  30.121 +__ia64_readw (void __iomem *addr)
  30.122 +{
  30.123 +	return ___ia64_readw (addr);
  30.124 +}
  30.125 +
  30.126 +unsigned int
  30.127 +__ia64_readl (void __iomem *addr)
  30.128 +{
  30.129 +	return ___ia64_readl (addr);
  30.130 +}
  30.131 +
  30.132 +unsigned long
  30.133 +__ia64_readq (void __iomem *addr)
  30.134 +{
  30.135 +	return ___ia64_readq (addr);
  30.136 +}
  30.137 +
  30.138 +unsigned char
  30.139 +__ia64_readb_relaxed (void __iomem *addr)
  30.140 +{
  30.141 +	return ___ia64_readb (addr);
  30.142 +}
  30.143 +
  30.144 +unsigned short
  30.145 +__ia64_readw_relaxed (void __iomem *addr)
  30.146 +{
  30.147 +	return ___ia64_readw (addr);
  30.148 +}
  30.149 +
  30.150 +unsigned int
  30.151 +__ia64_readl_relaxed (void __iomem *addr)
  30.152 +{
  30.153 +	return ___ia64_readl (addr);
  30.154 +}
  30.155 +
  30.156 +unsigned long
  30.157 +__ia64_readq_relaxed (void __iomem *addr)
  30.158 +{
  30.159 +	return ___ia64_readq (addr);
  30.160 +}
  30.161 +
  30.162 +void
  30.163 +__ia64_mmiowb(void)
  30.164 +{
  30.165 +	___ia64_mmiowb();
  30.166 +}
  30.167 +
  30.168 +#endif /* CONFIG_IA64_GENERIC */
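
The routines above are deliberately byte-at-a-time so they stay safe for devices that only tolerate byte-sized accesses. A hedged usage sketch; the mapping would normally come from ioremap(), and the 64-byte window size is made up for illustration:

    #include <asm/io.h>

    /* Snapshot a small window of a device mapping into ordinary memory. */
    static void snapshot_device_window(void __iomem *regs, unsigned char *buf)
    {
            memcpy_fromio(buf, regs, 64);   /* one readb() per byte */
    }
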
    31.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.2 +++ b/xen/arch/ia64/linux/lib/ip_fast_csum.S	Tue Aug 02 15:59:09 2005 -0800
    31.3 @@ -0,0 +1,90 @@
    31.4 +/*
    31.5 + * Optimized version of the ip_fast_csum() function
    31.6 + * Used for calculating IP header checksum
    31.7 + *
    31.8 + * Return: 16bit checksum, complemented
    31.9 + *
   31.10 + * Inputs:
   31.11 + *      in0: address of buffer to checksum (char *)
   31.12 + *      in1: length of the buffer (int)
   31.13 + *
   31.14 + * Copyright (C) 2002 Intel Corp.
   31.15 + * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
   31.16 + */
   31.17 +
   31.18 +#include <asm/asmmacro.h>
   31.19 +
   31.20 +/*
   31.21 + * Since we know that most likely this function is called with buf aligned
   31.22 + * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
   31.23 + * versus calling the generic version of do_csum, which has lots of overhead in
   31.24 + * handling various alignments and sizes.  However, due to the lack of constraints
   31.25 + * put on the function input arguments, cases with alignment not on 4-byte or
   31.26 + * size not equal to 20 bytes will be handled by the generic do_csum function.
   31.27 + */
   31.28 +
   31.29 +#define in0	r32
   31.30 +#define in1	r33
   31.31 +#define ret0	r8
   31.32 +
   31.33 +GLOBAL_ENTRY(ip_fast_csum)
   31.34 +	.prologue
   31.35 +	.body
   31.36 +	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
   31.37 +	and	r14=3,in0	// is it aligned on 4-byte?
   31.38 +	add	r15=4,in0	// second source pointer
   31.39 +	;;
   31.40 +	cmp.ne.or.andcm p6,p7=r14,r0
   31.41 +	;;
   31.42 +(p7)	ld4	r20=[in0],8
   31.43 +(p7)	ld4	r21=[r15],8
   31.44 +(p6)	br.spnt	.generic
   31.45 +	;;
   31.46 +	ld4	r22=[in0],8
   31.47 +	ld4	r23=[r15],8
   31.48 +	;;
   31.49 +	ld4	r24=[in0]
   31.50 +	add	r20=r20,r21
   31.51 +	add	r22=r22,r23
   31.52 +	;;
   31.53 +	add	r20=r20,r22
   31.54 +	;;
   31.55 +	add	r20=r20,r24
   31.56 +	;;
   31.57 +	shr.u	ret0=r20,16	// now need to add the carry
   31.58 +	zxt2	r20=r20
   31.59 +	;;
   31.60 +	add	r20=ret0,r20
   31.61 +	;;
   31.62 +	shr.u	ret0=r20,16	// add carry again
   31.63 +	zxt2	r20=r20
   31.64 +	;;
   31.65 +	add	r20=ret0,r20
   31.66 +	;;
   31.67 +	shr.u	ret0=r20,16
   31.68 +	zxt2	r20=r20
   31.69 +	;;
   31.70 +	add	r20=ret0,r20
   31.71 +	;;
   31.72 +	andcm	ret0=-1,r20
   31.73 +	.restore sp		// reset frame state
   31.74 +	br.ret.sptk.many b0
   31.75 +	;;
   31.76 +
   31.77 +.generic:
   31.78 +	.prologue
   31.79 +	.save ar.pfs, r35
   31.80 +	alloc	r35=ar.pfs,2,2,2,0
   31.81 +	.save rp, r34
   31.82 +	mov	r34=b0
   31.83 +	.body
   31.84 +	dep.z	out1=in1,2,30
   31.85 +	mov	out0=in0
   31.86 +	;;
   31.87 +	br.call.sptk.many b0=do_csum
   31.88 +	;;
   31.89 +	andcm	ret0=-1,ret0
   31.90 +	mov	ar.pfs=r35
   31.91 +	mov	b0=r34
   31.92 +	br.ret.sptk.many b0
   31.93 +END(ip_fast_csum)
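
For the fast path handled above (4-byte-aligned buffer, ihl == 5, i.e. a 20-byte IP header), the checksum is just five 32-bit words summed into a wide accumulator, folded down to 16 bits, and complemented. A hedged standalone C model of that arithmetic (it mirrors the register usage above, not the kernel's declared ip_fast_csum() prototype):

    #include <stdint.h>
    #include <string.h>

    /* Model of the ip_fast_csum() fast path: 20-byte, 4-byte-aligned header. */
    static uint16_t ip_fast_csum_model(const void *hdr)
    {
            uint32_t w[5];
            uint64_t sum = 0;
            int i;

            memcpy(w, hdr, sizeof(w));              /* five 32-bit loads          */
            for (i = 0; i < 5; i++)
                    sum += w[i];
            while (sum >> 16)                       /* fold carries into 16 bits  */
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;                  /* complement, as andcm does  */
    }
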
    32.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.2 +++ b/xen/arch/ia64/linux/lib/memcpy.S	Tue Aug 02 15:59:09 2005 -0800
    32.3 @@ -0,0 +1,301 @@
    32.4 +/*
    32.5 + *
    32.6 + * Optimized version of the standard memcpy() function
    32.7 + *
    32.8 + * Inputs:
    32.9 + * 	in0:	destination address
   32.10 + *	in1:	source address
   32.11 + *	in2:	number of bytes to copy
   32.12 + * Output:
   32.13 + * 	no return value
   32.14 + *
   32.15 + * Copyright (C) 2000-2001 Hewlett-Packard Co
   32.16 + *	Stephane Eranian <eranian@hpl.hp.com>
   32.17 + *	David Mosberger-Tang <davidm@hpl.hp.com>
   32.18 + */
   32.19 +#include <asm/asmmacro.h>
   32.20 +
   32.21 +GLOBAL_ENTRY(memcpy)
   32.22 +
   32.23 +#	define MEM_LAT	21		/* latency to memory */
   32.24 +
   32.25 +#	define dst	r2
   32.26 +#	define src	r3
   32.27 +#	define retval	r8
   32.28 +#	define saved_pfs r9
   32.29 +#	define saved_lc	r10
   32.30 +#	define saved_pr	r11
   32.31 +#	define cnt	r16
   32.32 +#	define src2	r17
   32.33 +#	define t0	r18
   32.34 +#	define t1	r19
   32.35 +#	define t2	r20
   32.36 +#	define t3	r21
   32.37 +#	define t4	r22
   32.38 +#	define src_end	r23
   32.39 +
   32.40 +#	define N	(MEM_LAT + 4)
   32.41 +#	define Nrot	((N + 7) & ~7)
   32.42 +
   32.43 +	/*
   32.44 +	 * First, check if everything (src, dst, len) is a multiple of eight.  If
   32.45 +	 * so, we handle everything with no taken branches (other than the loop
   32.46 +	 * itself) and a small icache footprint.  Otherwise, we jump off to
   32.47 +	 * the more general copy routine handling arbitrary
   32.48 +	 * sizes/alignment etc.
   32.49 +	 */
   32.50 +	.prologue
   32.51 +	.save ar.pfs, saved_pfs
   32.52 +	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
   32.53 +	.save ar.lc, saved_lc
   32.54 +	mov saved_lc=ar.lc
   32.55 +	or t0=in0,in1
   32.56 +	;;
   32.57 +
   32.58 +	or t0=t0,in2
   32.59 +	.save pr, saved_pr
   32.60 +	mov saved_pr=pr
   32.61 +
   32.62 +	.body
   32.63 +
   32.64 +	cmp.eq p6,p0=in2,r0	// zero length?
   32.65 +	mov retval=in0		// return dst
   32.66 +(p6)	br.ret.spnt.many rp	// zero length, return immediately
   32.67 +	;;
   32.68 +
   32.69 +	mov dst=in0		// copy because of rotation
   32.70 +	shr.u cnt=in2,3		// number of 8-byte words to copy
   32.71 +	mov pr.rot=1<<16
   32.72 +	;;
   32.73 +
   32.74 +	adds cnt=-1,cnt		// br.ctop is repeat/until
   32.75 +	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
   32.76 +	mov ar.ec=N
   32.77 +	;;
   32.78 +
   32.79 +	and t0=0x7,t0
   32.80 +	mov ar.lc=cnt
   32.81 +	;;
   32.82 +	cmp.ne p6,p0=t0,r0
   32.83 +
   32.84 +	mov src=in1		// copy because of rotation
   32.85 +(p7)	br.cond.spnt.few .memcpy_short
   32.86 +(p6)	br.cond.spnt.few .memcpy_long
   32.87 +	;;
   32.88 +	nop.m	0
   32.89 +	;;
   32.90 +	nop.m	0
   32.91 +	nop.i	0
   32.92 +	;;
   32.93 +	nop.m	0
   32.94 +	;;
   32.95 +	.rotr val[N]
   32.96 +	.rotp p[N]
   32.97 +	.align 32
   32.98 +1: { .mib
   32.99 +(p[0])	ld8 val[0]=[src],8
  32.100 +	nop.i 0
  32.101 +	brp.loop.imp 1b, 2f
  32.102 +}
  32.103 +2: { .mfb
  32.104 +(p[N-1])st8 [dst]=val[N-1],8
  32.105 +	nop.f 0
  32.106 +	br.ctop.dptk.few 1b
  32.107 +}
  32.108 +	;;
  32.109 +	mov ar.lc=saved_lc
  32.110 +	mov pr=saved_pr,-1
  32.111 +	mov ar.pfs=saved_pfs
  32.112 +	br.ret.sptk.many rp
  32.113 +
  32.114 +	/*
  32.115 +	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
  32.116 +	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
  32.117 +	 * get used very often (gcc inlines small copies) and due to atomicity
  32.118 +	 * issues, we want to avoid read-modify-write of entire words.
  32.119 +	 */
  32.120 +	.align 32
  32.121 +.memcpy_short:
  32.122 +	adds cnt=-1,in2		// br.ctop is repeat/until
  32.123 +	mov ar.ec=MEM_LAT
  32.124 +	brp.loop.imp 1f, 2f
  32.125 +	;;
  32.126 +	mov ar.lc=cnt
  32.127 +	;;
  32.128 +	nop.m	0
  32.129 +	;;
  32.130 +	nop.m	0
  32.131 +	nop.i	0
  32.132 +	;;
  32.133 +	nop.m	0
  32.134 +	;;
  32.135 +	nop.m	0
  32.136 +	;;
  32.137 +	/*
  32.138 +	 * It is faster to put a stop bit in the loop here because it makes
  32.139 +	 * the pipeline shorter (and latency is what matters on short copies).
  32.140 +	 */
  32.141 +	.align 32
  32.142 +1: { .mib
  32.143 +(p[0])	ld1 val[0]=[src],1
  32.144 +	nop.i 0
  32.145 +	brp.loop.imp 1b, 2f
  32.146 +} ;;
  32.147 +2: { .mfb
  32.148 +(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
  32.149 +	nop.f 0
  32.150 +	br.ctop.dptk.few 1b
  32.151 +} ;;
  32.152 +	mov ar.lc=saved_lc
  32.153 +	mov pr=saved_pr,-1
  32.154 +	mov ar.pfs=saved_pfs
  32.155 +	br.ret.sptk.many rp
  32.156 +
  32.157 +	/*
  32.158 +	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
  32.159 +	 * an overriding concern here, but throughput is.  We first do
  32.160 +	 * sub-word copying until the destination is aligned, then we check
  32.161 +	 * if the source is also aligned.  If so, we do a simple load/store-loop
  32.162 +	 * until there are less than 8 bytes left over and then we do the tail,
  32.163 +	 * by storing the last few bytes using sub-word copying.  If the source
  32.164 +	 * is not aligned, we branch off to the non-congruent loop.
  32.165 +	 *
  32.166 +	 *   stage:   op:
  32.167 +	 *         0  ld
  32.168 +	 *	   :
  32.169 +	 * MEM_LAT+3  shrp
  32.170 +	 * MEM_LAT+4  st
  32.171 +	 *
  32.172 +	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
  32.173 +	 * seems to introduce an unavoidable bubble in the pipeline so the overall
  32.174 +	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
  32.175 +	 * of 4 bytes/cycle.  Still not bad.
  32.176 +	 */
  32.177 +#	undef N
  32.178 +#	undef Nrot
  32.179 +#	define N	(MEM_LAT + 5)		/* number of stages */
  32.180 +#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
  32.181 +
  32.182 +#define LOG_LOOP_SIZE	6
  32.183 +
  32.184 +.memcpy_long:
  32.185 +	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
  32.186 +	and t0=-8,src		// t0 = src & ~7
  32.187 +	and t2=7,src		// t2 = src & 7
  32.188 +	;;
  32.189 +	ld8 t0=[t0]		// t0 = 1st source word
  32.190 +	adds src2=7,src		// src2 = (src + 7)
  32.191 +	sub t4=r0,dst		// t4 = -dst
  32.192 +	;;
  32.193 +	and src2=-8,src2	// src2 = (src + 7) & ~7
  32.194 +	shl t2=t2,3		// t2 = 8*(src & 7)
  32.195 +	shl t4=t4,3		// t4 = 8*(dst & 7)
  32.196 +	;;
  32.197 +	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
  32.198 +	sub t3=64,t2		// t3 = 64-8*(src & 7)
  32.199 +	shr.u t0=t0,t2
  32.200 +	;;
  32.201 +	add src_end=src,in2
  32.202 +	shl t1=t1,t3
  32.203 +	mov pr=t4,0x38		// (p5,p4,p3)=(dst & 7)
  32.204 +	;;
  32.205 +	or t0=t0,t1
  32.206 +	mov cnt=r0
  32.207 +	adds src_end=-1,src_end
  32.208 +	;;
  32.209 +(p3)	st1 [dst]=t0,1
  32.210 +(p3)	shr.u t0=t0,8
  32.211 +(p3)	adds cnt=1,cnt
  32.212 +	;;
  32.213 +(p4)	st2 [dst]=t0,2
  32.214 +(p4)	shr.u t0=t0,16
  32.215 +(p4)	adds cnt=2,cnt
  32.216 +	;;
  32.217 +(p5)	st4 [dst]=t0,4
  32.218 +(p5)	adds cnt=4,cnt
  32.219 +	and src_end=-8,src_end	// src_end = last word of source buffer
  32.220 +	;;
  32.221 +
  32.222 +	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
  32.223 +
  32.224 +1:{	add src=cnt,src			// make src point to remainder of source buffer
  32.225 +	sub cnt=in2,cnt			// cnt = number of bytes left to copy
  32.226 +	mov t4=ip
  32.227 +  }	;;
  32.228 +	and src2=-8,src			// align source pointer
  32.229 +	adds t4=.memcpy_loops-1b,t4
  32.230 +	mov ar.ec=N
  32.231 +
  32.232 +	and t0=7,src			// t0 = src & 7
  32.233 +	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
  32.234 +	shl cnt=cnt,3			// move bits 0-2 to 3-5
  32.235 +	;;
  32.236 +
  32.237 +	.rotr val[N+1], w[2]
  32.238 +	.rotp p[N]
  32.239 +
  32.240 +	cmp.ne p6,p0=t0,r0		// is src aligned, too?
  32.241 +	shl t0=t0,LOG_LOOP_SIZE		// t0 = 8*(src & 7)
  32.242 +	adds t2=-1,t2			// br.ctop is repeat/until
  32.243 +	;;
  32.244 +	add t4=t0,t4
  32.245 +	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
  32.246 +	mov ar.lc=t2
  32.247 +	;;
  32.248 +	nop.m	0
  32.249 +	;;
  32.250 +	nop.m	0
  32.251 +	nop.i	0
  32.252 +	;;
  32.253 +	nop.m	0
  32.254 +	;;
  32.255 +(p6)	ld8 val[1]=[src2],8		// prime the pump...
  32.256 +	mov b6=t4
  32.257 +	br.sptk.few b6
  32.258 +	;;
  32.259 +
  32.260 +.memcpy_tail:
  32.261 +	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
  32.262 +	// less than 8) and t0 contains the last few bytes of the src buffer:
  32.263 +(p5)	st4 [dst]=t0,4
  32.264 +(p5)	shr.u t0=t0,32
  32.265 +	mov ar.lc=saved_lc
  32.266 +	;;
  32.267 +(p4)	st2 [dst]=t0,2
  32.268 +(p4)	shr.u t0=t0,16
  32.269 +	mov ar.pfs=saved_pfs
  32.270 +	;;
  32.271 +(p3)	st1 [dst]=t0
  32.272 +	mov pr=saved_pr,-1
  32.273 +	br.ret.sptk.many rp
  32.274 +
  32.275 +///////////////////////////////////////////////////////
  32.276 +	.align 64
  32.277 +
  32.278 +#define COPY(shift,index)									\
  32.279 + 1: { .mib											\
  32.280 +	(p[0])		ld8 val[0]=[src2],8;							\
  32.281 +	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
  32.282 +			brp.loop.imp 1b, 2f							\
  32.283 +    };												\
  32.284 + 2: { .mfb											\
  32.285 +	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
  32.286 +			nop.f 0;								\
  32.287 +			br.ctop.dptk.few 1b;							\
  32.288 +    };												\
  32.289 +			;;									\
  32.290 +			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
  32.291 +			;;									\
  32.292 +			shrp t0=val[N-1],val[N-index],shift;					\
  32.293 +			br .memcpy_tail
  32.294 +.memcpy_loops:
  32.295 +	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
  32.296 +	COPY(8, 0)
  32.297 +	COPY(16, 0)
  32.298 +	COPY(24, 0)
  32.299 +	COPY(32, 0)
  32.300 +	COPY(40, 0)
  32.301 +	COPY(48, 0)
  32.302 +	COPY(56, 0)
  32.303 +
  32.304 +END(memcpy)
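
The non-congruent path above never does unaligned stores: each destination word is assembled from two consecutive aligned source words with shrp, and the eight COPY(shift, ...) instances just bake in the eight possible shift amounts. A hedged little-endian C illustration of that merge step (only the word-combining idea, not the software pipelining):

    #include <stdint.h>

    /* Build one aligned destination word from two consecutive aligned
     * source words when the source is off by 'off' bytes (1..7), the
     * little-endian equivalent of shrp in the COPY() macro. */
    static uint64_t merge_words(uint64_t lo, uint64_t hi, unsigned int off)
    {
            unsigned int shift = 8 * off;           /* must be 8..56 here */

            return (lo >> shift) | (hi << (64 - shift));
    }
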
    33.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.2 +++ b/xen/arch/ia64/linux/lib/memcpy_mck.S	Tue Aug 02 15:59:09 2005 -0800
    33.3 @@ -0,0 +1,661 @@
    33.4 +/*
    33.5 + * Itanium 2-optimized version of memcpy and copy_user function
    33.6 + *
    33.7 + * Inputs:
    33.8 + * 	in0:	destination address
    33.9 + *	in1:	source address
   33.10 + *	in2:	number of bytes to copy
   33.11 + * Output:
   33.12 + * 	0 on success, or the number of bytes NOT copied if an error occurred.
   33.13 + *
   33.14 + * Copyright (C) 2002 Intel Corp.
   33.15 + * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
   33.16 + */
   33.17 +#include <linux/config.h>
   33.18 +#include <asm/asmmacro.h>
   33.19 +#include <asm/page.h>
   33.20 +
   33.21 +#define EK(y...) EX(y)
   33.22 +
   33.23 +/* McKinley specific optimization */
   33.24 +
   33.25 +#define retval		r8
   33.26 +#define saved_pfs	r31
   33.27 +#define saved_lc	r10
   33.28 +#define saved_pr	r11
   33.29 +#define saved_in0	r14
   33.30 +#define saved_in1	r15
   33.31 +#define saved_in2	r16
   33.32 +
   33.33 +#define src0		r2
   33.34 +#define src1		r3
   33.35 +#define dst0		r17
   33.36 +#define dst1		r18
   33.37 +#define cnt		r9
   33.38 +
   33.39 +/* r19-r30 are temp for each code section */
   33.40 +#define PREFETCH_DIST	8
   33.41 +#define src_pre_mem	r19
   33.42 +#define dst_pre_mem	r20
   33.43 +#define src_pre_l2	r21
   33.44 +#define dst_pre_l2	r22
   33.45 +#define t1		r23
   33.46 +#define t2		r24
   33.47 +#define t3		r25
   33.48 +#define t4		r26
   33.49 +#define t5		t1	// alias!
   33.50 +#define t6		t2	// alias!
   33.51 +#define t7		t3	// alias!
   33.52 +#define n8		r27
   33.53 +#define t9		t5	// alias!
   33.54 +#define t10		t4	// alias!
   33.55 +#define t11		t7	// alias!
   33.56 +#define t12		t6	// alias!
   33.57 +#define t14		t10	// alias!
   33.58 +#define t13		r28
   33.59 +#define t15		r29
   33.60 +#define tmp		r30
   33.61 +
   33.62 +/* defines for long_copy block */
   33.63 +#define	A	0
   33.64 +#define B	(PREFETCH_DIST)
   33.65 +#define C	(B + PREFETCH_DIST)
   33.66 +#define D	(C + 1)
   33.67 +#define N	(D + 1)
   33.68 +#define Nrot	((N + 7) & ~7)
   33.69 +
   33.70 +/* alias */
   33.71 +#define in0		r32
   33.72 +#define in1		r33
   33.73 +#define in2		r34
   33.74 +
   33.75 +GLOBAL_ENTRY(memcpy)
   33.76 +	and	r28=0x7,in0
   33.77 +	and	r29=0x7,in1
   33.78 +	mov	f6=f0
   33.79 +	br.cond.sptk .common_code
   33.80 +	;;
   33.81 +GLOBAL_ENTRY(__copy_user)
   33.82 +	.prologue
   33.83 +// check dest alignment
   33.84 +	and	r28=0x7,in0
   33.85 +	and	r29=0x7,in1
   33.86 +	mov	f6=f1
   33.87 +	mov	saved_in0=in0	// save dest pointer
   33.88 +	mov	saved_in1=in1	// save src pointer
   33.89 +	mov	saved_in2=in2	// save len
   33.90 +	;;
   33.91 +.common_code:
   33.92 +	cmp.gt	p15,p0=8,in2	// check for small size
   33.93 +	cmp.ne	p13,p0=0,r28	// check dest alignment
   33.94 +	cmp.ne	p14,p0=0,r29	// check src alignment
   33.95 +	add	src0=0,in1
   33.96 +	sub	r30=8,r28	// for .align_dest
   33.97 +	mov	retval=r0	// initialize return value
   33.98 +	;;
   33.99 +	add	dst0=0,in0
  33.100 +	add	dst1=1,in0	// dest odd index
  33.101 +	cmp.le	p6,p0 = 1,r30	// for .align_dest
  33.102 +(p15)	br.cond.dpnt .memcpy_short
  33.103 +(p13)	br.cond.dpnt .align_dest
  33.104 +(p14)	br.cond.dpnt .unaligned_src
  33.105 +	;;
  33.106 +
  33.107 +// both dest and src are aligned on 8-byte boundary
  33.108 +.aligned_src:
  33.109 +	.save ar.pfs, saved_pfs
  33.110 +	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
  33.111 +	.save pr, saved_pr
  33.112 +	mov	saved_pr=pr
  33.113 +
  33.114 +	shr.u	cnt=in2,7	// this many cache lines
  33.115 +	;;
  33.116 +	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
  33.117 +	cmp.lt	p7,p8=1,cnt
  33.118 +	.save ar.lc, saved_lc
  33.119 +	mov	saved_lc=ar.lc
  33.120 +	.body
  33.121 +	add	cnt=-1,cnt
  33.122 +	add	src_pre_mem=0,in1	// prefetch src pointer
  33.123 +	add	dst_pre_mem=0,in0	// prefetch dest pointer
  33.124 +	;;
  33.125 +(p7)	mov	ar.lc=cnt	// prefetch count
  33.126 +(p8)	mov	ar.lc=r0
  33.127 +(p6)	br.cond.dpnt .long_copy
  33.128 +	;;
  33.129 +
  33.130 +.prefetch:
  33.131 +	lfetch.fault	  [src_pre_mem], 128
  33.132 +	lfetch.fault.excl [dst_pre_mem], 128
  33.133 +	br.cloop.dptk.few .prefetch
  33.134 +	;;
  33.135 +
  33.136 +.medium_copy:
  33.137 +	and	tmp=31,in2	// copy length after iteration
  33.138 +	shr.u	r29=in2,5	// number of 32-byte iteration
  33.139 +	add	dst1=8,dst0	// 2nd dest pointer
  33.140 +	;;
  33.141 +	add	cnt=-1,r29	// ctop iteration adjustment
  33.142 +	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
  33.143 +	add	src1=8,src0	// 2nd src pointer
  33.144 +	cmp.le	p6,p0=8,tmp
  33.145 +	;;
  33.146 +	cmp.le	p7,p0=16,tmp
  33.147 +	mov	ar.lc=cnt	// loop setup
  33.148 +	cmp.eq	p16,p17 = r0,r0
  33.149 +	mov	ar.ec=2
  33.150 +(p10)	br.dpnt.few .aligned_src_tail
  33.151 +	;;
  33.152 +	TEXT_ALIGN(32)
  33.153 +1:
  33.154 +EX(.ex_handler, (p16)	ld8	r34=[src0],16)
  33.155 +EK(.ex_handler, (p16)	ld8	r38=[src1],16)
  33.156 +EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
  33.157 +EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
  33.158 +	;;
  33.159 +EX(.ex_handler, (p16)	ld8	r32=[src0],16)
  33.160 +EK(.ex_handler, (p16)	ld8	r36=[src1],16)
  33.161 +EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
  33.162 +EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
  33.163 +	br.ctop.dptk.few 1b
  33.164 +	;;
  33.165 +
  33.166 +.aligned_src_tail:
  33.167 +EX(.ex_handler, (p6)	ld8	t1=[src0])
  33.168 +	mov	ar.lc=saved_lc
  33.169 +	mov	ar.pfs=saved_pfs
  33.170 +EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
  33.171 +	cmp.le	p8,p0=24,tmp
  33.172 +	and	r21=-8,tmp
  33.173 +	;;
  33.174 +EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
  33.175 +EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
  33.176 +	and	in2=7,tmp	// remaining length
  33.177 +EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
  33.178 +	add	src0=src0,r21	// setting up src pointer
  33.179 +	add	dst0=dst0,r21	// setting up dest pointer
  33.180 +	;;
  33.181 +EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
  33.182 +	mov	pr=saved_pr,-1
  33.183 +	br.dptk.many .memcpy_short
  33.184 +	;;
  33.185 +
  33.186 +/* code taken from copy_page_mck */
  33.187 +.long_copy:
  33.188 +	.rotr v[2*PREFETCH_DIST]
  33.189 +	.rotp p[N]
  33.190 +
  33.191 +	mov src_pre_mem = src0
  33.192 +	mov pr.rot = 0x10000
  33.193 +	mov ar.ec = 1				// special unrolled loop
  33.194 +
  33.195 +	mov dst_pre_mem = dst0
  33.196 +
  33.197 +	add src_pre_l2 = 8*8, src0
  33.198 +	add dst_pre_l2 = 8*8, dst0
  33.199 +	;;
  33.200 +	add src0 = 8, src_pre_mem		// first t1 src
  33.201 +	mov ar.lc = 2*PREFETCH_DIST - 1
  33.202 +	shr.u cnt=in2,7				// number of lines
  33.203 +	add src1 = 3*8, src_pre_mem		// first t3 src
  33.204 +	add dst0 = 8, dst_pre_mem		// first t1 dst
  33.205 +	add dst1 = 3*8, dst_pre_mem		// first t3 dst
  33.206 +	;;
  33.207 +	and tmp=127,in2				// remaining bytes after this block
  33.208 +	add cnt = -(2*PREFETCH_DIST) - 1, cnt
  33.209 +	// same as .line_copy loop, but with all predicated-off instructions removed:
  33.210 +.prefetch_loop:
  33.211 +EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
  33.212 +EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
  33.213 +	br.ctop.sptk .prefetch_loop
  33.214 +	;;
  33.215 +	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
  33.216 +	mov ar.lc = cnt
  33.217 +	mov ar.ec = N				// # of stages in pipeline
  33.218 +	;;
  33.219 +.line_copy:
  33.220 +EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
  33.221 +EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
  33.222 +EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
  33.223 +EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
  33.224 +	;;
  33.225 +EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
  33.226 +EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
  33.227 +EX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
  33.228 +EK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
  33.229 +	;;
  33.230 +EX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
  33.231 +EK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
  33.232 +EX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
  33.233 +EK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
  33.234 +	;;
  33.235 +EX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
  33.236 +EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
  33.237 +EX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
  33.238 +EK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
  33.239 +	;;
  33.240 +EX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
  33.241 +EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
  33.242 +EX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
  33.243 +EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
  33.244 +	;;
  33.245 +EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
  33.246 +EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
  33.247 +EX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
  33.248 +EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
  33.249 +	;;
  33.250 +EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
  33.251 +EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
  33.252 +EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
  33.253 +EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
  33.254 +	;;
  33.255 +EX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
  33.256 +EK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
  33.257 +EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
  33.258 +EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
  33.259 +	br.ctop.sptk .line_copy
  33.260 +	;;
  33.261 +
  33.262 +	add dst0=-8,dst0
  33.263 +	add src0=-8,src0
  33.264 +	mov in2=tmp
  33.265 +	.restore sp
  33.266 +	br.sptk.many .medium_copy
  33.267 +	;;
  33.268 +
  33.269 +#define BLOCK_SIZE	128*32
  33.270 +#define blocksize	r23
  33.271 +#define curlen		r24
  33.272 +
  33.273 +// dest is on 8-byte boundary, src is not. We need to do
  33.274 +// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
  33.275 +.unaligned_src:
  33.276 +	.prologue
  33.277 +	.save ar.pfs, saved_pfs
  33.278 +	alloc	saved_pfs=ar.pfs,3,5,0,8
  33.279 +	.save ar.lc, saved_lc
  33.280 +	mov	saved_lc=ar.lc
  33.281 +	.save pr, saved_pr
  33.282 +	mov	saved_pr=pr
  33.283 +	.body
  33.284 +.4k_block:
  33.285 +	mov	saved_in0=dst0	// need to save all input arguments
  33.286 +	mov	saved_in2=in2
  33.287 +	mov	blocksize=BLOCK_SIZE
  33.288 +	;;
  33.289 +	cmp.lt	p6,p7=blocksize,in2
  33.290 +	mov	saved_in1=src0
  33.291 +	;;
  33.292 +(p6)	mov	in2=blocksize
  33.293 +	;;
  33.294 +	shr.u	r21=in2,7	// this much cache line
  33.295 +	shr.u	r22=in2,4	// number of 16-byte iteration
  33.296 +	and	curlen=15,in2	// copy length after iteration
  33.297 +	and	r30=7,src0	// source alignment
  33.298 +	;;
  33.299 +	cmp.lt	p7,p8=1,r21
  33.300 +	add	cnt=-1,r21
  33.301 +	;;
  33.302 +
  33.303 +	add	src_pre_mem=0,src0	// prefetch src pointer
  33.304 +	add	dst_pre_mem=0,dst0	// prefetch dest pointer
  33.305 +	and	src0=-8,src0		// 1st src pointer
  33.306 +(p7)	mov	ar.lc = r21
  33.307 +(p8)	mov	ar.lc = r0
  33.308 +	;;
  33.309 +	TEXT_ALIGN(32)
  33.310 +1:	lfetch.fault	  [src_pre_mem], 128
  33.311 +	lfetch.fault.excl [dst_pre_mem], 128
  33.312 +	br.cloop.dptk.few 1b
  33.313 +	;;
  33.314 +
  33.315 +	shladd	dst1=r22,3,dst0	// 2nd dest pointer
  33.316 +	shladd	src1=r22,3,src0	// 2nd src pointer
  33.317 +	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
  33.318 +	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
  33.319 +	add	cnt=-1,r22	// ctop iteration adjustment
  33.320 +	;;
  33.321 +EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
  33.322 +EK(.ex_handler, (p9)	ld8	r37=[src1],8)
  33.323 +(p8)	br.dpnt.few .noloop
  33.324 +	;;
  33.325 +
   33.326 +// The jump address is calculated based on src alignment. The COPYU
   33.327 +// macro below needs to confine its size to a power of two, so an entry
   33.328 +// can be calculated using shl instead of an expensive multiply. The
   33.329 +// size is then hard-coded by the following #define to match the
   33.330 +// actual size.  This makes it somewhat tedious: when the COPYU macro
   33.331 +// gets changed, this needs to be adjusted to match.
  33.332 +#define LOOP_SIZE 6
  33.333 +1:
  33.334 +	mov	r29=ip		// jmp_table thread
  33.335 +	mov	ar.lc=cnt
  33.336 +	;;
  33.337 +	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
  33.338 +	shl	r28=r30, LOOP_SIZE	// jmp_table thread
  33.339 +	mov	ar.ec=2		// loop setup
  33.340 +	;;
  33.341 +	add	r29=r29,r28		// jmp_table thread
  33.342 +	cmp.eq	p16,p17=r0,r0
  33.343 +	;;
  33.344 +	mov	b6=r29			// jmp_table thread
  33.345 +	;;
  33.346 +	br.cond.sptk.few b6
  33.347 +
  33.348 +// for 8-15 byte case
  33.349 +// We will skip the loop, but need to replicate the side effect
  33.350 +// that the loop produces.
  33.351 +.noloop:
  33.352 +EX(.ex_handler, (p6)	ld8	r37=[src1],8)
  33.353 +	add	src0=8,src0
  33.354 +(p6)	shl	r25=r30,3
  33.355 +	;;
  33.356 +EX(.ex_handler, (p6)	ld8	r27=[src1])
  33.357 +(p6)	shr.u	r28=r37,r25
  33.358 +(p6)	sub	r26=64,r25
  33.359 +	;;
  33.360 +(p6)	shl	r27=r27,r26
  33.361 +	;;
  33.362 +(p6)	or	r21=r28,r27
  33.363 +
  33.364 +.unaligned_src_tail:
  33.365 +/* check if we have more than blocksize to copy, if so go back */
  33.366 +	cmp.gt	p8,p0=saved_in2,blocksize
  33.367 +	;;
  33.368 +(p8)	add	dst0=saved_in0,blocksize
  33.369 +(p8)	add	src0=saved_in1,blocksize
  33.370 +(p8)	sub	in2=saved_in2,blocksize
  33.371 +(p8)	br.dpnt	.4k_block
  33.372 +	;;
  33.373 +
   33.374 +/* We have up to 15 bytes to copy in the tail.
   33.375 + * Part of the work is already done in the jump table code;
   33.376 + * we are in the following state.
  33.377 + * src side:
  33.378 + * 
  33.379 + *   xxxxxx xx                   <----- r21 has xxxxxxxx already
  33.380 + * -------- -------- --------
  33.381 + * 0        8        16
  33.382 + *          ^
  33.383 + *          |
  33.384 + *          src1
  33.385 + * 
  33.386 + * dst
  33.387 + * -------- -------- --------
  33.388 + * ^
  33.389 + * |
  33.390 + * dst1
  33.391 + */
  33.392 +EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
  33.393 +(p6)	add	curlen=-8,curlen	// update length
  33.394 +	mov	ar.pfs=saved_pfs
  33.395 +	;;
  33.396 +	mov	ar.lc=saved_lc
  33.397 +	mov	pr=saved_pr,-1
  33.398 +	mov	in2=curlen	// remaining length
  33.399 +	mov	dst0=dst1	// dest pointer
  33.400 +	add	src0=src1,r30	// forward by src alignment
  33.401 +	;;
  33.402 +
  33.403 +// 7 byte or smaller.
  33.404 +.memcpy_short:
  33.405 +	cmp.le	p8,p9   = 1,in2
  33.406 +	cmp.le	p10,p11 = 2,in2
  33.407 +	cmp.le	p12,p13 = 3,in2
  33.408 +	cmp.le	p14,p15 = 4,in2
  33.409 +	add	src1=1,src0	// second src pointer
  33.410 +	add	dst1=1,dst0	// second dest pointer
  33.411 +	;;
  33.412 +
  33.413 +EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
  33.414 +EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
  33.415 +(p9)	br.ret.dpnt rp		// 0 byte copy
  33.416 +	;;
  33.417 +
  33.418 +EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
  33.419 +EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
  33.420 +(p11)	br.ret.dpnt rp		// 1 byte copy
  33.421 +
  33.422 +EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
  33.423 +EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
  33.424 +(p13)	br.ret.dpnt rp		// 2 byte copy
  33.425 +	;;
  33.426 +
  33.427 +	cmp.le	p6,p7   = 5,in2
  33.428 +	cmp.le	p8,p9   = 6,in2
  33.429 +	cmp.le	p10,p11 = 7,in2
  33.430 +
  33.431 +EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
  33.432 +EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
  33.433 +(p15)	br.ret.dpnt rp		// 3 byte copy
  33.434 +	;;
  33.435 +
  33.436 +EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
  33.437 +EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
  33.438 +(p7)	br.ret.dpnt rp		// 4 byte copy
  33.439 +	;;
  33.440 +
  33.441 +EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
  33.442 +EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
  33.443 +(p9)	br.ret.dptk rp		// 5 byte copy
  33.444 +
  33.445 +EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
  33.446 +(p11)	br.ret.dptk rp		// 6 byte copy
  33.447 +	;;
  33.448 +
  33.449 +EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
  33.450 +	br.ret.dptk rp		// done all cases
  33.451 +
  33.452 +
  33.453 +/* Align dest to nearest 8-byte boundary. We know we have at
  33.454 + * least 7 bytes to copy, enough to crawl to 8-byte boundary.
   33.455 + * The actual number of bytes to crawl depends on the dest alignment.
   33.456 + * 7 bytes or less is taken care of at .memcpy_short
  33.457 +
  33.458 + * src0 - source even index
  33.459 + * src1 - source  odd index
  33.460 + * dst0 - dest even index
  33.461 + * dst1 - dest  odd index
  33.462 + * r30  - distance to 8-byte boundary
  33.463 + */
  33.464 +
  33.465 +.align_dest:
  33.466 +	add	src1=1,in1	// source odd index
  33.467 +	cmp.le	p7,p0 = 2,r30	// for .align_dest
  33.468 +	cmp.le	p8,p0 = 3,r30	// for .align_dest
  33.469 +EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
  33.470 +	cmp.le	p9,p0 = 4,r30	// for .align_dest
  33.471 +	cmp.le	p10,p0 = 5,r30
  33.472 +	;;
  33.473 +EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
  33.474 +EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
  33.475 +	cmp.le	p11,p0 = 6,r30
  33.476 +EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
  33.477 +	cmp.le	p12,p0 = 7,r30
  33.478 +	;;
  33.479 +EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
  33.480 +EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
  33.481 +EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
  33.482 +EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
  33.483 +	;;
  33.484 +EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
  33.485 +EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
  33.486 +	cmp.eq	p6,p7=r28,r29
  33.487 +EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
  33.488 +EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
  33.489 +	sub	in2=in2,r30
  33.490 +	;;
  33.491 +EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
  33.492 +EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
  33.493 +	add	dst0=in0,r30	// setup arguments
  33.494 +	add	src0=in1,r30
  33.495 +(p6)	br.cond.dptk .aligned_src
  33.496 +(p7)	br.cond.dpnt .unaligned_src
  33.497 +	;;
  33.498 +
  33.499 +/* main loop body in jump table format */
  33.500 +#define COPYU(shift)									\
  33.501 +1:											\
  33.502 +EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
  33.503 +EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
  33.504 +		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
  33.505 +EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
  33.506 +		 nop.m	0;								\
  33.507 +		 (p16)	shrp	r38=r36,r37,shift;					\
  33.508 +EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
  33.509 +EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
  33.510 +		 br.ctop.dptk.few 1b;;							\
  33.511 +		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
  33.512 +		 shrp	r21=r22,r38,shift;	/* speculative work */			\
  33.513 +		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
  33.514 +		 ;;
  33.515 +	TEXT_ALIGN(32)
  33.516 +.jump_table:
  33.517 +	COPYU(8)	// unaligned cases
  33.518 +.jmp1:
  33.519 +	COPYU(16)
  33.520 +	COPYU(24)
  33.521 +	COPYU(32)
  33.522 +	COPYU(40)
  33.523 +	COPYU(48)
  33.524 +	COPYU(56)
  33.525 +
  33.526 +#undef A
  33.527 +#undef B
  33.528 +#undef C
  33.529 +#undef D
  33.530 +END(memcpy)
  33.531 +
  33.532 +/*
  33.533 + * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
  33.534 + * instruction failed in the bundle.  The exception algorithm is that we
  33.535 + * first figure out the faulting address, then detect if there is any
  33.536 + * progress made on the copy, if so, redo the copy from last known copied
   33.537 + * location up to the faulting address (exclusive). In the copy_from_user
   33.538 + * case, the remaining bytes in the kernel buffer will be zeroed.
   33.539 + *
   33.540 + * Take copy_from_user as an example: the code issues multiple loads
   33.541 + * in a bundle, and those loads could span two pages, so the
   33.542 + * faulting address is calculated as page_round_down(max(src0, src1)).
  33.543 + * This is based on knowledge that if we can access one byte in a page, we
  33.544 + * can access any byte in that page.
  33.545 + *
  33.546 + * predicate used in the exception handler:
  33.547 + * p6-p7: direction
  33.548 + * p10-p11: src faulting addr calculation
  33.549 + * p12-p13: dst faulting addr calculation
  33.550 + */
  33.551 +
  33.552 +#define A	r19
  33.553 +#define B	r20
  33.554 +#define C	r21
  33.555 +#define D	r22
  33.556 +#define F	r28
  33.557 +
  33.558 +#define memset_arg0	r32
  33.559 +#define memset_arg2	r33
  33.560 +
  33.561 +#define saved_retval	loc0
  33.562 +#define saved_rtlink	loc1
  33.563 +#define saved_pfs_stack	loc2
  33.564 +
  33.565 +.ex_hndlr_s:
  33.566 +	add	src0=8,src0
  33.567 +	br.sptk .ex_handler
  33.568 +	;;
  33.569 +.ex_hndlr_d:
  33.570 +	add	dst0=8,dst0
  33.571 +	br.sptk .ex_handler
  33.572 +	;;
  33.573 +.ex_hndlr_lcpy_1:
  33.574 +	mov	src1=src_pre_mem
  33.575 +	mov	dst1=dst_pre_mem
  33.576 +	cmp.gtu	p10,p11=src_pre_mem,saved_in1
  33.577 +	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
  33.578 +	;;
  33.579 +(p10)	add	src0=8,saved_in1
  33.580 +(p11)	mov	src0=saved_in1
  33.581 +(p12)	add	dst0=8,saved_in0
  33.582 +(p13)	mov	dst0=saved_in0
  33.583 +	br.sptk	.ex_handler
  33.584 +.ex_handler_lcpy:
   33.585 +	// in the line_copy block, the preload addresses should always be ahead
   33.586 +	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
   33.587 +	// always be ahead of src0/dst0.
  33.588 +	mov	src1=src_pre_mem
  33.589 +	mov	dst1=dst_pre_mem
  33.590 +.ex_handler:
  33.591 +	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
  33.592 +	mov	ar.lc=saved_lc
  33.593 +	mov	ar.pfs=saved_pfs
  33.594 +	;;
   33.595 +.ex_handler_short: // fault occurred in sections that didn't change pr, lc, pfs
  33.596 +	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
  33.597 +	cmp.ltu	p10,p11=src0,src1
  33.598 +	cmp.ltu	p12,p13=dst0,dst1
  33.599 +	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
  33.600 +	mov	tmp = dst0
  33.601 +	;;
  33.602 +(p11)	mov	src1 = src0		// pick the larger of the two
  33.603 +(p13)	mov	dst0 = dst1		// make dst0 the smaller one
  33.604 +(p13)	mov	dst1 = tmp		// and dst1 the larger one
  33.605 +	;;
  33.606 +(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
  33.607 +(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
  33.608 +	;;
  33.609 +(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
  33.610 +(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
  33.611 +	mov	retval=saved_in2
  33.612 +(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
  33.613 +(p8)	st1	[dst1]=r0		// force an oops for memcpy call
  33.614 +(p14)	br.ret.sptk.many rp
  33.615 +
  33.616 +/*
  33.617 + * The remaining byte to copy is calculated as:
  33.618 + *
  33.619 + * A =	(faulting_addr - orig_src)	-> len to faulting ld address
  33.620 + *	or 
  33.621 + * 	(faulting_addr - orig_dst)	-> len to faulting st address
  33.622 + * B =	(cur_dst - orig_dst)		-> len copied so far
  33.623 + * C =	A - B				-> len need to be copied
  33.624 + * D =	orig_len - A			-> len need to be zeroed
  33.625 + */
  33.626 +(p6)	sub	A = F, saved_in0
  33.627 +(p7)	sub	A = F, saved_in1
  33.628 +	clrrrb
  33.629 +	;;
  33.630 +	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
  33.631 +	sub	B = dst0, saved_in0	// how many byte copied so far
  33.632 +	;;
  33.633 +	sub	C = A, B
  33.634 +	sub	D = saved_in2, A
  33.635 +	;;
  33.636 +	cmp.gt	p8,p0=C,r0		// more than 1 byte?
  33.637 +	add	memset_arg0=saved_in0, A
  33.638 +(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
  33.639 +(p7)	mov	memset_arg2=D		// copy_from_user need to have kbuf zeroed
  33.640 +	mov	r8=0
  33.641 +	mov	saved_retval = D
  33.642 +	mov	saved_rtlink = b0
  33.643 +
  33.644 +	add	out0=saved_in0, B
  33.645 +	add	out1=saved_in1, B
  33.646 +	mov	out2=C
  33.647 +(p8)	br.call.sptk.few b0=__copy_user	// recursive call
  33.648 +	;;
  33.649 +
  33.650 +	add	saved_retval=saved_retval,r8	// above might return non-zero value
  33.651 +	cmp.gt	p8,p0=memset_arg2,r0	// more than 1 byte?
  33.652 +	mov	out0=memset_arg0	// *s
  33.653 +	mov	out1=r0			// c
  33.654 +	mov	out2=memset_arg2	// n
  33.655 +(p8)	br.call.sptk.few b0=memset
  33.656 +	;;
  33.657 +
  33.658 +	mov	retval=saved_retval
  33.659 +	mov	ar.pfs=saved_pfs_stack
  33.660 +	mov	b0=saved_rtlink
  33.661 +	br.ret.sptk.many rp
  33.662 +
  33.663 +/* end of McKinley specific optimization */
  33.664 +END(__copy_user)
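For readers less fluent in IA-64 assembly, the fault-recovery arithmetic that __copy_user performs above (the A/B/C/D quantities in its exception handler) can be modelled in a few lines of C. This is only an illustrative sketch; all names below are hypothetical and are not part of the patch.

/* Hypothetical C model of the A/B/C/D recovery arithmetic above. */
#include <stddef.h>
#include <stdint.h>

struct copy_fault {
	size_t copied;     /* B: bytes already copied before the fault      */
	size_t retry_len;  /* C: bytes still to copy up to the fault point  */
	size_t zero_len;   /* D: bytes of the kernel buffer left to zero    */
};

static struct copy_fault
recover_copy(uintptr_t fault_addr, uintptr_t orig_src, uintptr_t orig_dst,
             uintptr_t cur_dst, size_t orig_len, int fault_on_load)
{
	struct copy_fault f;
	/* A: distance from the original pointer to the faulting address */
	size_t a = fault_addr - (fault_on_load ? orig_src : orig_dst);

	f.copied    = cur_dst - orig_dst;   /* B = cur_dst - orig_dst */
	f.retry_len = a - f.copied;         /* C = A - B              */
	f.zero_len  = orig_len - a;         /* D = orig_len - A       */
	return f;
}

After computing these, the assembly re-enters __copy_user for the C bytes it can still copy and, in the copy_from_user case, calls memset to zero the remaining D bytes.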
    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/xen/arch/ia64/linux/lib/memset.S	Tue Aug 02 15:59:09 2005 -0800
    34.3 @@ -0,0 +1,362 @@
    34.4 +/* Optimized version of the standard memset() function.
    34.5 +
    34.6 +   Copyright (c) 2002 Hewlett-Packard Co/CERN
    34.7 +	Sverre Jarp <Sverre.Jarp@cern.ch>
    34.8 +
    34.9 +   Return: dest
   34.10 +
   34.11 +   Inputs:
   34.12 +        in0:    dest
   34.13 +        in1:    value
   34.14 +        in2:    count
   34.15 +
   34.16 +   The algorithm is fairly straightforward: set byte by byte until we
    34.17 +   get to a 16B-aligned address, then loop on 128B chunks using an
    34.18 +   early store as prefetching, then loop on 32B chunks, then clear remaining
    34.19 +   words, and finally clear remaining bytes.
   34.20 +   Since a stf.spill f0 can store 16B in one go, we use this instruction
   34.21 +   to get peak speed when value = 0.  */
   34.22 +
   34.23 +#include <asm/asmmacro.h>
   34.24 +#undef ret
   34.25 +
   34.26 +#define dest		in0
   34.27 +#define value		in1
   34.28 +#define	cnt		in2
   34.29 +
   34.30 +#define tmp		r31
   34.31 +#define save_lc		r30
   34.32 +#define ptr0		r29
   34.33 +#define ptr1		r28
   34.34 +#define ptr2		r27
   34.35 +#define ptr3		r26
   34.36 +#define ptr9 		r24
   34.37 +#define	loopcnt		r23
   34.38 +#define linecnt		r22
   34.39 +#define bytecnt		r21
   34.40 +
   34.41 +#define fvalue		f6
   34.42 +
   34.43 +// This routine uses only scratch predicate registers (p6 - p15)
   34.44 +#define p_scr		p6			// default register for same-cycle branches
   34.45 +#define p_nz		p7
   34.46 +#define p_zr		p8
   34.47 +#define p_unalgn	p9
   34.48 +#define p_y		p11
   34.49 +#define p_n		p12
   34.50 +#define p_yy		p13
   34.51 +#define p_nn		p14
   34.52 +
   34.53 +#define MIN1		15
   34.54 +#define MIN1P1HALF	8
   34.55 +#define LINE_SIZE	128
   34.56 +#define LSIZE_SH        7			// shift amount
   34.57 +#define PREF_AHEAD	8
   34.58 +
   34.59 +GLOBAL_ENTRY(memset)
   34.60 +{ .mmi
   34.61 +	.prologue
   34.62 +	alloc	tmp = ar.pfs, 3, 0, 0, 0
   34.63 +	.body
   34.64 +	lfetch.nt1 [dest]			//
   34.65 +	.save   ar.lc, save_lc
   34.66 +	mov.i	save_lc = ar.lc
   34.67 +} { .mmi
   34.68 +	mov	ret0 = dest			// return value
   34.69 +	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
   34.70 +	cmp.eq	p_scr, p0 = cnt, r0
   34.71 +;; }
   34.72 +{ .mmi
   34.73 +	and	ptr2 = -(MIN1+1), dest		// aligned address
   34.74 +	and	tmp = MIN1, dest		// prepare to check for correct alignment
   34.75 +	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
   34.76 +} { .mib
   34.77 +	mov	ptr1 = dest
   34.78 +	mux1	value = value, @brcst		// create 8 identical bytes in word
   34.79 +(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
   34.80 +;; }
   34.81 +{ .mib
   34.82 +	cmp.ne	p_unalgn, p0 = tmp, r0		//
   34.83 +} { .mib
   34.84 +	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
   34.85 +	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
   34.86 +(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
   34.87 +;; }
   34.88 +{ .mmi
   34.89 +(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
   34.90 +(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
   34.91 +(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
   34.92 +;; }
   34.93 +{ .mib
   34.94 +(p_y)	add	cnt = -8, cnt			//
   34.95 +(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
   34.96 +} { .mib
   34.97 +(p_y)	st8	[ptr2] = value,-4		//
   34.98 +(p_n)	add	ptr2 = 4, ptr2			//
   34.99 +;; }
  34.100 +{ .mib
  34.101 +(p_yy)	add	cnt = -4, cnt			//
  34.102 +(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
  34.103 +} { .mib
  34.104 +(p_yy)	st4	[ptr2] = value,-2		//
  34.105 +(p_nn)	add	ptr2 = 2, ptr2			//
  34.106 +;; }
  34.107 +{ .mmi
  34.108 +	mov	tmp = LINE_SIZE+1		// for compare
  34.109 +(p_y)	add	cnt = -2, cnt			//
  34.110 +(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
  34.111 +} { .mmi
  34.112 +	setf.sig fvalue=value			// transfer value to FLP side
  34.113 +(p_y)	st2	[ptr2] = value,-1		//
  34.114 +(p_n)	add	ptr2 = 1, ptr2			//
  34.115 +;; }
  34.116 +
  34.117 +{ .mmi
  34.118 +(p_yy)	st1	[ptr2] = value 			//
  34.119 +  	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
  34.120 +} { .mbb
  34.121 +(p_yy)	add	cnt = -1, cnt			//
  34.122 +(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
  34.123 +;; }
  34.124 +
  34.125 +{ .mib
  34.126 +	nop.m 0
  34.127 +	shr.u	linecnt = cnt, LSIZE_SH
  34.128 +(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
  34.129 +;; }
  34.130 +
  34.131 +	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
  34.132 +{ .mmi
  34.133 +	and	tmp = -(LINE_SIZE), cnt		// compute end of range
  34.134 +	mov	ptr9 = ptr1			// used for prefetching
  34.135 +	and	cnt = (LINE_SIZE-1), cnt	// remainder
  34.136 +} { .mmi
  34.137 +	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
  34.138 +	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
  34.139 +;; }
  34.140 +{ .mmi
  34.141 +(p_scr)	add	loopcnt = -1, linecnt		//
  34.142 +	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
  34.143 +	add	ptr1 = tmp, ptr1		// first address beyond total range
  34.144 +;; }
  34.145 +{ .mmi
  34.146 +	add	tmp = -1, linecnt		// next loop count
  34.147 +	mov.i	ar.lc = loopcnt			//
  34.148 +;; }
  34.149 +.pref_l1a:
  34.150 +{ .mib
  34.151 +	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
  34.152 +	nop.i	0
  34.153 +	br.cloop.dptk.few .pref_l1a
  34.154 +;; }
  34.155 +{ .mmi
  34.156 +	add	ptr0 = 16, ptr2			// Two stores in parallel
  34.157 +	mov.i	ar.lc = tmp			//
  34.158 +;; }
  34.159 +.l1ax:
  34.160 + { .mmi
  34.161 +	stf8 [ptr2] = fvalue, 8
  34.162 +	stf8 [ptr0] = fvalue, 8
  34.163 + ;; }
  34.164 + { .mmi
  34.165 +	stf8 [ptr2] = fvalue, 24
  34.166 +	stf8 [ptr0] = fvalue, 24
  34.167 + ;; }
  34.168 + { .mmi
  34.169 +	stf8 [ptr2] = fvalue, 8
  34.170 +	stf8 [ptr0] = fvalue, 8
  34.171 + ;; }
  34.172 + { .mmi
  34.173 +	stf8 [ptr2] = fvalue, 24
  34.174 +	stf8 [ptr0] = fvalue, 24
  34.175 + ;; }
  34.176 + { .mmi
  34.177 +	stf8 [ptr2] = fvalue, 8
  34.178 +	stf8 [ptr0] = fvalue, 8
  34.179 + ;; }
  34.180 + { .mmi
  34.181 +	stf8 [ptr2] = fvalue, 24
  34.182 +	stf8 [ptr0] = fvalue, 24
  34.183 + ;; }
  34.184 + { .mmi
  34.185 +	stf8 [ptr2] = fvalue, 8
  34.186 +	stf8 [ptr0] = fvalue, 32
  34.187 + 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
  34.188 + ;; }
  34.189 +{ .mmb
  34.190 +	stf8 [ptr2] = fvalue, 24
  34.191 +(p_scr)	stf8 [ptr9] = fvalue, 128
  34.192 +	br.cloop.dptk.few .l1ax
  34.193 +;; }
  34.194 +{ .mbb
  34.195 +	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
  34.196 +(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
  34.197 +	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
  34.198 +;; }
  34.199 +
  34.200 +	TEXT_ALIGN(32)
  34.201 +.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
  34.202 +{ .mmi
  34.203 +	and	tmp = -(LINE_SIZE), cnt		// compute end of range
  34.204 +	mov	ptr9 = ptr1			// used for prefetching
  34.205 +	and	cnt = (LINE_SIZE-1), cnt	// remainder
  34.206 +} { .mmi
  34.207 +	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
  34.208 +	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
  34.209 +;; }
  34.210 +{ .mmi
  34.211 +(p_scr)	add	loopcnt = -1, linecnt
  34.212 +	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
  34.213 +	add	ptr1 = tmp, ptr1		// first address beyond total range
  34.214 +;; }
  34.215 +{ .mmi
  34.216 +	add	tmp = -1, linecnt		// next loop count
  34.217 +	mov.i	ar.lc = loopcnt
  34.218 +;; }
  34.219 +.pref_l1b:
  34.220 +{ .mib
  34.221 +	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
  34.222 +	nop.i   0
  34.223 +	br.cloop.dptk.few .pref_l1b
  34.224 +;; }
  34.225 +{ .mmi
  34.226 +	add	ptr0 = 16, ptr2			// Two stores in parallel
  34.227 +	mov.i	ar.lc = tmp
  34.228 +;; }
  34.229 +.l1bx:
  34.230 + { .mmi
  34.231 +	stf.spill [ptr2] = f0, 32
  34.232 +	stf.spill [ptr0] = f0, 32
  34.233 + ;; }
  34.234 + { .mmi
  34.235 +	stf.spill [ptr2] = f0, 32
  34.236 +	stf.spill [ptr0] = f0, 32
  34.237 + ;; }
  34.238 + { .mmi
  34.239 +	stf.spill [ptr2] = f0, 32
  34.240 +	stf.spill [ptr0] = f0, 64
  34.241 + 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
  34.242 + ;; }
  34.243 +{ .mmb
  34.244 +	stf.spill [ptr2] = f0, 32
  34.245 +(p_scr)	stf.spill [ptr9] = f0, 128
  34.246 +	br.cloop.dptk.few .l1bx
  34.247 +;; }
  34.248 +{ .mib
  34.249 +	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
  34.250 +(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
  34.251 +;; }
  34.252 +
  34.253 +.fraction_of_line:
  34.254 +{ .mib
  34.255 +	add	ptr2 = 16, ptr1
  34.256 +	shr.u	loopcnt = cnt, 5   		// loopcnt = cnt / 32
  34.257 +;; }
  34.258 +{ .mib
  34.259 +	cmp.eq	p_scr, p0 = loopcnt, r0
  34.260 +	add	loopcnt = -1, loopcnt
  34.261 +(p_scr)	br.cond.dpnt.many .store_words
  34.262 +;; }
  34.263 +{ .mib
  34.264 +	and	cnt = 0x1f, cnt			// compute the remaining cnt
  34.265 +	mov.i   ar.lc = loopcnt
  34.266 +;; }
  34.267 +	TEXT_ALIGN(32)
  34.268 +.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
  34.269 +{ .mmb
  34.270 +	stf8	[ptr1] = fvalue, 8
  34.271 +	stf8	[ptr2] = fvalue, 8
  34.272 +;; } { .mmb
  34.273 +	stf8	[ptr1] = fvalue, 24
  34.274 +	stf8	[ptr2] = fvalue, 24
  34.275 +	br.cloop.dptk.many .l2
  34.276 +;; }
  34.277 +.store_words:
  34.278 +{ .mib
  34.279 +	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
  34.280 +(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
  34.281 +;; }
  34.282 +
  34.283 +{ .mmi
  34.284 +	stf8	[ptr1] = fvalue, 8		// store
  34.285 +	cmp.le	p_y, p_n = 16, cnt
  34.286 +	add	cnt = -8, cnt			// subtract
  34.287 +;; }
  34.288 +{ .mmi
  34.289 +(p_y)	stf8	[ptr1] = fvalue, 8		// store
  34.290 +(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
  34.291 +(p_y)	add	cnt = -8, cnt			// subtract
  34.292 +;; }
  34.293 +{ .mmi						// store
  34.294 +(p_yy)	stf8	[ptr1] = fvalue, 8
  34.295 +(p_yy)	add	cnt = -8, cnt			// subtract
  34.296 +;; }
  34.297 +
  34.298 +.move_bytes_from_alignment:
  34.299 +{ .mib
  34.300 +	cmp.eq	p_scr, p0 = cnt, r0
  34.301 +	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
  34.302 +(p_scr)	br.cond.dpnt.few .restore_and_exit
  34.303 +;; }
  34.304 +{ .mib
  34.305 +(p_y)	st4	[ptr1] = value,4
  34.306 +	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
  34.307 +;; }
  34.308 +{ .mib
  34.309 +(p_yy)	st2	[ptr1] = value,2
  34.310 +	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
  34.311 +;; }
  34.312 +
  34.313 +{ .mib
  34.314 +(p_y)	st1	[ptr1] = value
  34.315 +;; }
  34.316 +.restore_and_exit:
  34.317 +{ .mib
  34.318 +	nop.m	0
  34.319 +	mov.i	ar.lc = save_lc
  34.320 +	br.ret.sptk.many rp
  34.321 +;; }
  34.322 +
  34.323 +.move_bytes_unaligned:
  34.324 +{ .mmi
  34.325 +       .pred.rel "mutex",p_y, p_n
  34.326 +       .pred.rel "mutex",p_yy, p_nn
  34.327 +(p_n)	cmp.le  p_yy, p_nn = 4, cnt
  34.328 +(p_y)	cmp.le  p_yy, p_nn = 5, cnt
  34.329 +(p_n)	add	ptr2 = 2, ptr1
  34.330 +} { .mmi
  34.331 +(p_y)	add	ptr2 = 3, ptr1
  34.332 +(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
  34.333 +(p_y)	add	cnt = -1, cnt
  34.334 +;; }
  34.335 +{ .mmi
  34.336 +(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
  34.337 +	add	ptr3 = ptr1, cnt		// prepare last store
  34.338 +	mov.i	ar.lc = save_lc
  34.339 +} { .mmi
  34.340 +(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
   34.341 +(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
  34.342 +(p_yy)	add	cnt = -4, cnt
  34.343 +;; }
  34.344 +{ .mmi
  34.345 +(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
  34.346 +	add	ptr3 = -1, ptr3			// last store
  34.347 +	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
  34.348 +} { .mmi
  34.349 +(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
  34.350 +(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
  34.351 +(p_y)	add	cnt = -4, cnt
  34.352 +;; }
  34.353 +{ .mmi
  34.354 +(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
  34.355 +(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
  34.356 +	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
  34.357 +} { .mmi
  34.358 +(p_yy)	add	cnt = -4, cnt
  34.359 +;; }
  34.360 +{ .mmb
  34.361 +(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
  34.362 +(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
  34.363 +	br.ret.sptk.many rp
  34.364 +}
  34.365 +END(memset)
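The structure described in memset's header comment (byte-align, big chunks, then a tail) is easier to see in C. The sketch below is only a rough model; its names are invented, and it deliberately omits the 128-byte line prefetch loop and the stf.spill fast path used when value == 0.

/* Rough, hypothetical C model of the memset strategy above. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memset_sketch(void *dest, int value, size_t count)
{
	unsigned char *p = dest;
	uint64_t v8 = 0x0101010101010101ULL * (unsigned char)value;

	/* crawl byte by byte up to a 16-byte boundary */
	while (count && ((uintptr_t)p & 15)) {
		*p++ = (unsigned char)value;
		count--;
	}
	/* 32-byte chunks of 8-byte stores (the assembly additionally runs a
	 * 128-byte cache-line store-ahead loop in front of this one) */
	while (count >= 32) {
		memcpy(p, &v8, 8);
		memcpy(p + 8, &v8, 8);
		memcpy(p + 16, &v8, 8);
		memcpy(p + 24, &v8, 8);
		p += 32;
		count -= 32;
	}
	/* remaining words and bytes */
	while (count--)
		*p++ = (unsigned char)value;
	return dest;
}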
    35.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.2 +++ b/xen/arch/ia64/linux/lib/strlen.S	Tue Aug 02 15:59:09 2005 -0800
    35.3 @@ -0,0 +1,192 @@
    35.4 +/*
    35.5 + *
    35.6 + * Optimized version of the standard strlen() function
    35.7 + *
    35.8 + *
    35.9 + * Inputs:
   35.10 + *	in0	address of string
   35.11 + *
   35.12 + * Outputs:
   35.13 + *	ret0	the number of characters in the string (0 if empty string)
   35.14 + *	does not count the \0
   35.15 + *
   35.16 + * Copyright (C) 1999, 2001 Hewlett-Packard Co
   35.17 + *	Stephane Eranian <eranian@hpl.hp.com>
   35.18 + *
   35.19 + * 09/24/99 S.Eranian add speculation recovery code
   35.20 + */
   35.21 +
   35.22 +#include <asm/asmmacro.h>
   35.23 +
   35.24 +//
   35.25 +//
   35.26 +// This is an enhanced version of the basic strlen. it includes a combination
   35.27 +// of compute zero index (czx), parallel comparisons, speculative loads and
   35.28 +// loop unroll using rotating registers.
   35.29 +//
   35.30 +// General Ideas about the algorithm:
    35.31 +//	  The goal is to look at the string in chunks of 8 bytes,
    35.32 +//	  so we need to do a few extra checks at the beginning because the
    35.33 +//	  string may not be 8-byte aligned. In this case we load the 8-byte
   35.34 +//	  quantity which includes the start of the string and mask the unused
   35.35 +//	  bytes with 0xff to avoid confusing czx.
   35.36 +//	  We use speculative loads and software pipelining to hide memory
   35.37 +//	  latency and do read ahead safely. This way we defer any exception.
   35.38 +//
   35.39 +//	  Because we don't want the kernel to be relying on particular
   35.40 +//	  settings of the DCR register, we provide recovery code in case
   35.41 +//	  speculation fails. The recovery code is going to "redo" the work using
   35.42 +//	  only normal loads. If we still get a fault then we generate a
   35.43 +//	  kernel panic. Otherwise we return the strlen as usual.
   35.44 +//
   35.45 +//	  The fact that speculation may fail can be caused, for instance, by
   35.46 +//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
   35.47 +//	  a NaT bit will be set if the translation is not present. The normal
   35.48 +//	  load, on the other hand, will cause the translation to be inserted
   35.49 +//	  if the mapping exists.
   35.50 +//
   35.51 +//	  It should be noted that we execute recovery code only when we need
   35.52 +//	  to use the data that has been speculatively loaded: we don't execute
   35.53 +//	  recovery code on pure read ahead data.
   35.54 +//
   35.55 +// Remarks:
   35.56 +//	- the cmp r0,r0 is used as a fast way to initialize a predicate
   35.57 +//	  register to 1. This is required to make sure that we get the parallel
   35.58 +//	  compare correct.
   35.59 +//
   35.60 +//	- we don't use the epilogue counter to exit the loop but we need to set
   35.61 +//	  it to zero beforehand.
   35.62 +//
   35.63 +//	- after the loop we must test for Nat values because neither the
   35.64 +//	  czx nor cmp instruction raise a NaT consumption fault. We must be
   35.65 +//	  careful not to look too far for a Nat for which we don't care.
   35.66 +//	  For instance we don't need to look at a NaT in val2 if the zero byte
   35.67 +//	  was in val1.
   35.68 +//
   35.69 +//	- Clearly performance tuning is required.
   35.70 +//
   35.71 +//
   35.72 +//
   35.73 +#define saved_pfs	r11
   35.74 +#define	tmp		r10
   35.75 +#define base		r16
   35.76 +#define orig		r17
   35.77 +#define saved_pr	r18
   35.78 +#define src		r19
   35.79 +#define mask		r20
   35.80 +#define val		r21
   35.81 +#define val1		r22
   35.82 +#define val2		r23
   35.83 +
   35.84 +GLOBAL_ENTRY(strlen)
   35.85 +	.prologue
   35.86 +	.save ar.pfs, saved_pfs
   35.87 +	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
   35.88 +
   35.89 +	.rotr v[2], w[2]	// declares our 4 aliases
   35.90 +
   35.91 +	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
    35.92 +	mov orig=in0		// keep track of initial byte address
   35.93 +	dep src=0,in0,0,3	// src=8byte-aligned in0 address
   35.94 +	.save pr, saved_pr
   35.95 +	mov saved_pr=pr		// preserve predicates (rotation)
   35.96 +	;;
   35.97 +
   35.98 +	.body
   35.99 +
  35.100 +	ld8 v[1]=[src],8	// must not speculate: can fail here
  35.101 +	shl tmp=tmp,3		// multiply by 8bits/byte
  35.102 +	mov mask=-1		// our mask
  35.103 +	;;
  35.104 +	ld8.s w[1]=[src],8	// speculatively load next
  35.105 +	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
  35.106 +	sub tmp=64,tmp		// how many bits to shift our mask on the right
  35.107 +	;;
  35.108 +	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
  35.109 +	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
  35.110 +	;;
  35.111 +	add base=-16,src	// keep track of aligned base
  35.112 +	or v[1]=v[1],mask	// now we have a safe initial byte pattern
  35.113 +	;;
  35.114 +1:
  35.115 +	ld8.s v[0]=[src],8	// speculatively load next
  35.116 +	czx1.r val1=v[1]	// search 0 byte from right
  35.117 +	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
  35.118 +	;;
  35.119 +	ld8.s w[0]=[src],8	// speculatively load next to next
  35.120 +	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
   35.121 +	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
  35.122 +(p6)	br.wtop.dptk 1b		// loop until p6 == 0
  35.123 +	;;
  35.124 +	//
   35.125 +	// We must resort to the recovery code iff
  35.126 +	// val1_is_nat || (val1==8 && val2_is_nat)
  35.127 +	//
  35.128 +	// XXX Fixme
  35.129 +	//	- there must be a better way of doing the test
  35.130 +	//
  35.131 +	cmp.eq  p8,p9=8,val1	// p6 = val1 had zero (disambiguate)
  35.132 +	tnat.nz p6,p7=val1	// test NaT on val1
  35.133 +(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
  35.134 +	;;
  35.135 +	//
  35.136 +	// if we come here p7 is true, i.e., initialized for // cmp
  35.137 +	//
  35.138 +	cmp.eq.and  p7,p0=8,val1// val1==8?
  35.139 +	tnat.nz.and p7,p0=val2	// test NaT if val2
  35.140 +(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
  35.141 +	;;
  35.142 +(p8)	mov val1=val2		// the other test got us out of the loop
  35.143 +(p8)	adds src=-16,src	// correct position when 3 ahead
  35.144 +(p9)	adds src=-24,src	// correct position when 4 ahead
  35.145 +	;;
  35.146 +	sub ret0=src,orig	// distance from base
  35.147 +	sub tmp=8,val1		// which byte in word
  35.148 +	mov pr=saved_pr,0xffffffffffff0000
  35.149 +	;;
  35.150 +	sub ret0=ret0,tmp	// adjust
  35.151 +	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  35.152 +	br.ret.sptk.many rp	// end of normal execution
  35.153 +
  35.154 +	//
  35.155 +	// Outlined recovery code when speculation failed
  35.156 +	//
  35.157 +	// This time we don't use speculation and rely on the normal exception
  35.158 +	// mechanism. that's why the loop is not as good as the previous one
  35.159 +	// because read ahead is not possible
  35.160 +	//
  35.161 +	// IMPORTANT:
  35.162 +	// Please note that in the case of strlen() as opposed to strlen_user()
  35.163 +	// we don't use the exception mechanism, as this function is not
  35.164 +	// supposed to fail. If that happens it means we have a bug and the
   35.165 +	// code will cause a kernel fault.
  35.166 +	//
  35.167 +	// XXX Fixme
  35.168 +	//	- today we restart from the beginning of the string instead
  35.169 +	//	  of trying to continue where we left off.
  35.170 +	//
  35.171 +.recover:
  35.172 +	ld8 val=[base],8	// will fail if unrecoverable fault
  35.173 +	;;
  35.174 +	or val=val,mask		// remask first bytes
  35.175 +	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
  35.176 +	;;
  35.177 +	//
  35.178 +	// ar.ec is still zero here
  35.179 +	//
  35.180 +2:
  35.181 +(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
  35.182 +	;;
  35.183 +	czx1.r val1=val		// search 0 byte from right
  35.184 +	;;
  35.185 +	cmp.eq p6,p0=8,val1	// val1==8 ?
  35.186 +(p6)	br.wtop.dptk 2b		// loop until p6 == 0
  35.187 +	;;			// (avoid WAW on p63)
  35.188 +	sub ret0=base,orig	// distance from base
  35.189 +	sub tmp=8,val1
  35.190 +	mov pr=saved_pr,0xffffffffffff0000
  35.191 +	;;
  35.192 +	sub ret0=ret0,tmp	// length=now - back -1
  35.193 +	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  35.194 +	br.ret.sptk.many rp	// end of successful recovery code
  35.195 +END(strlen)
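The core idea of the routine above (scan the string eight bytes at a time and use czx1.r to find the first zero byte in a word) can be modelled in portable C, ignoring the speculation and NaT-recovery machinery that has no C counterpart. The sketch below is illustrative only; the helper names are invented, and like the assembly it deliberately over-reads within the aligned 8-byte word that contains the terminator.

/* Hypothetical C model of the czx-based strlen above (little-endian). */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int first_zero_byte(uint64_t w)	/* czx1.r analogue: returns 8 if none */
{
	for (int i = 0; i < 8; i++)
		if (((w >> (8 * i)) & 0xff) == 0)
			return i;
	return 8;
}

static size_t strlen_sketch(const char *s)
{
	const char *p = (const char *)((uintptr_t)s & ~(uintptr_t)7);
	unsigned lead = (unsigned)((uintptr_t)s & 7);
	uint64_t w;
	int z;

	memcpy(&w, p, 8);
	if (lead)				/* mask the bytes before the string with 0xff, */
		w |= ~0ULL >> (64 - 8 * lead);	/* mirroring the shr.u/or trick on `mask` */
	z = first_zero_byte(w);
	while (z == 8) {
		p += 8;
		memcpy(&w, p, 8);
		z = first_zero_byte(w);
	}
	return (size_t)(p + z - s);
}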
    36.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    36.2 +++ b/xen/arch/ia64/linux/lib/strlen_user.S	Tue Aug 02 15:59:09 2005 -0800
    36.3 @@ -0,0 +1,198 @@
    36.4 +/*
    36.5 + * Optimized version of the strlen_user() function
    36.6 + *
    36.7 + * Inputs:
    36.8 + *	in0	address of buffer
    36.9 + *
   36.10 + * Outputs:
   36.11 + *	ret0	0 in case of fault, strlen(buffer)+1 otherwise
   36.12 + *
   36.13 + * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
   36.14 + *	David Mosberger-Tang <davidm@hpl.hp.com>
   36.15 + *	Stephane Eranian <eranian@hpl.hp.com>
   36.16 + *
   36.17 + * 01/19/99 S.Eranian heavily enhanced version (see details below)
   36.18 + * 09/24/99 S.Eranian added speculation recovery code
   36.19 + */
   36.20 +
   36.21 +#include <asm/asmmacro.h>
   36.22 +
   36.23 +//
   36.24 +// int strlen_user(char *)
   36.25 +// ------------------------
   36.26 +// Returns:
   36.27 +//	- length of string + 1
   36.28 +//	- 0 in case an exception is raised
   36.29 +//
   36.30 +// This is an enhanced version of the basic strlen_user. it includes a
   36.31 +// combination of compute zero index (czx), parallel comparisons, speculative
   36.32 +// loads and loop unroll using rotating registers.
   36.33 +//
   36.34 +// General Ideas about the algorithm:
    36.35 +//	  The goal is to look at the string in chunks of 8 bytes,
    36.36 +//	  so we need to do a few extra checks at the beginning because the
    36.37 +//	  string may not be 8-byte aligned. In this case we load the 8-byte
   36.38 +//	  quantity which includes the start of the string and mask the unused
   36.39 +//	  bytes with 0xff to avoid confusing czx.
   36.40 +//	  We use speculative loads and software pipelining to hide memory
   36.41 +//	  latency and do read ahead safely. This way we defer any exception.
   36.42 +//
   36.43 +//	  Because we don't want the kernel to be relying on particular
   36.44 +//	  settings of the DCR register, we provide recovery code in case
   36.45 +//	  speculation fails. The recovery code is going to "redo" the work using
   36.46 +//	  only normal loads. If we still get a fault then we return an
   36.47 +//	  error (ret0=0). Otherwise we return the strlen+1 as usual.
   36.48 +//	  The fact that speculation may fail can be caused, for instance, by
   36.49 +//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
   36.50 +//	  a NaT bit will be set if the translation is not present. The normal
   36.51 +//	  load, on the other hand, will cause the translation to be inserted
   36.52 +//	  if the mapping exists.
   36.53 +//
   36.54 +//	  It should be noted that we execute recovery code only when we need
   36.55 +//	  to use the data that has been speculatively loaded: we don't execute
   36.56 +//	  recovery code on pure read ahead data.
   36.57 +//
   36.58 +// Remarks:
   36.59 +//	- the cmp r0,r0 is used as a fast way to initialize a predicate
   36.60 +//	  register to 1. This is required to make sure that we get the parallel
   36.61 +//	  compare correct.
   36.62 +//
   36.63 +//	- we don't use the epilogue counter to exit the loop but we need to set
   36.64 +//	  it to zero beforehand.
   36.65 +//
   36.66 +//	- after the loop we must test for Nat values because neither the
   36.67 +//	  czx nor cmp instruction raise a NaT consumption fault. We must be
   36.68 +//	  careful not to look too far for a Nat for which we don't care.
   36.69 +//	  For instance we don't need to look at a NaT in val2 if the zero byte
   36.70 +//	  was in val1.
   36.71 +//
   36.72 +//	- Clearly performance tuning is required.
   36.73 +//
   36.74 +
   36.75 +#define saved_pfs	r11
   36.76 +#define	tmp		r10
   36.77 +#define base		r16
   36.78 +#define orig		r17
   36.79 +#define saved_pr	r18
   36.80 +#define src		r19
   36.81 +#define mask		r20
   36.82 +#define val		r21
   36.83 +#define val1		r22
   36.84 +#define val2		r23
   36.85 +
   36.86 +GLOBAL_ENTRY(__strlen_user)
   36.87 +	.prologue
   36.88 +	.save ar.pfs, saved_pfs
   36.89 +	alloc saved_pfs=ar.pfs,11,0,0,8
   36.90 +
   36.91 +	.rotr v[2], w[2]	// declares our 4 aliases
   36.92 +
   36.93 +	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
    36.94 +	mov orig=in0		// keep track of initial byte address
   36.95 +	dep src=0,in0,0,3	// src=8byte-aligned in0 address
   36.96 +	.save pr, saved_pr
   36.97 +	mov saved_pr=pr		// preserve predicates (rotation)
   36.98 +	;;
   36.99 +
  36.100 +	.body
  36.101 +
  36.102 +	ld8.s v[1]=[src],8	// load the initial 8bytes (must speculate)
  36.103 +	shl tmp=tmp,3		// multiply by 8bits/byte
  36.104 +	mov mask=-1		// our mask
  36.105 +	;;
  36.106 +	ld8.s w[1]=[src],8	// load next 8 bytes in 2nd pipeline
  36.107 +	cmp.eq p6,p0=r0,r0	// sets p6 (required because of // cmp.and)
  36.108 +	sub tmp=64,tmp		// how many bits to shift our mask on the right
  36.109 +	;;
  36.110 +	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
  36.111 +	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
  36.112 +	;;
  36.113 +	add base=-16,src	// keep track of aligned base
  36.114 +	chk.s v[1], .recover	// if already NaT, then directly skip to recover
  36.115 +	or v[1]=v[1],mask	// now we have a safe initial byte pattern
  36.116 +	;;
  36.117 +1:
  36.118 +	ld8.s v[0]=[src],8	// speculatively load next
  36.119 +	czx1.r val1=v[1]	// search 0 byte from right
  36.120 +	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
  36.121 +	;;
  36.122 +	ld8.s w[0]=[src],8	// speculatively load next to next
  36.123 +	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
   36.124 +	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
  36.125 +(p6)	br.wtop.dptk.few 1b	// loop until p6 == 0
  36.126 +	;;
  36.127 +	//
   36.128 +	// We must resort to the recovery code iff
  36.129 +	// val1_is_nat || (val1==8 && val2_is_nat)
  36.130 +	//
  36.131 +	// XXX Fixme
  36.132 +	//	- there must be a better way of doing the test
  36.133 +	//
  36.134 +	cmp.eq  p8,p9=8,val1	// p6 = val1 had zero (disambiguate)
  36.135 +	tnat.nz p6,p7=val1	// test NaT on val1
  36.136 +(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
  36.137 +	;;
  36.138 +	//
  36.139 +	// if we come here p7 is true, i.e., initialized for // cmp
  36.140 +	//
  36.141 +	cmp.eq.and  p7,p0=8,val1// val1==8?
  36.142 +	tnat.nz.and p7,p0=val2	// test NaT if val2
  36.143 +(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
  36.144 +	;;
  36.145 +(p8)	mov val1=val2		// val2 contains the value
  36.146 +(p8)	adds src=-16,src	// correct position when 3 ahead
  36.147 +(p9)	adds src=-24,src	// correct position when 4 ahead
  36.148 +	;;
  36.149 +	sub ret0=src,orig	// distance from origin
  36.150 +	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
  36.151 +	mov pr=saved_pr,0xffffffffffff0000
  36.152 +	;;
  36.153 +	sub ret0=ret0,tmp	// length=now - back -1
  36.154 +	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  36.155 +	br.ret.sptk.many rp	// end of normal execution
  36.156 +
  36.157 +	//
  36.158 +	// Outlined recovery code when speculation failed
  36.159 +	//
  36.160 +	// This time we don't use speculation and rely on the normal exception
  36.161 +	// mechanism. that's why the loop is not as good as the previous one
  36.162 +	// because read ahead is not possible
  36.163 +	//
  36.164 +	// XXX Fixme
  36.165 +	//	- today we restart from the beginning of the string instead
  36.166 +	//	  of trying to continue where we left off.
  36.167 +	//
  36.168 +.recover:
  36.169 +	EX(.Lexit1, ld8 val=[base],8)	// load the initial bytes
  36.170 +	;;
  36.171 +	or val=val,mask			// remask first bytes
  36.172 +	cmp.eq p0,p6=r0,r0		// nullify first ld8 in loop
  36.173 +	;;
  36.174 +	//
  36.175 +	// ar.ec is still zero here
  36.176 +	//
  36.177 +2:
  36.178 +	EX(.Lexit1, (p6) ld8 val=[base],8)
  36.179 +	;;
  36.180 +	czx1.r val1=val		// search 0 byte from right
  36.181 +	;;
  36.182 +	cmp.eq p6,p0=8,val1	// val1==8 ?
  36.183 +(p6)	br.wtop.dptk.few 2b	// loop until p6 == 0
  36.184 +	;;
  36.185 +	sub ret0=base,orig	// distance from base
  36.186 +	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
  36.187 +	mov pr=saved_pr,0xffffffffffff0000
  36.188 +	;;
  36.189 +	sub ret0=ret0,tmp	// length=now - back -1
  36.190 +	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  36.191 +	br.ret.sptk.many rp	// end of successful recovery code
  36.192 +
  36.193 +	//
  36.194 +	// We failed even on the normal load (called from exception handler)
  36.195 +	//
  36.196 +.Lexit1:
  36.197 +	mov ret0=0
  36.198 +	mov pr=saved_pr,0xffffffffffff0000
  36.199 +	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
  36.200 +	br.ret.sptk.many rp
  36.201 +END(__strlen_user)
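__strlen_user follows the same czx-based scan as strlen above; what differs is the return convention and the fact that every access may fault. Faults cannot be expressed in plain C, so the sketch below (with an invented name) only captures the non-faulting contract.

/* Hypothetical C model of the __strlen_user result, faults aside. */
static unsigned long strlen_user_sketch(const char *buf)
{
	unsigned long n = 0;

	while (buf[n] != '\0')
		n++;
	return n + 1;	/* length + 1, counting the NUL; 0 would signal a fault */
}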
    37.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    37.2 +++ b/xen/arch/ia64/linux/lib/strncpy_from_user.S	Tue Aug 02 15:59:09 2005 -0800
    37.3 @@ -0,0 +1,44 @@
    37.4 +/*
    37.5 + * Just like strncpy() except that if a fault occurs during copying,
    37.6 + * -EFAULT is returned.
    37.7 + *
    37.8 + * Inputs:
    37.9 + *	in0:	address of destination buffer
   37.10 + *	in1:	address of string to be copied
   37.11 + *	in2:	length of buffer in bytes
   37.12 + * Outputs:
   37.13 + *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
   37.14 + *
   37.15 + * Copyright (C) 1998-2001 Hewlett-Packard Co
   37.16 + * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
   37.17 + *
   37.18 + * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
    37.19 + *			 Andreas Schwab <schwab@suse.de>).
   37.20 + */
   37.21 +
   37.22 +#include <asm/asmmacro.h>
   37.23 +
   37.24 +GLOBAL_ENTRY(__strncpy_from_user)
   37.25 +	alloc r2=ar.pfs,3,0,0,0
   37.26 +	mov r8=0
   37.27 +	mov r9=in1
   37.28 +	;;
   37.29 +	add r10=in1,in2
   37.30 +	cmp.eq p6,p0=r0,in2
   37.31 +(p6)	br.ret.spnt.many rp
   37.32 +
   37.33 +	// XXX braindead copy loop---this needs to be optimized
   37.34 +.Loop1:
   37.35 +	EX(.Lexit, ld1 r8=[in1],1)
   37.36 +	;;
   37.37 +	EX(.Lexit, st1 [in0]=r8,1)
   37.38 +	cmp.ne p6,p7=r8,r0
   37.39 +	;;
   37.40 +(p6)	cmp.ne.unc p8,p0=in1,r10
   37.41 +(p8)	br.cond.dpnt.few .Loop1
   37.42 +	;;
   37.43 +(p6)	mov r8=in2		// buffer filled up---return buffer length
   37.44 +(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
   37.45 +[.Lexit:]
   37.46 +	br.ret.sptk.many rp
   37.47 +END(__strncpy_from_user)
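Setting the fault handling aside, the copy loop above implements a simple contract: copy at most `count` bytes, stop after copying the NUL, and return either the string length (excluding the NUL) or `count` when the buffer fills up first. A plain-C model of that contract, with invented names, looks like this; on a fault the real routine returns -EFAULT instead.

/* Hypothetical C model of the __strncpy_from_user return convention. */
#include <stddef.h>

static long strncpy_from_user_sketch(char *dst, const char *src, size_t count)
{
	size_t i;

	for (i = 0; i < count; i++) {
		dst[i] = src[i];
		if (src[i] == '\0')
			return (long)i;	/* string length, excluding the NUL */
	}
	return (long)count;		/* buffer filled up before a NUL */
}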
    38.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    38.2 +++ b/xen/arch/ia64/linux/lib/strnlen_user.S	Tue Aug 02 15:59:09 2005 -0800
    38.3 @@ -0,0 +1,45 @@
    38.4 +/*
    38.5 + * Returns 0 if exception before NUL or reaching the supplied limit (N),
    38.6 + * a value greater than N if the string is longer than the limit, else
     38.7 + * strlen(buffer) + 1 (the terminating NUL is counted).
    38.8 + *
    38.9 + * Inputs:
   38.10 + *	in0:	address of buffer
   38.11 + *	in1:	string length limit N
   38.12 + * Outputs:
   38.13 + *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
   38.14 + *
   38.15 + * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
   38.16 + */
   38.17 +
   38.18 +#include <asm/asmmacro.h>
   38.19 +
   38.20 +GLOBAL_ENTRY(__strnlen_user)
   38.21 +	.prologue
   38.22 +	alloc r2=ar.pfs,2,0,0,0
   38.23 +	.save ar.lc, r16
   38.24 +	mov r16=ar.lc			// preserve ar.lc
   38.25 +
   38.26 +	.body
   38.27 +
   38.28 +	add r3=-1,in1
   38.29 +	;;
   38.30 +	mov ar.lc=r3
   38.31 +	mov r9=0
   38.32 +	;;
   38.33 +	// XXX braindead strlen loop---this needs to be optimized
   38.34 +.Loop1:
   38.35 +	EXCLR(.Lexit, ld1 r8=[in0],1)
   38.36 +	add r9=1,r9
   38.37 +	;;
   38.38 +	cmp.eq p6,p0=r8,r0
   38.39 +(p6)	br.cond.dpnt .Lexit
   38.40 +	br.cloop.dptk.few .Loop1
   38.41 +
   38.42 +	add r9=1,in1			// NUL not found---return N+1
   38.43 +	;;
   38.44 +.Lexit:
   38.45 +	mov r8=r9
   38.46 +	mov ar.lc=r16			// restore ar.lc
   38.47 +	br.ret.sptk.many rp
   38.48 +END(__strnlen_user)
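The loop above counts bytes up to and including the NUL, capped by the supplied limit. Ignoring faults (which make the routine return 0), its result can be modelled in C as follows; the name is invented for illustration.

/* Hypothetical C model of the __strnlen_user result, faults aside. */
static unsigned long strnlen_user_sketch(const char *buf, unsigned long limit)
{
	unsigned long n;

	for (n = 1; n <= limit; n++)
		if (buf[n - 1] == '\0')
			return n;	/* strlen(buf) + 1, counting the NUL */
	return limit + 1;		/* no NUL within the limit */
}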
    39.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    39.2 +++ b/xen/arch/ia64/linux/lib/xor.S	Tue Aug 02 15:59:09 2005 -0800
    39.3 @@ -0,0 +1,184 @@
    39.4 +/*
    39.5 + * arch/ia64/lib/xor.S
    39.6 + *
    39.7 + * Optimized RAID-5 checksumming functions for IA-64.
    39.8 + *
    39.9 + * This program is free software; you can redistribute it and/or modify
   39.10 + * it under the terms of the GNU General Public License as published by
   39.11 + * the Free Software Foundation; either version 2, or (at your option)
   39.12 + * any later version.
   39.13 + *
   39.14 + * You should have received a copy of the GNU General Public License
   39.15 + * (for example /usr/src/linux/COPYING); if not, write to the Free
   39.16 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   39.17 + */
   39.18 +
   39.19 +#include <asm/asmmacro.h>
   39.20 +
   39.21 +GLOBAL_ENTRY(xor_ia64_2)
   39.22 +	.prologue
   39.23 +	.fframe 0
   39.24 +	.save ar.pfs, r31
   39.25 +	alloc r31 = ar.pfs, 3, 0, 13, 16
   39.26 +	.save ar.lc, r30
   39.27 +	mov r30 = ar.lc
   39.28 +	.save pr, r29
   39.29 +	mov r29 = pr
   39.30 +	;;
   39.31 +	.body
   39.32 +	mov r8 = in1
   39.33 +	mov ar.ec = 6 + 2
   39.34 +	shr in0 = in0, 3
   39.35 +	;;
   39.36 +	adds in0 = -1, in0
   39.37 +	mov r16 = in1
   39.38 +	mov r17 = in2
   39.39 +	;;
   39.40 +	mov ar.lc = in0
   39.41 +	mov pr.rot = 1 << 16
   39.42 +	;;
   39.43 +	.rotr s1[6+1], s2[6+1], d[2]
   39.44 +	.rotp p[6+2]
   39.45 +0:
   39.46 +(p[0])	ld8.nta s1[0] = [r16], 8
   39.47 +(p[0])	ld8.nta s2[0] = [r17], 8
   39.48 +(p[6])	xor d[0] = s1[6], s2[6]
   39.49 +(p[6+1])st8.nta [r8] = d[1], 8
   39.50 +	nop.f 0
   39.51 +	br.ctop.dptk.few 0b
   39.52 +	;;
   39.53 +	mov ar.lc = r30
   39.54 +	mov pr = r29, -1
   39.55 +	br.ret.sptk.few rp
   39.56 +END(xor_ia64_2)
   39.57 +
   39.58 +GLOBAL_ENTRY(xor_ia64_3)
   39.59 +	.prologue
   39.60 +	.fframe 0
   39.61 +	.save ar.pfs, r31
   39.62 +	alloc r31 = ar.pfs, 4, 0, 20, 24
   39.63 +	.save ar.lc, r30
   39.64 +	mov r30 = ar.lc
   39.65 +	.save pr, r29
   39.66 +	mov r29 = pr
   39.67 +	;;
   39.68 +	.body
   39.69 +	mov r8 = in1
   39.70 +	mov ar.ec = 6 + 2
   39.71 +	shr in0 = in0, 3
   39.72 +	;;
   39.73 +	adds in0 = -1, in0
   39.74 +	mov r16 = in1
   39.75 +	mov r17 = in2
   39.76 +	;;
   39.77 +	mov r18 = in3
   39.78 +	mov ar.lc = in0
   39.79 +	mov pr.rot = 1 << 16
   39.80 +	;;
   39.81 +	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
   39.82 +	.rotp p[6+2]
   39.83 +0:
   39.84 +(p[0])	ld8.nta s1[0] = [r16], 8
   39.85 +(p[0])	ld8.nta s2[0] = [r17], 8
   39.86 +(p[6])	xor d[0] = s1[6], s2[6]
   39.87 +	;;
   39.88 +(p[0])	ld8.nta s3[0] = [r18], 8
   39.89 +(p[6+1])st8.nta [r8] = d[1], 8
   39.90 +(p[6])	xor d[0] = d[0], s3[6]
   39.91 +	br.ctop.dptk.few 0b
   39.92 +	;;
   39.93 +	mov ar.lc = r30
   39.94 +	mov pr = r29, -1
   39.95 +	br.ret.sptk.few rp
   39.96 +END(xor_ia64_3)
   39.97 +
   39.98 +GLOBAL_ENTRY(xor_ia64_4)
   39.99 +	.prologue
  39.100 +	.fframe 0
  39.101 +	.save ar.pfs, r31
  39.102 +	alloc r31 = ar.pfs, 5, 0, 27, 32
  39.103 +	.save ar.lc, r30
  39.104 +	mov r30 = ar.lc
  39.105 +	.save pr, r29
  39.106 +	mov r29 = pr
  39.107 +	;;
  39.108 +	.body
  39.109 +	mov r8 = in1
  39.110 +	mov ar.ec = 6 + 2
  39.111 +	shr in0 = in0, 3
  39.112 +	;;
  39.113 +	adds in0 = -1, in0
  39.114 +	mov r16 = in1
  39.115 +	mov r17 = in2
  39.116 +	;;
  39.117 +	mov r18 = in3
  39.118 +	mov ar.lc = in0
  39.119 +	mov pr.rot = 1 << 16
  39.120 +	mov r19 = in4
  39.121 +	;;
  39.122 +	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
  39.123 +	.rotp p[6+2]
  39.124 +0:
  39.125 +(p[0])	ld8.nta s1[0] = [r16], 8
  39.126 +(p[0])	ld8.nta s2[0] = [r17], 8
  39.127 +(p[6])	xor d[0] = s1[6], s2[6]
  39.128 +(p[0])	ld8.nta s3[0] = [r18], 8
  39.129 +(p[0])	ld8.nta s4[0] = [r19], 8
  39.130 +(p[6])	xor r20 = s3[6], s4[6]
  39.131 +	;;
  39.132 +(p[6+1])st8.nta [r8] = d[1], 8
  39.133 +(p[6])	xor d[0] = d[0], r20
  39.134 +	br.ctop.dptk.few 0b
  39.135 +	;;
  39.136 +	mov ar.lc = r30
  39.137 +	mov pr = r29, -1
  39.138 +	br.ret.sptk.few rp
  39.139 +END(xor_ia64_4)
  39.140 +
  39.141 +GLOBAL_ENTRY(xor_ia64_5)
  39.142 +	.prologue
  39.143 +	.fframe 0
  39.144 +	.save ar.pfs, r31
  39.145 +	alloc r31 = ar.pfs, 6, 0, 34, 40
  39.146 +	.save ar.lc, r30
  39.147 +	mov r30 = ar.lc
  39.148 +	.save pr, r29
  39.149 +	mov r29 = pr
  39.150 +	;;
  39.151 +	.body
  39.152 +	mov r8 = in1
  39.153 +	mov ar.ec = 6 + 2
  39.154 +	shr in0 = in0, 3
  39.155 +	;;
  39.156 +	adds in0 = -1, in0
  39.157 +	mov r16 = in1
  39.158 +	mov r17 = in2
  39.159 +	;;
  39.160 +	mov r18 = in3
  39.161 +	mov ar.lc = in0
  39.162 +	mov pr.rot = 1 << 16
  39.163 +	mov r19 = in4
  39.164 +	mov r20 = in5
  39.165 +	;;
  39.166 +	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
  39.167 +	.rotp p[6+2]
  39.168 +0:
  39.169 +(p[0])	ld8.nta s1[0] = [r16], 8
  39.170 +(p[0])	ld8.nta s2[0] = [r17], 8
  39.171 +(p[6])	xor d[0] = s1[6], s2[6]
  39.172 +(p[0])	ld8.nta s3[0] = [r18], 8
  39.173 +(p[0])	ld8.nta s4[0] = [r19], 8
  39.174 +(p[6])	xor r21 = s3[6], s4[6]
  39.175 +	;;
  39.176 +(p[0])	ld8.nta s5[0] = [r20], 8
  39.177 +(p[6+1])st8.nta [r8] = d[1], 8
  39.178 +(p[6])	xor d[0] = d[0], r21
  39.179 +	;;
  39.180 +(p[6])	  xor d[0] = d[0], s5[6]
  39.181 +	nop.f 0
  39.182 +	br.ctop.dptk.few 0b
  39.183 +	;;
  39.184 +	mov ar.lc = r30
  39.185 +	mov pr = r29, -1
  39.186 +	br.ret.sptk.few rp
  39.187 +END(xor_ia64_5)
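
The four routines above are the software-pipelined XOR primitives used by the RAID xor code: in0 carries the byte count, in1 is both the destination and the first source, and each additional source buffer adds another rotating load stream. A plain C sketch of what xor_ia64_2 computes (not part of this changeset; the prototype is inferred from the register usage above):

	/* Sequential C equivalent of xor_ia64_2: the assembly overlaps the
	 * loads, the xor and the store with rotating registers, but the
	 * data movement is simply p1[i] ^= p2[i] over 8-byte words. */
	void xor_ia64_2_c(unsigned long bytes, unsigned long *p1, unsigned long *p2)
	{
		unsigned long i, words = bytes >> 3;	/* shr in0 = in0, 3 */

		for (i = 0; i < words; i++)
			p1[i] ^= p2[i];		/* xor d[0] = s1[6], s2[6]; st8 [r8] */
	}
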
    40.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    40.2 +++ b/xen/arch/ia64/linux/linuxextable.c	Tue Aug 02 15:59:09 2005 -0800
    40.3 @@ -0,0 +1,67 @@
    40.4 +/* Rewritten by Rusty Russell, on the backs of many others...
    40.5 +   Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
    40.6 +
    40.7 +    This program is free software; you can redistribute it and/or modify
    40.8 +    it under the terms of the GNU General Public License as published by
    40.9 +    the Free Software Foundation; either version 2 of the License, or
   40.10 +    (at your option) any later version.
   40.11 +
   40.12 +    This program is distributed in the hope that it will be useful,
   40.13 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   40.14 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   40.15 +    GNU General Public License for more details.
   40.16 +
   40.17 +    You should have received a copy of the GNU General Public License
   40.18 +    along with this program; if not, write to the Free Software
   40.19 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   40.20 +*/
   40.21 +#include <linux/module.h>
   40.22 +#include <linux/init.h>
   40.23 +#include <asm/uaccess.h>
   40.24 +#include <asm/sections.h>
   40.25 +
   40.26 +extern struct exception_table_entry __start___ex_table[];
   40.27 +extern struct exception_table_entry __stop___ex_table[];
   40.28 +
   40.29 +/* Sort the kernel's built-in exception table */
   40.30 +void __init sort_main_extable(void)
   40.31 +{
   40.32 +	sort_extable(__start___ex_table, __stop___ex_table);
   40.33 +}
   40.34 +
   40.35 +/* Given an address, look for it in the exception tables. */
   40.36 +const struct exception_table_entry *search_exception_tables(unsigned long addr)
   40.37 +{
   40.38 +	const struct exception_table_entry *e;
   40.39 +
   40.40 +	e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
   40.41 +	if (!e)
   40.42 +		e = search_module_extables(addr);
   40.43 +	return e;
   40.44 +}
   40.45 +
   40.46 +static int core_kernel_text(unsigned long addr)
   40.47 +{
   40.48 +	if (addr >= (unsigned long)_stext &&
   40.49 +	    addr <= (unsigned long)_etext)
   40.50 +		return 1;
   40.51 +
   40.52 +	if (addr >= (unsigned long)_sinittext &&
   40.53 +	    addr <= (unsigned long)_einittext)
   40.54 +		return 1;
   40.55 +	return 0;
   40.56 +}
   40.57 +
   40.58 +int __kernel_text_address(unsigned long addr)
   40.59 +{
   40.60 +	if (core_kernel_text(addr))
   40.61 +		return 1;
   40.62 +	return __module_text_address(addr) != NULL;
   40.63 +}
   40.64 +
   40.65 +int kernel_text_address(unsigned long addr)
   40.66 +{
   40.67 +	if (core_kernel_text(addr))
   40.68 +		return 1;
   40.69 +	return module_text_address(addr) != NULL;
   40.70 +}
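
The exported helpers above are consumed mostly by fault and unwind paths: search_exception_tables() resolves a faulting instruction address to its fixup entry, and kernel_text_address() lets a backtracer decide whether a saved address may be printed as kernel code. A hypothetical, simplified consumer of the latter (not part of this changeset):

	/* Dump only the stack words that look like kernel return addresses. */
	static void dump_return_addresses(const unsigned long *stack, int n)
	{
		int i;

		for (i = 0; i < n; i++)
			if (kernel_text_address(stack[i]))
				printk(" [<%016lx>]\n", stack[i]);
	}
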
    41.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    41.2 +++ b/xen/arch/ia64/linux/machvec.c	Tue Aug 02 15:59:09 2005 -0800
    41.3 @@ -0,0 +1,70 @@
    41.4 +#include <linux/config.h>
    41.5 +#include <linux/module.h>
    41.6 +
    41.7 +#include <asm/machvec.h>
    41.8 +#include <asm/system.h>
    41.9 +
   41.10 +#ifdef CONFIG_IA64_GENERIC
   41.11 +
   41.12 +#include <linux/kernel.h>
   41.13 +#include <linux/string.h>
   41.14 +
   41.15 +#include <asm/page.h>
   41.16 +
   41.17 +struct ia64_machine_vector ia64_mv;
   41.18 +EXPORT_SYMBOL(ia64_mv);
   41.19 +
   41.20 +static struct ia64_machine_vector *
   41.21 +lookup_machvec (const char *name)
   41.22 +{
   41.23 +	extern struct ia64_machine_vector machvec_start[];
   41.24 +	extern struct ia64_machine_vector machvec_end[];
   41.25 +	struct ia64_machine_vector *mv;
   41.26 +
   41.27 +	for (mv = machvec_start; mv < machvec_end; ++mv)
   41.28 +		if (strcmp (mv->name, name) == 0)
   41.29 +			return mv;
   41.30 +
   41.31 +	return 0;
   41.32 +}
   41.33 +
   41.34 +void
   41.35 +machvec_init (const char *name)
   41.36 +{
   41.37 +	struct ia64_machine_vector *mv;
   41.38 +
   41.39 +	mv = lookup_machvec(name);
   41.40 +	if (!mv) {
   41.41 +		panic("generic kernel failed to find machine vector for platform %s!", name);
   41.42 +	}
   41.43 +	ia64_mv = *mv;
   41.44 +	printk(KERN_INFO "booting generic kernel on platform %s\n", name);
   41.45 +}
   41.46 +
   41.47 +#endif /* CONFIG_IA64_GENERIC */
   41.48 +
   41.49 +void
   41.50 +machvec_setup (char **arg)
   41.51 +{
   41.52 +}
   41.53 +EXPORT_SYMBOL(machvec_setup);
   41.54 +
   41.55 +void
   41.56 +machvec_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
   41.57 +{
   41.58 +}
   41.59 +EXPORT_SYMBOL(machvec_timer_interrupt);
   41.60 +
   41.61 +void
   41.62 +machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir)
   41.63 +{
   41.64 +	mb();
   41.65 +}
   41.66 +EXPORT_SYMBOL(machvec_dma_sync_single);
   41.67 +
   41.68 +void
   41.69 +machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir)
   41.70 +{
   41.71 +	mb();
   41.72 +}
   41.73 +EXPORT_SYMBOL(machvec_dma_sync_sg);
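
lookup_machvec() above scans an array that the linker collects between machvec_start[] and machvec_end[]; platform code lands there by placing its vector in the .machvec section. The general shape of such a registration, as a hypothetical sketch (the real macros live in asm/machvec.h):

	/* Hypothetical platform vector placed in the linker-collected section. */
	static struct ia64_machine_vector machvec_example
		__attribute__((used, section(".machvec"))) = {
		.name = "example",
		/* ... per-platform function pointers ... */
	};
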
    42.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    42.2 +++ b/xen/arch/ia64/linux/minstate.h	Tue Aug 02 15:59:09 2005 -0800
    42.3 @@ -0,0 +1,251 @@
    42.4 +#include <linux/config.h>
    42.5 +
    42.6 +#include <asm/cache.h>
    42.7 +
    42.8 +#include "entry.h"
    42.9 +
   42.10 +/*
   42.11 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
   42.12 + * on interrupts.
   42.13 + *
   42.14 + *  On entry:
   42.15 + *	r1:	pointer to current task (ar.k6)
   42.16 + */
   42.17 +#define MINSTATE_START_SAVE_MIN_VIRT								\
   42.18 +(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
   42.19 +	;;											\
   42.20 +(pUStk)	mov.m r24=ar.rnat;									\
   42.21 +(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
   42.22 +(pKStk) mov r1=sp;					/* get sp  */				\
   42.23 +	;;											\
   42.24 +(pUStk) lfetch.fault.excl.nt1 [r22];								\
   42.25 +(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
   42.26 +(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
   42.27 +	;;											\
   42.28 +(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
   42.29 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
   42.30 +	;;											\
   42.31 +(pUStk)	mov r18=ar.bsp;										\
   42.32 +(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
   42.33 +
   42.34 +#define MINSTATE_END_SAVE_MIN_VIRT								\
   42.35 +	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
   42.36 +	;;
   42.37 +
   42.38 +/*
   42.39 + * For mca_asm.S we want to access the stack physically since the state is saved before we
   42.40 + * go virtual and don't want to destroy the iip or ipsr.
   42.41 + */
   42.42 +#define MINSTATE_START_SAVE_MIN_PHYS								\
   42.43 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;								\
   42.44 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;							\
   42.45 +(pKStk) ld8 r3 = [r3];;										\
   42.46 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;						\
   42.47 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;						\
   42.48 +(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
   42.49 +(pUStk)	addl r22=IA64_RBS_OFFSET,r1;		/* compute base of register backing store */	\
   42.50 +	;;											\
   42.51 +(pUStk)	mov r24=ar.rnat;									\
   42.52 +(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
   42.53 +(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
   42.54 +(pUStk)	dep r22=-1,r22,61,3;			/* compute kernel virtual addr of RBS */	\
   42.55 +	;;											\
   42.56 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
   42.57 +(pUStk)	mov ar.bspstore=r22;			/* switch to kernel RBS */			\
   42.58 +	;;											\
   42.59 +(pUStk)	mov r18=ar.bsp;										\
   42.60 +(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
   42.61 +
   42.62 +#define MINSTATE_END_SAVE_MIN_PHYS								\
   42.63 +	dep r12=-1,r12,61,3;		/* make sp a kernel virtual address */			\
   42.64 +	;;
   42.65 +
   42.66 +#ifdef MINSTATE_VIRT
   42.67 +# define MINSTATE_GET_CURRENT(reg)	mov reg=IA64_KR(CURRENT)
   42.68 +# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_VIRT
   42.69 +# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_VIRT
   42.70 +#endif
   42.71 +
   42.72 +#ifdef MINSTATE_PHYS
   42.73 +# define MINSTATE_GET_CURRENT(reg)	mov reg=IA64_KR(CURRENT);; tpa reg=reg
   42.74 +# define MINSTATE_START_SAVE_MIN	MINSTATE_START_SAVE_MIN_PHYS
   42.75 +# define MINSTATE_END_SAVE_MIN		MINSTATE_END_SAVE_MIN_PHYS
   42.76 +#endif
   42.77 +
   42.78 +/*
   42.79 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
    42.80 + * the minimum state necessary to turn psr.ic back
   42.81 + * on.
   42.82 + *
   42.83 + * Assumed state upon entry:
   42.84 + *	psr.ic: off
   42.85 + *	r31:	contains saved predicates (pr)
   42.86 + *
   42.87 + * Upon exit, the state is as follows:
   42.88 + *	psr.ic: off
   42.89 + *	 r2 = points to &pt_regs.r16
   42.90 + *	 r8 = contents of ar.ccv
   42.91 + *	 r9 = contents of ar.csd
   42.92 + *	r10 = contents of ar.ssd
   42.93 + *	r11 = FPSR_DEFAULT
   42.94 + *	r12 = kernel sp (kernel virtual address)
   42.95 + *	r13 = points to current task_struct (kernel virtual address)
   42.96 + *	p15 = TRUE if psr.i is set in cr.ipsr
   42.97 + *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
   42.98 + *		preserved
   42.99 + *
  42.100 + * Note that psr.ic is NOT turned on by this macro.  This is so that
  42.101 + * we can pass interruption state as arguments to a handler.
  42.102 + */
  42.103 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
  42.104 +	MINSTATE_GET_CURRENT(r16);	/* M (or M;;I) */					\
  42.105 +	mov r27=ar.rsc;			/* M */							\
  42.106 +	mov r20=r1;			/* A */							\
  42.107 +	mov r25=ar.unat;		/* M */							\
  42.108 +	mov r29=cr.ipsr;		/* M */							\
  42.109 +	mov r26=ar.pfs;			/* I */							\
  42.110 +	mov r28=cr.iip;			/* M */							\
  42.111 +	mov r21=ar.fpsr;		/* M */							\
  42.112 +	COVER;				/* B;; (or nothing) */					\
  42.113 +	;;											\
  42.114 +	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
  42.115 +	;;											\
  42.116 +	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
  42.117 +	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
  42.118 +	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
  42.119 +	/* switch from user to kernel RBS: */							\
  42.120 +	;;											\
  42.121 +	invala;				/* M */							\
  42.122 +	SAVE_IFS;										\
  42.123 +	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
  42.124 +	;;											\
  42.125 +	MINSTATE_START_SAVE_MIN									\
  42.126 +	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
  42.127 +	adds r16=PT(CR_IPSR),r1;								\
  42.128 +	;;											\
  42.129 +	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
  42.130 +	st8 [r16]=r29;		/* save cr.ipsr */						\
  42.131 +	;;											\
  42.132 +	lfetch.fault.excl.nt1 [r17];								\
  42.133 +	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
  42.134 +	mov r29=b0										\
  42.135 +	;;											\
  42.136 +	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
  42.137 +	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
  42.138 +(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
  42.139 +	;;											\
  42.140 +.mem.offset 0,0; st8.spill [r16]=r8,16;								\
  42.141 +.mem.offset 8,0; st8.spill [r17]=r9,16;								\
  42.142 +        ;;											\
  42.143 +.mem.offset 0,0; st8.spill [r16]=r10,24;							\
  42.144 +.mem.offset 8,0; st8.spill [r17]=r11,24;							\
  42.145 +        ;;											\
  42.146 +	st8 [r16]=r28,16;	/* save cr.iip */						\
  42.147 +	st8 [r17]=r30,16;	/* save cr.ifs */						\
  42.148 +(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
  42.149 +	mov r8=ar.ccv;										\
  42.150 +	mov r9=ar.csd;										\
  42.151 +	mov r10=ar.ssd;										\
  42.152 +	movl r11=FPSR_DEFAULT;   /* L-unit */							\
  42.153 +	;;											\
  42.154 +	st8 [r16]=r25,16;	/* save ar.unat */						\
  42.155 +	st8 [r17]=r26,16;	/* save ar.pfs */						\
  42.156 +	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
  42.157 +	;;											\
  42.158 +	st8 [r16]=r27,16;	/* save ar.rsc */						\
  42.159 +(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
  42.160 +(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
  42.161 +	;;			/* avoid RAW on r16 & r17 */					\
  42.162 +(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
  42.163 +	st8 [r17]=r31,16;	/* save predicates */						\
  42.164 +(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
  42.165 +	;;											\
  42.166 +	st8 [r16]=r29,16;	/* save b0 */							\
  42.167 +	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
  42.168 +	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
  42.169 +	;;											\
  42.170 +.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
  42.171 +.mem.offset 8,0; st8.spill [r17]=r12,16;							\
  42.172 +	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
  42.173 +	;;											\
  42.174 +.mem.offset 0,0; st8.spill [r16]=r13,16;							\
  42.175 +.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
  42.176 +	mov r13=IA64_KR(CURRENT);	/* establish `current' */				\
  42.177 +	;;											\
  42.178 +.mem.offset 0,0; st8.spill [r16]=r15,16;							\
  42.179 +.mem.offset 8,0; st8.spill [r17]=r14,16;							\
  42.180 +	;;											\
  42.181 +.mem.offset 0,0; st8.spill [r16]=r2,16;								\
  42.182 +.mem.offset 8,0; st8.spill [r17]=r3,16;								\
  42.183 +	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
  42.184 +	;;											\
  42.185 +	EXTRA;											\
  42.186 +	movl r1=__gp;		/* establish kernel global pointer */				\
  42.187 +	;;											\
  42.188 +	MINSTATE_END_SAVE_MIN
  42.189 +
  42.190 +/*
  42.191 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
  42.192 + *
  42.193 + * Assumed state upon entry:
  42.194 + *	psr.ic: on
  42.195 + *	r2:	points to &pt_regs.r16
  42.196 + *	r3:	points to &pt_regs.r17
  42.197 + *	r8:	contents of ar.ccv
  42.198 + *	r9:	contents of ar.csd
  42.199 + *	r10:	contents of ar.ssd
  42.200 + *	r11:	FPSR_DEFAULT
  42.201 + *
  42.202 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
  42.203 + */
  42.204 +#define SAVE_REST				\
  42.205 +.mem.offset 0,0; st8.spill [r2]=r16,16;		\
  42.206 +.mem.offset 8,0; st8.spill [r3]=r17,16;		\
  42.207 +	;;					\
  42.208 +.mem.offset 0,0; st8.spill [r2]=r18,16;		\
  42.209 +.mem.offset 8,0; st8.spill [r3]=r19,16;		\
  42.210 +	;;					\
  42.211 +.mem.offset 0,0; st8.spill [r2]=r20,16;		\
  42.212 +.mem.offset 8,0; st8.spill [r3]=r21,16;		\
  42.213 +	mov r18=b6;				\
  42.214 +	;;					\
  42.215 +.mem.offset 0,0; st8.spill [r2]=r22,16;		\
  42.216 +.mem.offset 8,0; st8.spill [r3]=r23,16;		\
  42.217 +	mov r19=b7;				\
  42.218 +	;;					\
  42.219 +.mem.offset 0,0; st8.spill [r2]=r24,16;		\
  42.220 +.mem.offset 8,0; st8.spill [r3]=r25,16;		\
  42.221 +	;;					\
  42.222 +.mem.offset 0,0; st8.spill [r2]=r26,16;		\
  42.223 +.mem.offset 8,0; st8.spill [r3]=r27,16;		\
  42.224 +	;;					\
  42.225 +.mem.offset 0,0; st8.spill [r2]=r28,16;		\
  42.226 +.mem.offset 8,0; st8.spill [r3]=r29,16;		\
  42.227 +	;;					\
  42.228 +.mem.offset 0,0; st8.spill [r2]=r30,16;		\
  42.229 +.mem.offset 8,0; st8.spill [r3]=r31,32;		\
  42.230 +	;;					\
  42.231 +	mov ar.fpsr=r11;	/* M-unit */	\
  42.232 +	st8 [r2]=r8,8;		/* ar.ccv */	\
  42.233 +	adds r24=PT(B6)-PT(F7),r3;		\
  42.234 +	;;					\
  42.235 +	stf.spill [r2]=f6,32;			\
  42.236 +	stf.spill [r3]=f7,32;			\
  42.237 +	;;					\
  42.238 +	stf.spill [r2]=f8,32;			\
  42.239 +	stf.spill [r3]=f9,32;			\
  42.240 +	;;					\
  42.241 +	stf.spill [r2]=f10;			\
  42.242 +	stf.spill [r3]=f11;			\
  42.243 +	adds r25=PT(B7)-PT(F11),r3;		\
  42.244 +	;;					\
  42.245 +	st8 [r24]=r18,16;       /* b6 */	\
  42.246 +	st8 [r25]=r19,16;       /* b7 */	\
  42.247 +	;;					\
  42.248 +	st8 [r24]=r9;        	/* ar.csd */	\
  42.249 +	st8 [r25]=r10;      	/* ar.ssd */	\
  42.250 +	;;
  42.251 +
  42.252 +#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(cover, mov r30=cr.ifs,)
  42.253 +#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
  42.254 +#define SAVE_MIN		DO_SAVE_MIN(     , mov r30=r0, )
    43.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    43.2 +++ b/xen/arch/ia64/linux/patch.c	Tue Aug 02 15:59:09 2005 -0800
    43.3 @@ -0,0 +1,189 @@
    43.4 +/*
    43.5 + * Instruction-patching support.
    43.6 + *
    43.7 + * Copyright (C) 2003 Hewlett-Packard Co
    43.8 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    43.9 + */
   43.10 +#include <linux/init.h>
   43.11 +#include <linux/string.h>
   43.12 +
   43.13 +#include <asm/patch.h>
   43.14 +#include <asm/processor.h>
   43.15 +#include <asm/sections.h>
   43.16 +#include <asm/system.h>
   43.17 +#include <asm/unistd.h>
   43.18 +
   43.19 +/*
   43.20 + * This was adapted from code written by Tony Luck:
   43.21 + *
   43.22 + * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle
   43.23 + * like this:
   43.24 + *
   43.25 + * 6  6         5         4         3         2         1
   43.26 + * 3210987654321098765432109876543210987654321098765432109876543210
   43.27 + * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG
   43.28 + *
   43.29 + * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
   43.30 + * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB
   43.31 + */
   43.32 +static u64
   43.33 +get_imm64 (u64 insn_addr)
   43.34 +{
   43.35 +	u64 *p = (u64 *) (insn_addr & -16);	/* mask out slot number */
   43.36 +
   43.37 +	return ( (p[1] & 0x0800000000000000UL) << 4)  | /*A*/
   43.38 +		((p[1] & 0x00000000007fffffUL) << 40) | /*B*/
   43.39 +		((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/
   43.40 +		((p[1] & 0x0000100000000000UL) >> 23) | /*D*/
   43.41 +		((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/
   43.42 +		((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/
   43.43 +		((p[1] & 0x000007f000000000UL) >> 36);  /*G*/
   43.44 +}
   43.45 +
   43.46 +/* Patch instruction with "val" where "mask" has 1 bits. */
   43.47 +void
   43.48 +ia64_patch (u64 insn_addr, u64 mask, u64 val)
   43.49 +{
   43.50 +	u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16);
   43.51 +#	define insn_mask ((1UL << 41) - 1)
   43.52 +	unsigned long shift;
   43.53 +
   43.54 +	b0 = b[0]; b1 = b[1];
   43.55 +	shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */
   43.56 +	if (shift >= 64) {
   43.57 +		m1 = mask << (shift - 64);
   43.58 +		v1 = val << (shift - 64);
   43.59 +	} else {
   43.60 +		m0 = mask << shift; m1 = mask >> (64 - shift);
   43.61 +		v0 = val  << shift; v1 = val >> (64 - shift);
   43.62 +		b[0] = (b0 & ~m0) | (v0 & m0);
   43.63 +	}
   43.64 +	b[1] = (b1 & ~m1) | (v1 & m1);
   43.65 +}
   43.66 +
   43.67 +void
   43.68 +ia64_patch_imm64 (u64 insn_addr, u64 val)
   43.69 +{
   43.70 +	ia64_patch(insn_addr,
   43.71 +		   0x01fffefe000UL, (  ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */
   43.72 +				     | ((val & 0x0000000000200000UL) <<  0) /* bit 21 -> 21 */
   43.73 +				     | ((val & 0x00000000001f0000UL) <<  6) /* bit 16 -> 22 */
   43.74 +				     | ((val & 0x000000000000ff80UL) << 20) /* bit  7 -> 27 */
   43.75 +				     | ((val & 0x000000000000007fUL) << 13) /* bit  0 -> 13 */));
   43.76 +	ia64_patch(insn_addr - 1, 0x1ffffffffffUL, val >> 22);
   43.77 +}
   43.78 +
   43.79 +void
   43.80 +ia64_patch_imm60 (u64 insn_addr, u64 val)
   43.81 +{
   43.82 +	ia64_patch(insn_addr,
   43.83 +		   0x011ffffe000UL, (  ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */
   43.84 +				     | ((val & 0x00000000000fffffUL) << 13) /* bit  0 -> 13 */));
   43.85 +	ia64_patch(insn_addr - 1, 0x1fffffffffcUL, val >> 18);
   43.86 +}
   43.87 +
   43.88 +/*
    43.90 + * We sometimes need to load the physical address of a kernel
   43.90 + * object.  Often we can convert the virtual address to physical
   43.91 + * at execution time, but sometimes (either for performance reasons
    43.93 + * or during error recovery) we cannot do this.  Patch the marked
   43.93 + * bundles to load the physical address.
   43.94 + */
   43.95 +void __init
   43.96 +ia64_patch_vtop (unsigned long start, unsigned long end)
   43.97 +{
   43.98 +	s32 *offp = (s32 *) start;
   43.99 +	u64 ip;
  43.100 +
  43.101 +	while (offp < (s32 *) end) {
  43.102 +		ip = (u64) offp + *offp;
  43.103 +
  43.104 +		/* replace virtual address with corresponding physical address: */
  43.105 +		ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip)));
  43.106 +		ia64_fc((void *) ip);
  43.107 +		++offp;
  43.108 +	}
  43.109 +	ia64_sync_i();
  43.110 +	ia64_srlz_i();
  43.111 +}
  43.112 +
  43.113 +void
  43.114 +ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
  43.115 +{
  43.116 +	static int first_time = 1;
  43.117 +	int need_workaround;
  43.118 +	s32 *offp = (s32 *) start;
  43.119 +	u64 *wp;
  43.120 +
  43.121 +	need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0);
  43.122 +
  43.123 +	if (first_time) {
  43.124 +		first_time = 0;
  43.125 +		if (need_workaround)
  43.126 +			printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n");
  43.127 +		else
  43.128 +			printk(KERN_INFO "McKinley Errata 9 workaround not needed; "
  43.129 +			       "disabling it\n");
  43.130 +	}
  43.131 +	if (need_workaround)
  43.132 +		return;
  43.133 +
  43.134 +	while (offp < (s32 *) end) {
  43.135 +		wp = (u64 *) ia64_imva((char *) offp + *offp);
  43.136 +		wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
  43.137 +		wp[1] = 0x0004000000000200UL;
  43.138 +		wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
  43.139 +		wp[3] = 0x0084006880000200UL;
  43.140 +		ia64_fc(wp); ia64_fc(wp + 2);
  43.141 +		++offp;
  43.142 +	}
  43.143 +	ia64_sync_i();
  43.144 +	ia64_srlz_i();
  43.145 +}
  43.146 +
  43.147 +static void
  43.148 +patch_fsyscall_table (unsigned long start, unsigned long end)
  43.149 +{
  43.150 +	extern unsigned long fsyscall_table[NR_syscalls];
  43.151 +	s32 *offp = (s32 *) start;
  43.152 +	u64 ip;
  43.153 +
  43.154 +	while (offp < (s32 *) end) {
  43.155 +		ip = (u64) ia64_imva((char *) offp + *offp);
  43.156 +		ia64_patch_imm64(ip, (u64) fsyscall_table);
  43.157 +		ia64_fc((void *) ip);
  43.158 +		++offp;
  43.159 +	}
  43.160 +	ia64_sync_i();
  43.161 +	ia64_srlz_i();
  43.162 +}
  43.163 +
  43.164 +static void
  43.165 +patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
  43.166 +{
  43.167 +	extern char fsys_bubble_down[];
  43.168 +	s32 *offp = (s32 *) start;
  43.169 +	u64 ip;
  43.170 +
  43.171 +	while (offp < (s32 *) end) {
  43.172 +		ip = (u64) offp + *offp;
  43.173 +		ia64_patch_imm60((u64) ia64_imva((void *) ip),
  43.174 +				 (u64) (fsys_bubble_down - (ip & -16)) / 16);
  43.175 +		ia64_fc((void *) ip);
  43.176 +		++offp;
  43.177 +	}
  43.178 +	ia64_sync_i();
  43.179 +	ia64_srlz_i();
  43.180 +}
  43.181 +
  43.182 +void
  43.183 +ia64_patch_gate (void)
  43.184 +{
  43.185 +#	define START(name)	((unsigned long) __start_gate_##name##_patchlist)
  43.186 +#	define END(name)	((unsigned long)__end_gate_##name##_patchlist)
  43.187 +
  43.188 +	patch_fsyscall_table(START(fsyscall), END(fsyscall));
  43.189 +	patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
  43.190 +	ia64_patch_vtop(START(vtop), END(vtop));
  43.191 +	ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
  43.192 +}
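
The scatter described in the comment above get_imm64() is easiest to verify with its inverse. The sketch below (not part of this changeset; scatter_imm64/gather_imm64 are hypothetical helpers) writes a 64-bit immediate into a fake two-word bundle using the same A..G field layout and reads it back with the masks used by get_imm64():

	#include <assert.h>
	#include <stdint.h>

	/* Inverse of get_imm64()'s extraction; opcode/template bits of a real
	 * X-unit movl bundle lie outside these masks and are left untouched. */
	static void scatter_imm64(uint64_t val, uint64_t b[2])
	{
		b[0] = (b[0] & ~0xffffc00000000000ULL) |
		       ((val << 24) & 0xffffc00000000000ULL);	/* C: val[39:22] */
		b[1] = (b[1] & ~0x0ffff7f0007fffffULL) |
		       ((val >>  4) & 0x0800000000000000ULL) |	/* A: val[63]    */
		       ((val >> 40) & 0x00000000007fffffULL) |	/* B: val[62:40] */
		       ((val << 23) & 0x0000100000000000ULL) |	/* D: val[21]    */
		       ((val << 29) & 0x0003e00000000000ULL) |	/* E: val[20:16] */
		       ((val << 43) & 0x07fc000000000000ULL) |	/* F: val[15:7]  */
		       ((val << 36) & 0x000007f000000000ULL);	/* G: val[6:0]   */
	}

	static uint64_t gather_imm64(const uint64_t b[2])	/* same masks as get_imm64() */
	{
		return ((b[1] & 0x0800000000000000ULL) <<  4) |
		       ((b[1] & 0x00000000007fffffULL) << 40) |
		       ((b[0] & 0xffffc00000000000ULL) >> 24) |
		       ((b[1] & 0x0000100000000000ULL) >> 23) |
		       ((b[1] & 0x0003e00000000000ULL) >> 29) |
		       ((b[1] & 0x07fc000000000000ULL) >> 43) |
		       ((b[1] & 0x000007f000000000ULL) >> 36);
	}

	int main(void)
	{
		uint64_t bundle[2] = { 0, 0 };
		uint64_t val = 0xdeadbeefcafef00dULL;

		scatter_imm64(val, bundle);
		assert(gather_imm64(bundle) == val);
		return 0;
	}
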
    44.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    44.2 +++ b/xen/arch/ia64/linux/pcdp.h	Tue Aug 02 15:59:09 2005 -0800
    44.3 @@ -0,0 +1,84 @@
    44.4 +/*
    44.5 + * Definitions for PCDP-defined console devices
    44.6 + *
    44.7 + * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
    44.8 + * v2.0:  http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf
    44.9 + *
   44.10 + * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
   44.11 + *	Khalid Aziz <khalid.aziz@hp.com>
   44.12 + *	Bjorn Helgaas <bjorn.helgaas@hp.com>
   44.13 + *
   44.14 + * This program is free software; you can redistribute it and/or modify
   44.15 + * it under the terms of the GNU General Public License version 2 as
   44.16 + * published by the Free Software Foundation.
   44.17 + */
   44.18 +
   44.19 +#define PCDP_CONSOLE			0
   44.20 +#define PCDP_DEBUG			1
   44.21 +#define PCDP_CONSOLE_OUTPUT		2
   44.22 +#define PCDP_CONSOLE_INPUT		3
   44.23 +
   44.24 +#define PCDP_UART			(0 << 3)
   44.25 +#define PCDP_VGA			(1 << 3)
   44.26 +#define PCDP_USB			(2 << 3)
   44.27 +
   44.28 +/* pcdp_uart.type and pcdp_device.type */
   44.29 +#define PCDP_CONSOLE_UART		(PCDP_UART | PCDP_CONSOLE)
   44.30 +#define PCDP_DEBUG_UART			(PCDP_UART | PCDP_DEBUG)
   44.31 +#define PCDP_CONSOLE_VGA		(PCDP_VGA  | PCDP_CONSOLE_OUTPUT)
   44.32 +#define PCDP_CONSOLE_USB		(PCDP_USB  | PCDP_CONSOLE_INPUT)
   44.33 +
   44.34 +/* pcdp_uart.flags */
   44.35 +#define PCDP_UART_EDGE_SENSITIVE	(1 << 0)
   44.36 +#define PCDP_UART_ACTIVE_LOW		(1 << 1)
   44.37 +#define PCDP_UART_PRIMARY_CONSOLE	(1 << 2)
   44.38 +#define PCDP_UART_IRQ			(1 << 6) /* in pci_func for rev < 3 */
   44.39 +#define PCDP_UART_PCI			(1 << 7) /* in pci_func for rev < 3 */
   44.40 +
   44.41 +struct pcdp_uart {
   44.42 +	u8				type;
   44.43 +	u8				bits;
   44.44 +	u8				parity;
   44.45 +	u8				stop_bits;
   44.46 +	u8				pci_seg;
   44.47 +	u8				pci_bus;
   44.48 +	u8				pci_dev;
   44.49 +	u8				pci_func;
   44.50 +	u64				baud;
   44.51 +	struct acpi_generic_address	addr;
   44.52 +	u16				pci_dev_id;
   44.53 +	u16				pci_vendor_id;
   44.54 +	u32				gsi;
   44.55 +	u32				clock_rate;
   44.56 +	u8				pci_prog_intfc;
   44.57 +	u8				flags;
   44.58 +};
   44.59 +
   44.60 +struct pcdp_vga {
   44.61 +	u8			count;		/* address space descriptors */
   44.62 +};
   44.63 +
   44.64 +/* pcdp_device.flags */
   44.65 +#define PCDP_PRIMARY_CONSOLE	1
   44.66 +
   44.67 +struct pcdp_device {
   44.68 +	u8			type;
   44.69 +	u8			flags;
   44.70 +	u16			length;
   44.71 +	u16			efi_index;
   44.72 +};
   44.73 +
   44.74 +struct pcdp {
   44.75 +	u8			signature[4];
   44.76 +	u32			length;
   44.77 +	u8			rev;		/* PCDP v2.0 is rev 3 */
   44.78 +	u8			chksum;
   44.79 +	u8			oemid[6];
   44.80 +	u8			oem_tabid[8];
   44.81 +	u32			oem_rev;
   44.82 +	u8			creator_id[4];
   44.83 +	u32			creator_rev;
   44.84 +	u32			num_uarts;
   44.85 +	struct pcdp_uart	uart[0];	/* actual size is num_uarts */
   44.86 +	/* remainder of table is pcdp_device structures */
   44.87 +};
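
Given those structures, the table is laid out as a header, then num_uarts fixed-size pcdp_uart entries, then variable-length pcdp_device entries up to pcdp->length. A simplified, hypothetical walk (not part of this changeset; revision-dependent flag semantics are ignored):

	/* Find the primary console in an already-mapped PCDP table. */
	static void pcdp_walk_example(struct pcdp *pcdp)
	{
		unsigned char *end = (unsigned char *) pcdp + pcdp->length;
		struct pcdp_uart *uart = pcdp->uart;
		struct pcdp_device *dev;
		u32 i;

		for (i = 0; i < pcdp->num_uarts; i++, uart++)
			if (uart->flags & PCDP_UART_PRIMARY_CONSOLE)
				break;	/* e.g. hand uart->addr and uart->baud to the serial console */

		/* device entries follow the uart array; each carries its own length */
		for (dev = (struct pcdp_device *) &pcdp->uart[pcdp->num_uarts];
		     (unsigned char *) dev < end;
		     dev = (struct pcdp_device *) ((unsigned char *) dev + dev->length))
			if (dev->type == PCDP_CONSOLE_VGA && (dev->flags & PCDP_PRIMARY_CONSOLE))
				break;	/* primary VGA console */
	}
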
    45.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    45.2 +++ b/xen/arch/ia64/linux/sal.c	Tue Aug 02 15:59:09 2005 -0800
    45.3 @@ -0,0 +1,302 @@
    45.4 +/*
    45.5 + * System Abstraction Layer (SAL) interface routines.
    45.6 + *
    45.7 + * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
    45.8 + *	David Mosberger-Tang <davidm@hpl.hp.com>
    45.9 + * Copyright (C) 1999 VA Linux Systems
   45.10 + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
   45.11 + */
   45.12 +#include <linux/config.h>
   45.13 +
   45.14 +#include <linux/kernel.h>
   45.15 +#include <linux/init.h>
   45.16 +#include <linux/module.h>
   45.17 +#include <linux/spinlock.h>
   45.18 +#include <linux/string.h>
   45.19 +
   45.20 +#include <asm/page.h>
   45.21 +#include <asm/sal.h>
   45.22 +#include <asm/pal.h>
   45.23 +
   45.24 + __cacheline_aligned DEFINE_SPINLOCK(sal_lock);
   45.25 +unsigned long sal_platform_features;
   45.26 +
   45.27 +unsigned short sal_revision;
   45.28 +unsigned short sal_version;
   45.29 +
   45.30 +#define SAL_MAJOR(x) ((x) >> 8)
   45.31 +#define SAL_MINOR(x) ((x) & 0xff)
   45.32 +
   45.33 +static struct {
   45.34 +	void *addr;	/* function entry point */
   45.35 +	void *gpval;	/* gp value to use */
   45.36 +} pdesc;
   45.37 +
   45.38 +static long
   45.39 +default_handler (void)
   45.40 +{
   45.41 +	return -1;
   45.42 +}
   45.43 +
   45.44 +ia64_sal_handler ia64_sal = (ia64_sal_handler) default_handler;
   45.45 +ia64_sal_desc_ptc_t *ia64_ptc_domain_info;
   45.46 +
   45.47 +const char *
   45.48 +ia64_sal_strerror (long status)
   45.49 +{
   45.50 +	const char *str;
   45.51 +	switch (status) {
   45.52 +	      case 0: str = "Call completed without error"; break;
   45.53 +	      case 1: str = "Effect a warm boot of the system to complete "
   45.54 +			      "the update"; break;
   45.55 +	      case -1: str = "Not implemented"; break;
   45.56 +	      case -2: str = "Invalid argument"; break;
   45.57 +	      case -3: str = "Call completed with error"; break;
   45.58 +	      case -4: str = "Virtual address not registered"; break;
   45.59 +	      case -5: str = "No information available"; break;
   45.60 +	      case -6: str = "Insufficient space to add the entry"; break;
   45.61 +	      case -7: str = "Invalid entry_addr value"; break;
   45.62 +	      case -8: str = "Invalid interrupt vector"; break;
   45.63 +	      case -9: str = "Requested memory not available"; break;
   45.64 +	      case -10: str = "Unable to write to the NVM device"; break;
   45.65 +	      case -11: str = "Invalid partition type specified"; break;
   45.66 +	      case -12: str = "Invalid NVM_Object id specified"; break;
   45.67 +	      case -13: str = "NVM_Object already has the maximum number "
   45.68 +				"of partitions"; break;
   45.69 +	      case -14: str = "Insufficient space in partition for the "
   45.70 +				"requested write sub-function"; break;
   45.71 +	      case -15: str = "Insufficient data buffer space for the "
   45.72 +				"requested read record sub-function"; break;
   45.73 +	      case -16: str = "Scratch buffer required for the write/delete "
   45.74 +				"sub-function"; break;
   45.75 +	      case -17: str = "Insufficient space in the NVM_Object for the "
   45.76 +				"requested create sub-function"; break;
   45.77 +	      case -18: str = "Invalid value specified in the partition_rec "
   45.78 +				"argument"; break;
   45.79 +	      case -19: str = "Record oriented I/O not supported for this "
   45.80 +				"partition"; break;
   45.81 +	      case -20: str = "Bad format of record to be written or "
   45.82 +				"required keyword variable not "
   45.83 +				"specified"; break;
   45.84 +	      default: str = "Unknown SAL status code"; break;
   45.85 +	}
   45.86 +	return str;
   45.87 +}
   45.88 +
   45.89 +void __init
   45.90 +ia64_sal_handler_init (void *entry_point, void *gpval)
   45.91 +{
   45.92 +	/* fill in the SAL procedure descriptor and point ia64_sal to it: */
   45.93 +	pdesc.addr = entry_point;
   45.94 +	pdesc.gpval = gpval;
   45.95 +	ia64_sal = (ia64_sal_handler) &pdesc;
   45.96 +}
   45.97 +
   45.98 +static void __init
   45.99 +check_versions (struct ia64_sal_systab *systab)
  45.100 +{
  45.101 +	sal_revision = (systab->sal_rev_major << 8) | systab->sal_rev_minor;
  45.102 +	sal_version = (systab->sal_b_rev_major << 8) | systab->sal_b_rev_minor;
  45.103 +
  45.104 +	/* Check for broken firmware */
  45.105 +	if ((sal_revision == SAL_VERSION_CODE(49, 29))
  45.106 +	    && (sal_version == SAL_VERSION_CODE(49, 29)))
  45.107 +	{
  45.108 +		/*
   45.109 +		 * Old firmware for zx2000 prototypes has this weird version number;
  45.110 +		 * reset it to something sane.
  45.111 +		 */
  45.112 +		sal_revision = SAL_VERSION_CODE(2, 8);
  45.113 +		sal_version = SAL_VERSION_CODE(0, 0);
  45.114 +	}
  45.115 +}
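
check_versions() packs each revision as (major << 8) | minor, which the SAL_MAJOR()/SAL_MINOR() macros near the top of the file undo. A hypothetical decode, the kind of thing the SAL init path does when announcing the firmware (not part of this changeset):

	static void report_sal_revision(void)
	{
		/* sal_revision/sal_version were filled in by check_versions() */
		printk(KERN_INFO "SAL spec %x.%02x, vendor version %x.%02x\n",
		       SAL_MAJOR(sal_revision), SAL_MINOR(sal_revision),
		       SAL_MAJOR(sal_version), SAL_MINOR(sal_version));
	}
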
  45.116 +
  45.117 +static void __init
  45.118 +sal_desc_entry_point (void *p)
  45.119 +{
  45.120 +	struct ia64_sal_desc_entry_point *ep = p;
  45.121 +	ia64_pal_handler_init(__va(ep->pal_proc));
  45.122 +	ia64_sal_handler_init(__va(ep->sal_proc), __va(ep->gp));
  45.123 +}
  45.124 +
  45.125 +#ifdef CONFIG_SMP
  45.126 +static void __init
  45.127 +set_smp_redirect (int flag)
  45.128 +{
  45.129 +#ifndef CONFIG_HOTPLUG_CPU
  45.130 +	if (no_int_routing)
  45.131 +		smp_int_redirect &= ~flag;
  45.132 +	else
  45.133 +		smp_int_redirect |= flag;
  45.134 +#else
  45.135 +	/*
   45.136 +	 * For CPU Hotplug we don't want to do any chipset-supported
   45.137 +	 * interrupt redirection, because that would require stopping
   45.138 +	 * all interrupts and hard-binding the irq to a cpu, and then
   45.139 +	 * setting the redirection hint again in the vector each time
   45.140 +	 * the interrupt fires.  This is cumbersome for something that
   45.141 +	 * the user mode irq balancer will solve anyway.
  45.142 +	 */
  45.143 +	no_int_routing=1;
  45.144 +	smp_int_redirect &= ~flag;
  45.145 +#endif
  45.146 +}
  45.147 +#else
  45.148 +#define set_smp_redirect(flag)	do { } while (0)
  45.149 +#endif
  45.150 +
  45.151 +static void __init
  45.152 +sal_desc_platform_feature (void *p)
  45.153 +{
  45.154 +	struct ia64_sal_desc_platform_feature *pf = p;
  45.155 +	sal_platform_features = pf->feature_mask;
  45.156 +
  45.157 +	printk(KERN_INFO "SAL Platform features:");
  45.158 +	if (!sal_platform_features) {
  45.159 +		printk(" None\n");
  45.160 +		return;
  45.161 +	}
  45.162 +
  45.163 +	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_BUS_LOCK)
  45.164 +		printk(" BusLock");
  45.165 +	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT) {
  45.166 +		printk(" IRQ_Redirection");
  45.167 +		set_smp_redirect(SMP_IRQ_REDIRECTION);
  45.168 +	}
  45.169 +	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT) {
  45.170 +		printk(" IPI_Redirection");
  45.171 +		set_smp_redirect(SMP_IPI_REDIRECTION);
  45.172 +	}
  45.173 +	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)
  45.174 +		printk(" ITC_Drift");
  45.175 +	printk("\n");
  45.176 +}
  45.177 +
  45.178 +#ifdef CONFIG_SMP
  45.179 +static void __init
  45.180 +sal_desc_ap_wakeup (void *p)
  45.181 +{
  45.182 +	struct ia64_sal_desc_ap_wakeup *ap = p;
  45.183 +
  45.184 +	switch (ap->mechanism) {
  45.185 +	case IA64_SAL_AP_EXTERNAL_INT:
  45.186 +		ap_wakeup_vector = ap->vector;
  45.187 +		printk(KERN_INFO "SAL: AP wakeup using external interrupt "
  45.188 +				"vector 0x%lx\n", ap_wakeup_vector);
  45.189 +		break;
  45.190 +	default:
  45.191 +		printk(KERN_ERR "SAL: AP wakeup mechanism unsupported!\n");
  45.192 +		break;
  45.193 +	}
  45.194 +}
  45.195 +
  45.196 +static void __init
  45.197