ia64/xen-unstable

changeset 7515:dd0db9019177

Fast tlb miss reflection (mostly working, missing robustness, default off)
Signed-off-by: Dan Magenheimer <dan.magenheimer@hp.com>
I've been maintaining this out-of-tree partially finished for some time
so am now checking it in but default off. The current version needs
some additional error checking and only works (fast) for domain0, but
it makes a significant performance difference.
author djm@kirby.fc.hp.com
date Mon Nov 07 10:40:42 2005 -0600 (2005-11-07)
parents 5a4893a537ca
children b6cce4237ded
files xen/arch/ia64/xen/hyperprivop.S xen/arch/ia64/xen/ivt.S xen/arch/ia64/xen/privop.c xen/arch/ia64/xen/vcpu.c
line diff
     1.1 --- a/xen/arch/ia64/xen/hyperprivop.S	Fri Nov 04 10:40:29 2005 -0600
     1.2 +++ b/xen/arch/ia64/xen/hyperprivop.S	Mon Nov 07 10:40:42 2005 -0600
     1.3 @@ -14,6 +14,7 @@
     1.4  #include <asm/system.h>
     1.5  #include <public/arch-ia64.h>
     1.6  
     1.7 +
     1.8  #define	_PAGE_PPN_MASK	0x0003fffffffff000 //asm/pgtable.h doesn't do assembly
     1.9  #define PAGE_PHYS	0x0010000000000761 //__pgprot(__DIRTY_BITS|_PAGE_PL_2|_PAGE_AR_RWX)
    1.10  #define _PAGE_PL_2	(2<<7)
    1.11 @@ -22,13 +23,14 @@
    1.12  #define FAST_HYPERPRIVOPS
    1.13  #define FAST_HYPERPRIVOP_CNT
    1.14  #define FAST_REFLECT_CNT
    1.15 -//#define FAST_TICK
    1.16 +//#define FAST_TICK // mostly working (unat problems) but default off for now
    1.17 +//#define FAST_TLB_MISS_REFLECT	// mostly working but default off for now
    1.18 +//#define FAST_ITC	// working but default off for now
    1.19  #define FAST_BREAK
    1.20  #define FAST_ACCESS_REFLECT
    1.21  #define FAST_RFI
    1.22  #define FAST_SSM_I
    1.23  #define FAST_PTC_GA
    1.24 -#undef FAST_ITC	// working but default off for now
    1.25  #undef RFI_TO_INTERRUPT // not working yet
    1.26  #endif
    1.27  
    1.28 @@ -705,6 +707,251 @@ GLOBAL_ENTRY(fast_access_reflect)
    1.29  	st8 [r23]=r22;;
    1.30  	br.cond.sptk.many fast_reflect;;
    1.31  
    1.32 +// when we get to here, VHPT_CCHAIN_LOOKUP has failed and everything
    1.33 +// is as it was at the time of original miss.  We want to preserve that
    1.34 +// so if we get a nested fault, we can just branch to page_fault
    1.35 +GLOBAL_ENTRY(fast_tlb_miss_reflect)
    1.36 +#ifndef FAST_TLB_MISS_REFLECT // see beginning of file
    1.37 +	br.spnt.few page_fault ;;
    1.38 +#endif
    1.39 +	mov r31=pr
    1.40 +	mov r30=cr.ipsr
    1.41 +	mov r29=cr.iip
    1.42 +	mov r16=cr.isr
    1.43 +	mov r17=cr.ifa;;
    1.44 +	// for now, always take slow path for region 0 (e.g. metaphys mode)
    1.45 +	extr.u r21=r17,61,3;;
    1.46 +	cmp.eq p7,p0=r0,r21
    1.47 +(p7)	br.spnt.few page_fault ;;
    1.48 +	// always take slow path for PL0 (e.g. __copy_from_user)
    1.49 +	extr.u r21=r30,IA64_PSR_CPL0_BIT,2 ;;
    1.50 +	cmp.eq p7,p0=r21,r0
    1.51 +(p7)	br.spnt.few page_fault ;;
    1.52 +	// slow path if strange ipsr or isr bits set
    1.53 +	extr.u r21=r30,IA64_PSR_BE_BIT,1 ;;
    1.54 +	cmp.ne p7,p0=r21,r0
    1.55 +(p7)	br.spnt.few page_fault ;;
    1.56 +	extr.u r21=r30,IA64_PSR_PP_BIT,1 ;;
    1.57 +	cmp.ne p7,p0=r21,r0
    1.58 +(p7)	br.spnt.few page_fault ;;
    1.59 +	movl r21=IA64_ISR_IR|IA64_ISR_SP|IA64_ISR_NA ;;
    1.60 +	and r21=r16,r21;;
    1.61 +	cmp.ne p7,p0=r0,r21
    1.62 +(p7)	br.spnt.few page_fault ;;
    1.63 +	// also take slow path if virtual psr.ic=0
    1.64 +	movl r18=XSI_PSR_IC;;
    1.65 +	ld4 r21=[r18];;
    1.66 +	cmp.eq p7,p0=r0,r21
    1.67 +(p7)	br.spnt.few page_fault ;;
    1.68 +	// OK, if we get to here, we are doing a fast vcpu_translate.  Need to:
    1.69 +	// 1) look in the virtual TR's (pinned), if not there
    1.70 +	// 2) look in the 1-entry TLB (pinned), if not there
    1.71 +	// 3) check the domain VHPT (NOT pinned, accesses domain memory!)
    1.72 +	// If we find it in any of these places, we need to effectively do
    1.73 +	// a hyper_itc_i/d
    1.74 +
    1.75 +	// short-term hack for now, if in region 5-7, take slow path
    1.76 +	// since all Linux TRs are in region 5 or 7, we need not check TRs
    1.77 +	extr.u r21=r17,61,3;;
    1.78 +	cmp.le p7,p0=5,r21
    1.79 +(p7)	br.spnt.few page_fault ;;
    1.80 +fast_tlb_no_tr_match:
    1.81 +	movl r27=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;
    1.82 +	ld8 r27=[r27];;
    1.83 +	tbit.nz p6,p7=r16,IA64_ISR_X_BIT;;
    1.84 +(p6)	adds r25=IA64_VCPU_ITLB_OFFSET,r27;;
    1.85 +(p7)	adds r25=IA64_VCPU_DTLB_OFFSET,r27;;
    1.86 +	ld8 r20=[r25],8;;
    1.87 +	tbit.z p7,p0=r20,0;;	// present?
    1.88 +(p7)	br.cond.spnt.few 1f;;
    1.89 +	// if ifa is in range of tlb, don't bother to check rid, go slow path
    1.90 +	ld8 r21=[r25],8;;
    1.91 +	mov r23=1
    1.92 +	extr.u r21=r21,2,6;;
    1.93 +	shl r22=r23,r21
    1.94 +	ld8 r21=[r25],8;;
    1.95 +	cmp.ltu p7,p0=r17,r21
    1.96 +(p7)	br.cond.sptk.many 1f;
    1.97 +	add r21=r22,r21;;
    1.98 +	cmp.ltu p7,p0=r17,r21
    1.99 +(p7)	br.cond.spnt.few page_fault;;
   1.100 +	
   1.101 +1:	// check the guest VHPT
   1.102 +	adds r19 = XSI_PTA_OFS-XSI_PSR_IC_OFS, r18;;
   1.103 +	ld8 r19=[r19];;
   1.104 +	tbit.nz p7,p0=r19,IA64_PTA_VF_BIT;;	// long format VHPT
   1.105 +(p7)	br.cond.spnt.few page_fault;;
   1.106 +	// if (!rr.ve || !(pta & IA64_PTA_VE)) take slow way for now
   1.107 +	// FIXME: later, we deliver an alt_d/i vector after thash and itir
   1.108 +	tbit.z p7,p0=r19,IA64_PTA_VE_BIT;;	// 
   1.109 +(p7)	br.cond.spnt.few page_fault;;
   1.110 +	extr.u r25=r17,61,3;;
   1.111 +	adds r21=XSI_RR0_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.112 +	shl r25=r25,3;;
   1.113 +	add r21=r21,r25;;
   1.114 +	ld8 r22=[r21];;
   1.115 +	tbit.z p7,p0=r22,0
   1.116 +(p7)	br.cond.spnt.few page_fault;;
   1.117 +
   1.118 +	// compute and save away itir (r22 & RR_PS_MASK)
   1.119 +	movl r21=0xfc;;
   1.120 +	and r22=r22,r21;;
   1.121 +	adds r21=XSI_ITIR_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.122 +	st8 [r21]=r22;;
   1.123 +	
   1.124 +	// save away ifa
   1.125 +	adds r21=XSI_IFA_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.126 +	st8 [r21]=r17;;
   1.127 +	// see vcpu_thash to save away iha
   1.128 +	shr.u r20 = r17, 61
   1.129 +	addl r25 = 1, r0
   1.130 +	movl r30 = 0xe000000000000000
   1.131 +	;;
   1.132 +	and r21 = r30, r17		// VHPT_Addr1
   1.133 +	;;
   1.134 +	shladd r28 = r20, 3, r18
   1.135 +	adds r19 = XSI_PTA_OFS-XSI_PSR_IC_OFS, r18
   1.136 +	;;
   1.137 +	adds r27 = XSI_RR0_OFS-XSI_PSR_IC_OFS, r28
   1.138 +	addl r28 = 32767, r0
   1.139 +	ld8 r24 = [r19]			// pta
   1.140 +	;;
   1.141 +	ld8 r23 = [r27]			// rrs[vadr>>61]
   1.142 +	extr.u r26 = r24, 2, 6
   1.143 +	;;
   1.144 +	extr.u r22 = r23, 2, 6
   1.145 +	shl r30 = r25, r26
   1.146 +	;;
   1.147 +	shr.u r19 = r17, r22
   1.148 +	shr.u r29 = r24, 15
   1.149 +	;;
   1.150 +	adds r30 = -1, r30
   1.151 +	;;
   1.152 +	shladd r27 = r19, 3, r0
   1.153 +	extr.u r26 = r30, 15, 46
   1.154 +	;;
   1.155 +	andcm r24 = r29, r26
   1.156 +	and r19 = r28, r27
   1.157 +	shr.u r25 = r27, 15
   1.158 +	;;
   1.159 +	and r23 = r26, r25
   1.160 +	;;
   1.161 +	or r22 = r24, r23
   1.162 +	;;
   1.163 +	dep.z r20 = r22, 15, 46
   1.164 +	;;
   1.165 +	or r30 = r20, r21
   1.166 +	;;
   1.167 +	//or r8 = r19, r30
   1.168 +	or r19 = r19, r30
   1.169 +	;;
   1.170 +	adds r23=XSI_IHA_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.171 +	st8 [r23]=r19;;
   1.172 +	// done with thash, check guest VHPT
   1.173 +
   1.174 +	adds r20 = XSI_PTA_OFS-XSI_PSR_IC_OFS, r18;;
   1.175 +	ld8 r24 = [r20];;			// pta
   1.176 +	// avoid recursively walking the VHPT
   1.177 +	// if (((r17=address ^ r24=pta) & ((itir_mask(pta) << 3) >> 3)) != 0) {
   1.178 +	mov r20=-8
   1.179 +	xor r21=r17,r24
   1.180 +	extr.u r24=r24,2,6;;
   1.181 +	shl r20=r20,r24;;
   1.182 +	shr.u r20=r20,3;;
   1.183 +	and r21=r20,r21;;
   1.184 +	cmp.eq p7,p0=r21,r0
   1.185 +(p7)	br.cond.spnt.few 1f;;
   1.186 +	// __copy_from_user(&pte, r19=(void *)(*iha), sizeof(pte)=8)
   1.187 +	// prepare for possible nested dtlb fault
   1.188 +	mov r29=b0
   1.189 +	movl r30=guest_vhpt_miss;;
   1.190 +	// now go fetch the entry from the guest VHPT
   1.191 +	ld8 r20=[r19];;
   1.192 +	// if we wind up here, we successfully loaded the VHPT entry
   1.193 +
   1.194 +	// this VHPT walker aborts on non-present pages instead
   1.195 +	// of inserting a not-present translation, this allows
   1.196 +	// vectoring directly to the miss handler
   1.197 +	tbit.z p7,p0=r20,0
   1.198 +(p7)	br.cond.spnt.few page_not_present;;
   1.199 +
   1.200 +#ifdef FAST_REFLECT_CNT
   1.201 +	movl r21=fast_vhpt_translate_count;;
   1.202 +	ld8 r22=[r21];;
   1.203 +	adds r22=1,r22;;
   1.204 +	st8 [r21]=r22;;
   1.205 +#endif
   1.206 +
   1.207 +// prepare for fast_insert(PSCB(ifa),PSCB(itir),r16=pte)
   1.208 +//	r16 == pte
   1.209 +//	r17 == bit0: 1=inst, 0=data; bit1: 1=itc, 0=vcpu_translate
   1.210 +//	r18 == XSI_PSR_IC_OFS
   1.211 +//	r24 == ps
   1.212 +//	r29 == saved value of b0 in case of recovery
   1.213 +//	r30 == recovery ip if failure occurs
   1.214 +//	r31 == pr
   1.215 +	tbit.nz p6,p7=r16,IA64_ISR_X_BIT;;
   1.216 +(p6)	mov r17=1;;
   1.217 +(p7)	mov r17=0;;
   1.218 +	mov r16=r20
   1.219 +	mov r29=b0 ;;
   1.220 +	movl r30=recover_and_page_fault ;;
   1.221 +	adds r21=XSI_ITIR_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.222 +	ld8 r24=[r21];;
   1.223 +	extr.u r24=r24,2,6;;
   1.224 +	// IFA already in PSCB
   1.225 +	br.cond.sptk.many fast_insert;;
   1.226 +
   1.227 +// we get here if fast_insert fails (e.g. due to metaphysical lookup)
   1.228 +ENTRY(recover_and_page_fault)
   1.229 +#ifdef FAST_REFLECT_CNT
   1.230 +	movl r21=recover_to_page_fault_count;;
   1.231 +	ld8 r22=[r21];;
   1.232 +	adds r22=1,r22;;
   1.233 +	st8 [r21]=r22;;
   1.234 +#endif
   1.235 +	mov b0=r29;;
   1.236 +	br.cond.sptk.many page_fault;;
   1.237 +
   1.238 +// if we wind up here, we missed in guest VHPT so recover
   1.239 +// from nested dtlb fault and reflect a tlb fault to the guest
   1.240 +guest_vhpt_miss:
   1.241 +	mov b0=r29;;
   1.242 +	// fault = IA64_VHPT_FAULT
   1.243 +	mov r20=r0
   1.244 +	br.cond.sptk.many 1f;
   1.245 +
   1.246 +	// if we get to here, we are ready to reflect
   1.247 +	// need to set up virtual ifa, iha, itir (fast_reflect handles
   1.248 +	// virtual isr, iip, ipsr, ifs
   1.249 +	// see vcpu_get_itir_on_fault: get ps,rid,(FIXME key) from rr[ifa]
   1.250 +page_not_present:
   1.251 +	tbit.nz p6,p7=r16,IA64_ISR_X_BIT;;
   1.252 +(p6)	movl r20=0x400;;
   1.253 +(p7)	movl r20=0x800;;
   1.254 +
   1.255 +1:	extr.u r25=r17,61,3;;
   1.256 +	adds r21=XSI_RR0_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.257 +	shl r25=r25,3;;
   1.258 +	add r21=r21,r25;;
   1.259 +	ld8 r22=[r21];;
   1.260 +	extr.u r22=r22,2,30;;
   1.261 +	dep.z r22=r22,2,30;;
   1.262 +	adds r23=XSI_ITIR_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.263 +	st8 [r23]=r22;;
   1.264 +
   1.265 +	// fast reflect expects
   1.266 +	//	r16 == cr.isr
   1.267 +	//	r18 == XSI_PSR_IC
   1.268 +	//	r20 == offset into ivt
   1.269 +	//	r29 == iip
   1.270 +	//	r30 == ipsr
   1.271 +	//	r31 == pr
   1.272 +	//mov r16=cr.isr
   1.273 +	mov r29=cr.iip
   1.274 +	mov r30=cr.ipsr
   1.275 +	br.sptk.many fast_reflect;;
   1.276 +END(fast_tlb_miss_reflect)
   1.277  
   1.278  // ensure that, if giving up, registers at entry to fast_hyperprivop unchanged
   1.279  ENTRY(hyper_rfi)
   1.280 @@ -1673,6 +1920,17 @@ 2:
   1.281  	;;
   1.282  END(hyper_ptc_ga)
   1.283  
   1.284 +// recovery block for hyper_itc metaphysical memory lookup
   1.285 +ENTRY(recover_and_dispatch_break_fault)
   1.286 +#ifdef FAST_REFLECT_CNT
   1.287 +	movl r21=recover_to_break_fault_count;;
   1.288 +	ld8 r22=[r21];;
   1.289 +	adds r22=1,r22;;
   1.290 +	st8 [r21]=r22;;
   1.291 +#endif
   1.292 +	mov b0=r29 ;;
   1.293 +	br.sptk.many dispatch_break_fault;;
   1.294 +
   1.295  //  Registers at entry
   1.296  //	r17 = break immediate (XEN_HYPER_ITC_D or I)
   1.297  //	r18 == XSI_PSR_IC_OFS
   1.298 @@ -1682,24 +1940,14 @@ ENTRY(hyper_itc_i)
   1.299  	// fall through, hyper_itc_d handles both i and d
   1.300  ENTRY(hyper_itc_d)
   1.301  #ifndef FAST_ITC
   1.302 -	br.spnt.many dispatch_break_fault ;;
   1.303 +	br.sptk.many dispatch_break_fault ;;
   1.304  #endif
   1.305 +	// ensure itir.ps >= xen's pagesize
   1.306  	adds r23=XSI_ITIR_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.307  	ld8 r23=[r23];;
   1.308  	extr.u r24=r23,2,6;;		// r24==logps
   1.309  	cmp.gt p7,p0=PAGE_SHIFT,r24
   1.310  (p7)	br.spnt.many dispatch_break_fault ;;
   1.311 -	// translate_domain_pte(r8=pteval,PSCB(ifa)=address,r24=itir)
   1.312 -	mov r19=1;;
   1.313 -	shl r20=r19,r24;;
   1.314 -	adds r20=-1,r20;;	// r20 == mask
   1.315 -	movl r19=_PAGE_PPN_MASK;;
   1.316 -	and r22=r8,r19;;	// r22 == pteval & _PAGE_PPN_MASK
   1.317 -	andcm r19=r22,r20;;
   1.318 -	adds r21=XSI_IFA_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.319 -	ld8 r21=[r21];;
   1.320 -	and r20=r21,r20;;
   1.321 -	or r19=r19,r20;;	// r19 == mpaddr
   1.322  	movl r27=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;
   1.323  	ld8 r27=[r27];;
   1.324  	adds r27=IA64_VCPU_DOMAIN_OFFSET,r27;;
   1.325 @@ -1710,7 +1958,6 @@ ENTRY(hyper_itc_d)
   1.326  // FIXME: for now, only handle dom0 (see lookup_domain_mpa below)
   1.327  	cmp.ne p7,p0=r27,r28
   1.328  (p7)	br.spnt.many dispatch_break_fault ;;
   1.329 -	// if region 6, go slow way
   1.330  #ifdef FAST_HYPERPRIVOP_CNT
   1.331  	cmp.eq p6,p7=XEN_HYPER_ITC_D,r17;;
   1.332  (p6)	movl r20=fast_hyperpriv_cnt+(8*XEN_HYPER_ITC_D);;
   1.333 @@ -1719,19 +1966,47 @@ ENTRY(hyper_itc_d)
   1.334  	adds r21=1,r21;;
   1.335  	st8 [r20]=r21;;
   1.336  #endif
   1.337 +(p6)	mov r17=2;;
   1.338 +(p7)	mov r17=3;;
   1.339 +	mov r29=b0 ;;
   1.340 +	movl r30=recover_and_dispatch_break_fault ;;
   1.341 +	mov r16=r8;;
   1.342 +	// fall through
   1.343 +
   1.344 +
   1.345 +// fast_insert(PSCB(ifa),r24=ps,r16=pte)
   1.346 +//	r16 == pte
   1.347 +//	r17 == bit0: 1=inst, 0=data; bit1: 1=itc, 0=vcpu_translate
   1.348 +//	r18 == XSI_PSR_IC_OFS
   1.349 +//	r24 == ps
   1.350 +//	r29 == saved value of b0 in case of recovery
   1.351 +//	r30 == recovery ip if failure occurs
   1.352 +//	r31 == pr
   1.353 +GLOBAL_ENTRY(fast_insert)
   1.354 +	// translate_domain_pte(r16=pteval,PSCB(ifa)=address,r24=itir)
   1.355 +	mov r19=1;;
   1.356 +	shl r20=r19,r24;;
   1.357 +	adds r20=-1,r20;;	// r20 == mask
   1.358 +	movl r19=_PAGE_PPN_MASK;;
   1.359 +	and r22=r16,r19;;	// r22 == pteval & _PAGE_PPN_MASK
   1.360 +	andcm r19=r22,r20;;
   1.361 +	adds r21=XSI_IFA_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.362 +	ld8 r21=[r21];;
   1.363 +	and r20=r21,r20;;
   1.364 +	or r19=r19,r20;;	// r19 == mpaddr
   1.365  // FIXME: for now, just do domain0 and skip mpaddr range checks
   1.366  	dep r20=r0,r19,0,PAGE_SHIFT
   1.367  	movl r21=PAGE_PHYS ;;
   1.368  	or r20=r20,r21 ;;	// r20==return value from lookup_domain_mpa
   1.369 -	// r8=pteval,r20=pteval2
   1.370 +	// r16=pteval,r20=pteval2
   1.371  	movl r19=_PAGE_PPN_MASK
   1.372  	movl r21=_PAGE_PL_2;;
   1.373 -	andcm r25=r8,r19;;	// r25==pteval & ~_PAGE_PPN_MASK
   1.374 +	andcm r25=r16,r19;;	// r25==pteval & ~_PAGE_PPN_MASK
   1.375  	and r22=r20,r19;;
   1.376  	or r22=r22,r21;;
   1.377  	or r22=r22,r25;;	// r22==return value from translate_domain_pte
   1.378  	// done with translate_domain_pte
   1.379 -	// now do vcpu_itc_no_srlz(vcpu,IorD,ifa,r22=pte,r8=mppte,r24=logps)
   1.380 +	// now do vcpu_itc_no_srlz(vcpu,IorD,ifa,r22=pte,r16=mppte,r24=logps)
   1.381  // FIXME: for now, just domain0 and skip range check
   1.382  	// psr.ic already cleared
   1.383  	// NOTE: r24 still contains ps (from above)
   1.384 @@ -1740,7 +2015,7 @@ ENTRY(hyper_itc_d)
   1.385  	adds r23=XSI_IFA_OFS-XSI_PSR_IC_OFS,r18 ;;
   1.386  	ld8 r23=[r23];;
   1.387  	mov cr.ifa=r23;;
   1.388 -	cmp.eq p6,p7=XEN_HYPER_ITC_D,r17;;
   1.389 +	tbit.z p6,p7=r17,0;;
   1.390  (p6)	itc.d r22;;
   1.391  (p7)	itc.i r22;;
   1.392  	dv_serialize_data
   1.393 @@ -1757,7 +2032,7 @@ ENTRY(hyper_itc_d)
   1.394  	st8 [r20]=r21;;
   1.395  	// vcpu_set_tr_entry(trp,r22=pte|1,r24=itir,r23=ifa)
   1.396  	// TR_ENTRY = {page_flags,itir,addr,rid}
   1.397 -	cmp.eq p6,p7=XEN_HYPER_ITC_D,r17
   1.398 +	tbit.z p6,p7=r17,0;;
   1.399  	movl r27=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;
   1.400  	ld8 r27=[r27];;
   1.401  	adds r28=IA64_VCPU_STARTING_RID_OFFSET,r27
   1.402 @@ -1768,7 +2043,6 @@ ENTRY(hyper_itc_d)
   1.403  	mov r19=-4096;;
   1.404  	and r23=r23,r19;;
   1.405  	st8 [r27]=r23,8;;	// ifa & ~0xfff
   1.406 -// ?? is virtualize_rid(v,get_rr(ifa))==vcpu_get_rr(ifa)?? YES!!
   1.407  	adds r29 = XSI_RR0_OFS-XSI_PSR_IC_OFS,r18
   1.408  	extr.u r25=r23,61,3;;
   1.409  	shladd r29=r25,3,r29;;
   1.410 @@ -1806,13 +2080,16 @@ 1:	// done with vcpu_set_tr_entry
   1.411  	//PSCBX(vcpu,i/dtlb_pte) = mp_pte
   1.412  	movl r27=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;
   1.413  	ld8 r27=[r27];;
   1.414 -	cmp.eq p6,p7=XEN_HYPER_ITC_D,r17;;
   1.415 +	tbit.z p6,p7=r17,0;;
   1.416  (p6)	adds r27=IA64_VCPU_DTLB_PTE_OFFSET,r27
   1.417  (p7)	adds r27=IA64_VCPU_ITLB_PTE_OFFSET,r27;;
   1.418 -	st8 [r27]=r8;;
   1.419 +	st8 [r27]=r16;;
   1.420  	// done with vcpu_itc_no_srlz
   1.421  
   1.422 -	// done, increment to point to next instruction
   1.423 +	// if hyper_itc, increment to point to next instruction
   1.424 +	tbit.z p7,p0=r17,1
   1.425 +(p7)	br.cond.sptk.few no_inc_iip;;
   1.426 +
   1.427  	mov r29=cr.ipsr
   1.428  	mov r30=cr.iip;;
   1.429  	extr.u r26=r29,41,2 ;;
   1.430 @@ -1824,8 +2101,11 @@ 1:	// done with vcpu_set_tr_entry
   1.431  	dep r29=r26,r29,41,2
   1.432  	;;
   1.433  	mov cr.ipsr=r29
   1.434 -	mov cr.iip=r30
   1.435 +	mov cr.iip=r30;;
   1.436 +
   1.437 +no_inc_iip:
   1.438  	mov pr=r31,-1 ;;
   1.439  	rfi
   1.440  	;;
   1.441 -END(hyper_itc_d)
   1.442 +END(fast_insert)
   1.443 +
     2.1 --- a/xen/arch/ia64/xen/ivt.S	Fri Nov 04 10:40:29 2005 -0600
     2.2 +++ b/xen/arch/ia64/xen/ivt.S	Mon Nov 07 10:40:42 2005 -0600
     2.3 @@ -246,7 +246,8 @@ ENTRY(itlb_miss)
     2.4  #ifdef XEN
     2.5  	VHPT_CCHAIN_LOOKUP(itlb_miss,i)
     2.6  #ifdef VHPT_GLOBAL
     2.7 -	br.cond.sptk page_fault
     2.8 +//	br.cond.sptk page_fault
     2.9 +	br.cond.sptk fast_tlb_miss_reflect
    2.10  	;;
    2.11  #endif
    2.12  #endif
    2.13 @@ -297,7 +298,8 @@ ENTRY(dtlb_miss)
    2.14  #ifdef XEN
    2.15  	VHPT_CCHAIN_LOOKUP(dtlb_miss,d)
    2.16  #ifdef VHPT_GLOBAL
    2.17 -	br.cond.sptk page_fault
    2.18 +//	br.cond.sptk page_fault
    2.19 +	br.cond.sptk fast_tlb_miss_reflect
    2.20  	;;
    2.21  #endif
    2.22  #endif
    2.23 @@ -485,6 +487,11 @@ END(alt_dtlb_miss)
    2.24  // 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
    2.25  ENTRY(nested_dtlb_miss)
    2.26  	DBG_FAULT(5)
    2.27 +#ifdef XEN
    2.28 +	mov b0=r30
    2.29 +	br.sptk.many b0				// return to continuation point
    2.30 +	;;
    2.31 +#endif
    2.32  	/*
    2.33  	 * In the absence of kernel bugs, we get here when the virtually mapped linear
    2.34  	 * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
    2.35 @@ -562,7 +569,11 @@ END(ikey_miss)
    2.36  
    2.37  	//-----------------------------------------------------------------------------------
    2.38  	// call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
    2.39 +#ifdef XEN
    2.40 +GLOBAL_ENTRY(page_fault)
    2.41 +#else
    2.42  ENTRY(page_fault)
    2.43 +#endif
    2.44  	ssm psr.dt
    2.45  	;;
    2.46  	srlz.i
     3.1 --- a/xen/arch/ia64/xen/privop.c	Fri Nov 04 10:40:29 2005 -0600
     3.2 +++ b/xen/arch/ia64/xen/privop.c	Mon Nov 07 10:40:42 2005 -0600
     3.3 @@ -1040,6 +1040,9 @@ extern unsigned long dtlb_translate_coun
     3.4  extern unsigned long tr_translate_count;
     3.5  extern unsigned long phys_translate_count;
     3.6  extern unsigned long vhpt_translate_count;
     3.7 +extern unsigned long fast_vhpt_translate_count;
     3.8 +extern unsigned long recover_to_page_fault_count;
     3.9 +extern unsigned long recover_to_break_fault_count;
    3.10  extern unsigned long lazy_cover_count;
    3.11  extern unsigned long idle_when_pending;
    3.12  extern unsigned long pal_halt_light_count;
    3.13 @@ -1049,9 +1052,12 @@ int dump_misc_stats(char *buf)
    3.14  {
    3.15  	char *s = buf;
    3.16  	s += sprintf(s,"Virtual TR translations: %d\n",tr_translate_count);
    3.17 -	s += sprintf(s,"Virtual VHPT translations: %d\n",vhpt_translate_count);
    3.18 +	s += sprintf(s,"Virtual VHPT slow translations: %d\n",vhpt_translate_count);
    3.19 +	s += sprintf(s,"Virtual VHPT fast translations: %d\n",fast_vhpt_translate_count);
    3.20  	s += sprintf(s,"Virtual DTLB translations: %d\n",dtlb_translate_count);
    3.21  	s += sprintf(s,"Physical translations: %d\n",phys_translate_count);
    3.22 +	s += sprintf(s,"Recoveries to page fault: %d\n",recover_to_page_fault_count);
    3.23 +	s += sprintf(s,"Recoveries to break fault: %d\n",recover_to_break_fault_count);
    3.24  	s += sprintf(s,"Idle when pending: %d\n",idle_when_pending);
    3.25  	s += sprintf(s,"PAL_HALT_LIGHT (no pending): %d\n",pal_halt_light_count);
    3.26  	s += sprintf(s,"context switches: %d\n",context_switch_count);
    3.27 @@ -1065,6 +1071,9 @@ void zero_misc_stats(void)
    3.28  	tr_translate_count = 0;
    3.29  	phys_translate_count = 0;
    3.30  	vhpt_translate_count = 0;
    3.31 +	fast_vhpt_translate_count = 0;
    3.32 +	recover_to_page_fault_count = 0;
    3.33 +	recover_to_break_fault_count = 0;
    3.34  	lazy_cover_count = 0;
    3.35  	pal_halt_light_count = 0;
    3.36  	idle_when_pending = 0;
     4.1 --- a/xen/arch/ia64/xen/vcpu.c	Fri Nov 04 10:40:29 2005 -0600
     4.2 +++ b/xen/arch/ia64/xen/vcpu.c	Mon Nov 07 10:40:42 2005 -0600
     4.3 @@ -1283,6 +1283,9 @@ IA64FAULT vcpu_ttag(VCPU *vcpu, UINT64 v
     4.4  #define itir_mask(itir) (~((1UL << itir_ps(itir)) - 1))
     4.5  
     4.6  unsigned long vhpt_translate_count = 0;
     4.7 +unsigned long fast_vhpt_translate_count = 0;
     4.8 +unsigned long recover_to_page_fault_count = 0;
     4.9 +unsigned long recover_to_break_fault_count = 0;
    4.10  
    4.11  IA64FAULT vcpu_translate(VCPU *vcpu, UINT64 address, BOOLEAN is_data, UINT64 *pteval, UINT64 *itir, UINT64 *iha)
    4.12  {