ia64/xen-unstable: xen/arch/ia64/xen/vhpt.c @ 16817:564fa97594a6

[IA64] Introduce dom0_vhpt_size_log2 boot option to change dom0 vhpt size

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Author: Alex Williamson <alex.williamson@hp.com>
Date:   Tue Jan 22 08:26:20 2008 -0700

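The new dom0_vhpt_size_log2 value is presumably given on the Xen boot command line (e.g. dom0_vhpt_size_log2=21 for a 2 MB dom0 VHPT); the option parsing itself lives outside this file, so the exact syntax is an assumption here.
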
/*
 * Initialize VHPT support.
 *
 * Copyright (C) 2004 Hewlett-Packard Co
 *      Dan Magenheimer <dan.magenheimer@hp.com>
 *
 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                    VA Linux Systems Japan K.K.
 *                    per vcpu vhpt support
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/vhpt.h>
#include <asm/vcpu.h>
#include <asm/vcpumask.h>
#include <asm/vmmu.h>

DEFINE_PER_CPU (unsigned long, vhpt_paddr);
DEFINE_PER_CPU (unsigned long, vhpt_pend);
#ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
DEFINE_PER_CPU(volatile u32, vhpt_tlbflush_timestamp);
#endif

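/* Invalidate every entry of the VHPT at vhpt_maddr (32 bytes per entry,
 * 2**vhpt_size_log2 bytes in total). */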
static void
__vhpt_flush(unsigned long vhpt_maddr, unsigned long vhpt_size_log2)
{
        struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
        unsigned long num_entries = 1 << (vhpt_size_log2 - 5);
        int i;

        for (i = 0; i < num_entries; i++, v++)
                v->ti_tag = INVALID_TI_TAG;
}

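/* Invalidate the VHPT of the current physical CPU and record the new
 * tlbflush clock timestamp for it. */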
void
local_vhpt_flush(void)
{
        /* increment flush clock before flush */
        u32 flush_time = tlbflush_clock_inc_and_return();
        __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr), VHPT_SIZE_LOG2);
        /* this must be after flush */
        tlbflush_update_time(&__get_cpu_var(vhpt_tlbflush_timestamp),
                             flush_time);
        perfc_incr(local_vhpt_flush);
}

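/* Invalidate the VHPT backing vcpu v, using the per-vcpu VHPT size when
 * the domain has per-vcpu VHPTs. */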
void
vcpu_vhpt_flush(struct vcpu* v)
{
        unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
        if (HAS_PERVCPU_VHPT(v->domain))
                vhpt_size_log2 = v->arch.pta.size;
#endif
        __vhpt_flush(vcpu_vhpt_maddr(v), vhpt_size_log2);
        perfc_incr(vcpu_vhpt_flush);
}

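/* Clear every field of every entry in the VHPT and mark the entries
 * invalid; used when a VHPT is first set up. */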
static void
vhpt_erase(unsigned long vhpt_maddr, unsigned long vhpt_size_log2)
{
        struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
        unsigned long num_entries = 1 << (vhpt_size_log2 - 5);
        int i;

        for (i = 0; i < num_entries; i++, v++) {
                v->itir = 0;
                v->CChain = 0;
                v->page_flags = 0;
                v->ti_tag = INVALID_TI_TAG;
        }
        // initialize cache too???
}

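/* Install a single translation into the long-format VHPT slot that
 * vadr hashes to. */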
void vhpt_insert (unsigned long vadr, unsigned long pte, unsigned long itir)
{
        struct vhpt_lf_entry *vlfe = (struct vhpt_lf_entry *)ia64_thash(vadr);
        unsigned long tag = ia64_ttag (vadr);

        /* Even though VHPT is per VCPU, still need to first disable the entry,
         * because the processor may support speculative VHPT walk. */
        vlfe->ti_tag = INVALID_TI_TAG;
        wmb();
        vlfe->itir = itir;
        vlfe->page_flags = pte | _PAGE_P;
        *(volatile unsigned long*)&vlfe->ti_tag = tag;
}

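/* Cover a mapping of size 2**itir.ps with individual VHPT entries of the
 * vcpu's VHPT page size. */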
void vhpt_multiple_insert(unsigned long vaddr, unsigned long pte,
                          unsigned long itir)
{
        unsigned char ps = current->arch.vhpt_pg_shift;
        ia64_itir_t _itir = {.itir = itir};
        unsigned long mask = (1L << _itir.ps) - 1;
        int i;

        if (_itir.ps - ps > 10 && !running_on_sim) {
                // if this happens, we may want to revisit this algorithm
                panic("vhpt_multiple_insert:logps-PAGE_SHIFT>10,spinning..\n");
        }
        if (_itir.ps - ps > 2) {
                // FIXME: Should add counter here to see how often this
                // happens (e.g. for 16MB pages!) and determine if it
                // is a performance problem. On a quick look, it takes
                // about 39000 instrs for a 16MB page and it seems to occur
                // only a few times/second, so OK for now.
                // An alternate solution would be to just insert the one
                // 16KB in the vhpt (but with the full mapping)?
                //printk("vhpt_multiple_insert: logps-PAGE_SHIFT==%d,"
                //"va=%p, pa=%p, pa-masked=%p\n",
                //logps-PAGE_SHIFT,vaddr,pte&_PFN_MASK,
                //(pte&_PFN_MASK)&~mask);
        }
        vaddr &= ~mask;
        pte = ((pte & _PFN_MASK) & ~mask) | (pte & ~_PFN_MASK);
        for (i = 1L << (_itir.ps - ps); i > 0; i--) {
                vhpt_insert(vaddr, pte, _itir.itir);
                vaddr += (1L << ps);
        }
}

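/* Allocate, alignment-check and clear the VHPT of the booting physical
 * CPU. */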
void __init vhpt_init(void)
{
        unsigned long paddr;
        struct page_info *page;
#if !VHPT_ENABLED
        return;
#endif
        /* This allocation only holds true if the vhpt table is unique for
         * all domains.  Otherwise a new vhpt table should later be allocated
         * from the domain heap when each domain is created.  Assume the xen
         * buddy allocator can provide a naturally aligned page by order?
         */
        page = alloc_domheap_pages(NULL, VHPT_SIZE_LOG2 - PAGE_SHIFT, 0);
        if (!page)
                panic("vhpt_init: can't allocate VHPT!\n");
        paddr = page_to_maddr(page);
        if (paddr & ((1 << VHPT_SIZE_LOG2) - 1))
                panic("vhpt_init: bad VHPT alignment!\n");
        __get_cpu_var(vhpt_paddr) = paddr;
        __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
        printk(XENLOG_DEBUG "vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
               paddr, __get_cpu_var(vhpt_pend));
        vhpt_erase(paddr, VHPT_SIZE_LOG2);
        // we don't enable VHPT here.
        // context_switch() or schedule_tail() does it.
}

#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
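/* Record the VHPT configuration requested for a domain: -1 keeps the
 * global per-CPU VHPT, any other value selects per-vcpu VHPTs of
 * 2**vhpt_size_log2 bytes. */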
void
domain_set_vhpt_size(struct domain *d, int8_t vhpt_size_log2)
{
        if (vhpt_size_log2 == -1) {
                d->arch.has_pervcpu_vhpt = 0;
                printk(XENLOG_INFO "XEN_DOMCTL_arch_setup: "
                       "domain %d VHPT is global.\n", d->domain_id);
        } else {
                d->arch.has_pervcpu_vhpt = 1;
                d->arch.vhpt_size_log2 = vhpt_size_log2;
                printk(XENLOG_INFO "XEN_DOMCTL_arch_setup: "
                       "domain %d VHPT is per vcpu. size=2**%d\n",
                       d->domain_id, vhpt_size_log2);
        }
}

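/* Allocate the per-vcpu VHPT for v and precompute the PTA value that will
 * be loaded when v runs. */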
int
pervcpu_vhpt_alloc(struct vcpu *v)
{
        unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;

        if (v->domain->arch.vhpt_size_log2 > 0)
                vhpt_size_log2 =
                        canonicalize_vhpt_size(v->domain->arch.vhpt_size_log2);
        printk(XENLOG_DEBUG "%s vhpt_size_log2=%ld\n",
               __func__, vhpt_size_log2);
        v->arch.vhpt_entries =
                (1UL << vhpt_size_log2) / sizeof(struct vhpt_lf_entry);
        v->arch.vhpt_page =
                alloc_domheap_pages(NULL, vhpt_size_log2 - PAGE_SHIFT, 0);
        if (!v->arch.vhpt_page)
                return -ENOMEM;

        v->arch.vhpt_maddr = page_to_maddr(v->arch.vhpt_page);
        if (v->arch.vhpt_maddr & ((1 << VHPT_SIZE_LOG2) - 1))
                panic("pervcpu_vhpt_init: bad VHPT alignment!\n");

        v->arch.pta.val = 0; // to zero reserved bits
        v->arch.pta.ve = 1; // enable vhpt
        v->arch.pta.size = vhpt_size_log2;
        v->arch.pta.vf = 1; // long format
        v->arch.pta.base = __va_ul(v->arch.vhpt_maddr) >> 15;

        vhpt_erase(v->arch.vhpt_maddr, vhpt_size_log2);
        smp_mb(); // per vcpu vhpt may be used by another physical cpu.
        return 0;
}

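/* Release the per-vcpu VHPT pages, if any were allocated. */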
void
pervcpu_vhpt_free(struct vcpu *v)
{
        if (likely(v->arch.vhpt_page != NULL))
                free_domheap_pages(v->arch.vhpt_page,
                                   v->arch.pta.size - PAGE_SHIFT);
}
#endif

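/* Purge the software itlb/dtlb translation cache entries of every
 * initialised vcpu of the domain. */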
void
domain_purge_swtc_entries(struct domain *d)
{
        struct vcpu* v;
        for_each_vcpu(d, v) {
                if (!v->is_initialised)
                        continue;

                /* Purge TC entries.
                   FIXME: clear only if match. */
                vcpu_purge_tr_entry(&PSCBX(v,dtlb));
                vcpu_purge_tr_entry(&PSCBX(v,itlb));
        }
}

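/* As domain_purge_swtc_entries(), but only for the vcpus set in
 * vcpu_dirty_mask. */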
void
domain_purge_swtc_entries_vcpu_dirty_mask(struct domain* d,
                                          vcpumask_t vcpu_dirty_mask)
{
        int vcpu;

        for_each_vcpu_mask(vcpu, vcpu_dirty_mask) {
                struct vcpu* v = d->vcpu[vcpu];
                if (!v->is_initialised)
                        continue;

                /* Purge TC entries.
                   FIXME: clear only if match. */
                vcpu_purge_tr_entry(&PSCBX(v, dtlb));
                vcpu_purge_tr_entry(&PSCBX(v, itlb));
        }
}

// SMP: we can't assume v == current, the vcpu might move to another physical cpu.
// So a memory barrier is necessary.
// If we can guarantee that the vcpu can run only on this physical cpu
// (e.g. vcpu == current), smp_mb() is unnecessary.
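/* Flush every cached translation of vcpu v: the vTLB for VT-i domains,
 * otherwise the software TC, the VHPT and the local machine TLB. */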
void vcpu_flush_vtlb_all(struct vcpu *v)
{
        if (VMX_DOMAIN(v)) {
                /* This code may be called for remapping the shared_info and
                   grant_table shared pages from guest_physmap_remove_page()
                   in arch_memory_op() XENMEM_add_to_physmap to realize the
                   PV-on-HVM feature. */
                /* FIXME: This is not SMP-safe yet about p2m table */
                /* Purge vTLB for VT-i domain */
                thash_purge_all(v);
        }
        else {
                /* First VCPU tlb. */
                vcpu_purge_tr_entry(&PSCBX(v,dtlb));
                vcpu_purge_tr_entry(&PSCBX(v,itlb));
                smp_mb();

                /* Then VHPT. */
                if (HAS_PERVCPU_VHPT(v->domain))
                        vcpu_vhpt_flush(v);
                else
                        local_vhpt_flush();
                smp_mb();

                /* Then mTLB. */
                local_flush_tlb_all();
        }

        /* We could clear the bit in d->domain_dirty_cpumask only if domain d
           is not running on this processor.  There is currently no easy way
           to check this. */

        perfc_incr(vcpu_flush_vtlb_all);
}

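/* Wrapper with the signature expected by smp_call_function_single(). */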
static void __vcpu_flush_vtlb_all(void *vcpu)
{
        vcpu_flush_vtlb_all((struct vcpu*)vcpu);
}

// The caller must have incremented the reference count of d somehow.
void domain_flush_vtlb_all(struct domain* d)
{
        int cpu = smp_processor_id ();
        struct vcpu *v;

        for_each_vcpu(d, v) {
                if (!v->is_initialised)
                        continue;

                if (v->processor == cpu)
                        vcpu_flush_vtlb_all(v);
                else
                        // SMP: it is racy to reference v->processor.
                        // The vcpu scheduler may move this vcpu to another
                        // physical processor, and change the value
                        // using a plain store.
                        // We may be seeing the old value of it.
                        // In such a case, flush_vtlb_for_context_switch()
                        // takes care of the mTLB flush.
                        smp_call_function_single(v->processor,
                                                 __vcpu_flush_vtlb_all,
                                                 v, 1, 1);
        }
        perfc_incr(domain_flush_vtlb_all);
}

// Callers may need to call smp_mb() before/after calling this.
// Be careful.
static void
__flush_vhpt_range(unsigned long vhpt_maddr, u64 vadr, u64 addr_range)
{
        void *vhpt_base = __va(vhpt_maddr);
        u64 pgsz = 1L << current->arch.vhpt_pg_shift;
        u64 purge_addr = vadr & PAGE_MASK;

        addr_range += vadr - purge_addr;
        addr_range = PAGE_ALIGN(addr_range);
        while ((long)addr_range > 0) {
                /* Get the VHPT entry. */
                unsigned int off = ia64_thash(purge_addr) -
                        __va_ul(vcpu_vhpt_maddr(current));
                struct vhpt_lf_entry *v = vhpt_base + off;
                v->ti_tag = INVALID_TI_TAG;
                addr_range -= pgsz;
                purge_addr += pgsz;
        }
}

static void
cpu_flush_vhpt_range(int cpu, u64 vadr, u64 addr_range)
{
        __flush_vhpt_range(per_cpu(vhpt_paddr, cpu), vadr, addr_range);
}

static void
vcpu_flush_vhpt_range(struct vcpu* v, u64 vadr, u64 addr_range)
{
        __flush_vhpt_range(vcpu_vhpt_maddr(v), vadr, addr_range);
}

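/* Flush a virtual address range from the current vcpu's VHPT and from the
 * local machine TLB. */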
void vcpu_flush_tlb_vhpt_range (u64 vadr, u64 log_range)
{
        if (HAS_PERVCPU_VHPT(current->domain))
                vcpu_flush_vhpt_range(current, vadr, 1UL << log_range);
        else
                cpu_flush_vhpt_range(current->processor,
                                     vadr, 1UL << log_range);
        ia64_ptcl(vadr, log_range << 2);
        ia64_srlz_i();
        perfc_incr(vcpu_flush_tlb_vhpt_range);
}

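/* Flush a virtual address range from the VHPTs of all vcpus of d and from
 * the machine TLBs of all processors. */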
void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range)
{
        struct vcpu *v;

#if 0
        // this only seems to occur at shutdown, but it does occur
        if ((!addr_range) || addr_range & (addr_range - 1)) {
                printk("vhpt_flush_address: weird range, spinning...\n");
                while(1);
        }
#endif

        domain_purge_swtc_entries(d);
        smp_mb();

        for_each_vcpu (d, v) {
                if (!v->is_initialised)
                        continue;

                if (HAS_PERVCPU_VHPT(d)) {
                        vcpu_flush_vhpt_range(v, vadr, addr_range);
                } else {
                        // SMP: it is racy to reference v->processor.
                        // The vcpu scheduler may move this vcpu to another
                        // physical processor, and change the value
                        // using a plain store.
                        // We may be seeing the old value of it.
                        // In such a case, flush_vtlb_for_context_switch()
                        // takes care of the mTLB flush.
                        /* Invalidate VHPT entries. */
                        cpu_flush_vhpt_range(v->processor, vadr, addr_range);
                }
        }
        // ptc.ga has release semantics.

        /* ptc.ga */
        platform_global_tlb_purge(vadr, vadr + addr_range,
                                  current->arch.vhpt_pg_shift);
        perfc_incr(domain_flush_vtlb_range);
}

#ifdef CONFIG_XEN_IA64_TLB_TRACK
#include <asm/tlb_track.h>
#include <asm/vmx_vcpu.h>

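/* Flush the translation described by a tlb_track entry from the VHPTs and
 * TLBs that may still hold it, purging locally when possible. */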
void
__domain_flush_vtlb_track_entry(struct domain* d,
                                const struct tlb_track_entry* entry)
{
        unsigned long rr7_rid;
        int swap_rr0 = 0;
        unsigned long old_rid;
        unsigned long vaddr = entry->vaddr;
        struct vcpu* v;
        int cpu;
        int vcpu;
        int local_purge = 1;

        /* TLB insert tracking is done in PAGE_SIZE units. */
        unsigned char ps = max_t(unsigned char,
                                 current->arch.vhpt_pg_shift, PAGE_SHIFT);
        /* This case isn't supported (yet). */
        BUG_ON(current->arch.vhpt_pg_shift > PAGE_SHIFT);

        BUG_ON((vaddr >> VRN_SHIFT) != VRN7);
        /*
         * heuristic:
         * dom0linux accesses grant mapped pages via the kernel
         * straight mapped area and it doesn't change the rr7 rid.
         * So it is likely that rr7 == entry->rid, so that
         * we can avoid the rid change.
         * When blktap is supported, this heuristic should be revised.
         */
        vcpu_get_rr(current, VRN7 << VRN_SHIFT, &rr7_rid);
        if (likely(rr7_rid == entry->rid)) {
                perfc_incr(tlb_track_use_rr7);
        } else {
                swap_rr0 = 1;
                vaddr = (vaddr << 3) >> 3; // force vrn0
                perfc_incr(tlb_track_swap_rr0);
        }

        // tlb_track_entry_printf(entry);
        if (swap_rr0) {
                vcpu_get_rr(current, 0, &old_rid);
                vcpu_set_rr(current, 0, entry->rid);
        }

        if (HAS_PERVCPU_VHPT(d)) {
                for_each_vcpu_mask(vcpu, entry->vcpu_dirty_mask) {
                        v = d->vcpu[vcpu];
                        if (!v->is_initialised)
                                continue;

                        /* Invalidate VHPT entries. */
                        vcpu_flush_vhpt_range(v, vaddr, 1L << ps);

                        /*
                         * current->processor == v->processor
                         * is racy. we may see old v->processor and
                         * a new physical processor of v might see old
                         * vhpt entry and insert tlb.
                         */
                        if (v != current)
                                local_purge = 0;
                }
        } else {
                for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
                        /* Invalidate VHPT entries. */
                        cpu_flush_vhpt_range(cpu, vaddr, 1L << ps);

                        if (d->vcpu[cpu] != current)
                                local_purge = 0;
                }
        }

        /* ptc.ga */
        if (local_purge) {
                ia64_ptcl(vaddr, ps << 2);
                perfc_incr(domain_flush_vtlb_local);
        } else {
                /* ptc.ga has release semantics. */
                platform_global_tlb_purge(vaddr, vaddr + (1L << ps), ps);
                perfc_incr(domain_flush_vtlb_global);
        }

        if (swap_rr0) {
                vcpu_set_rr(current, 0, old_rid);
        }
        perfc_incr(domain_flush_vtlb_track_entry);
}

void
domain_flush_vtlb_track_entry(struct domain* d,
                              const struct tlb_track_entry* entry)
{
        domain_purge_swtc_entries_vcpu_dirty_mask(d, entry->vcpu_dirty_mask);
        smp_mb();

        __domain_flush_vtlb_track_entry(d, entry);
}
#endif

static void flush_tlb_vhpt_all (struct domain *d)
{
        /* First VHPT. */
        local_vhpt_flush ();

        /* Then mTLB. */
        local_flush_tlb_all ();
}

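/* Flush the machine TLB on every processor; when the domain uses the
 * global per-CPU VHPTs, flush those as well. */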
void domain_flush_tlb_vhpt(struct domain *d)
{
        /* Very heavy... */
        if (HAS_PERVCPU_VHPT(d) || is_hvm_domain(d))
                on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
        else
                on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
        cpus_clear (d->domain_dirty_cpumask);
}

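/* Flush the VHPT and the machine TLB of every processor in mask. */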
void flush_tlb_mask(cpumask_t mask)
{
        int cpu;

        cpu = smp_processor_id();
        if (cpu_isset (cpu, mask)) {
                cpu_clear(cpu, mask);
                flush_tlb_vhpt_all (NULL);
        }

        if (cpus_empty(mask))
                return;

        for_each_cpu_mask (cpu, mask)
                smp_call_function_single
                        (cpu, (void (*)(void *))flush_tlb_vhpt_all, NULL, 1, 1);
}

#ifdef PERF_COUNTERS
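/* Count the valid entries in each present CPU's VHPT for the perf
 * counters. */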
void gather_vhpt_stats(void)
{
        int i, cpu;

        perfc_set(vhpt_nbr_entries, VHPT_NUM_ENTRIES);

        for_each_present_cpu (cpu) {
                struct vhpt_lf_entry *v = __va(per_cpu(vhpt_paddr, cpu));
                unsigned long vhpt_valid = 0;

                for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++)
                        if (!(v->ti_tag & INVALID_TI_TAG))
                                vhpt_valid++;
                per_cpu(perfcounters, cpu)[PERFC_vhpt_valid_entries] = vhpt_valid;
        }
}
#endif