ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 13905:2b3dd681dbce

[IA64] Fix I&D cache incoherency after vcpu migration

Windows on HVM occasionally crashes with a BSOD, especially at boot time.
I finally found that the cause is PAL_CACHE_FLUSH (cache_type=4).
cache_type is an argument of PAL_CACHE_FLUSH; cache_type=4 makes the
local instruction caches coherent with the data caches.
See SDM vol. 2, 11.10.3, PAL_CACHE_FLUSH.
FYI, Linux never uses cache_type=4.
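
For reference, a minimal sketch of the call in question, using the
ia64_pal_cache_flush() wrapper exactly as it is invoked in
flush_cache_for_context_switch() below (the surrounding error handling
here is illustrative only):

    u64 progress = 0;
    s64 status;

    /* cache_type=4: make the local I-cache coherent with the D-cache;
     * the remaining arguments are as used in
     * flush_cache_for_context_switch() below. */
    status = ia64_pal_cache_flush(4, 0, &progress, NULL);
    if (status != 0)
        printk("PAL_CACHE_FLUSH failed, status %lx\n", status);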

Currently PAL_CACHE_FLUSH is called on the local cpu only, so the caches
on the other cpus remain incoherent.

The attached patch does the following:
- When cache_type=1,2,3 (which flush caches on the local cpu), the
caches on the other cpus are now flushed as well.
This may be overkill and inefficient, but I think it is acceptable
since these cache types are seldom used.

- When cache_type=4, the actual PAL call on the other cpus is deferred
until a vcpu migration occurs or the cpu becomes idle, because Windows
uses cache_type=4 quite often and many vcpus in an SMP environment call
PAL_CACHE_FLUSH simultaneously (see the sketch below).
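
Below is a minimal sketch of the deferral idea, mirroring the
cache_coherent_map handling in flush_cache_for_context_switch() in this
file; the helper name mark_deferred_cache_flush() is illustrative and is
not the patch's actual PAL_CACHE_FLUSH handler:

    /* Called when a vcpu issues PAL_CACHE_FLUSH with cache_type=4.
     * Mark every other physical cpu as still needing a flush; the
     * actual PAL call on those cpus is performed later, when this vcpu
     * is next scheduled there or when the cpu becomes idle. */
    static void mark_deferred_cache_flush(struct vcpu *v)
    {
        int cpu;

        for_each_online_cpu(cpu)
            if (cpu != smp_processor_id()) {
                set_bit(cpu, &v->arch.cache_coherent_map);
                cpu_set(cpu, cpu_cache_coherent_map);
            }
    }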

Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com>
author awilliam@xenbuild2.aw
date Thu Feb 15 10:25:33 2007 -0700 (2007-02-15)
parents 8bdbe88e422f
children 3fa7489f87bb
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
54 unsigned long dom0_size = 512*1024*1024;
56 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
57 static unsigned int dom0_max_vcpus = 1;
58 integer_param("dom0_max_vcpus", dom0_max_vcpus);
60 extern unsigned long running_on_sim;
62 extern char dom0_command_line[];
64 /* forward declaration */
65 static void init_switch_stack(struct vcpu *v);
67 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
68 This is a Xen virtual address. */
69 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
70 DEFINE_PER_CPU(int *, current_psr_ic_addr);
72 DEFINE_PER_CPU(struct vcpu *, fp_owner);
74 #include <xen/sched-if.h>
76 static void
77 ia64_disable_vhpt_walker(void)
78 {
79 // disable VHPT. ia64_new_rr7() might cause VHPT
80 // fault without this because it flushes dtr[IA64_TR_VHPT]
81 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
82 // Reserved Register/Field fault.
83 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
84 }
86 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
87 {
88 int cpu = smp_processor_id();
89 int last_vcpu_id, last_processor;
91 if (!is_idle_domain(prev->domain))
92 tlbflush_update_time
93 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
94 tlbflush_current_time());
96 if (is_idle_domain(next->domain))
97 return;
99 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
100 last_processor = next->arch.last_processor;
102 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
103 next->arch.last_processor = cpu;
105 if ((last_vcpu_id != next->vcpu_id &&
106 last_vcpu_id != INVALID_VCPU_ID) ||
107 (last_vcpu_id == next->vcpu_id &&
108 last_processor != cpu &&
109 last_processor != INVALID_PROCESSOR)) {
110 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
111 u32 last_tlbflush_timestamp =
112 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
113 #endif
114 int vhpt_is_flushed = 0;
116 // if the vTLB implementation is changed,
117 // the following must be updated as well.
118 if (VMX_DOMAIN(next)) {
119 // currently the vTLB for a VT-i domain is per vcpu,
120 // so no flushing is needed.
121 } else if (HAS_PERVCPU_VHPT(next->domain)) {
122 // nothing to do
123 } else {
124 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
125 last_tlbflush_timestamp)) {
126 local_vhpt_flush();
127 vhpt_is_flushed = 1;
128 }
129 }
130 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
131 last_tlbflush_timestamp)) {
132 local_flush_tlb_all();
133 perfc_incrc(tlbflush_clock_cswitch_purge);
134 } else {
135 perfc_incrc(tlbflush_clock_cswitch_skip);
136 }
137 perfc_incrc(flush_vtlb_for_context_switch);
138 }
139 }
141 static void flush_cache_for_context_switch(struct vcpu *next)
142 {
143 extern cpumask_t cpu_cache_coherent_map;
144 int cpu = smp_processor_id();
146 if (is_idle_vcpu(next) ||
147 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
148 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
149 unsigned long flags;
150 u64 progress = 0;
151 s64 status;
153 local_irq_save(flags);
154 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
155 local_irq_restore(flags);
156 if (status != 0)
157 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
158 "cache_type=4 status %lx", status);
159 }
160 }
161 }
163 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
164 {
165 /*
166 * Implement eager save, lazy restore
167 */
168 if (!is_idle_vcpu(prev)) {
169 if (VMX_DOMAIN(prev)) {
170 if (FP_PSR(prev) & IA64_PSR_MFH) {
171 __ia64_save_fpu(prev->arch._thread.fph);
172 __ia64_per_cpu_var(fp_owner) = prev;
173 }
174 } else {
175 if (PSCB(prev, hpsr_mfh)) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 }
180 }
182 if (!is_idle_vcpu(next)) {
183 if (VMX_DOMAIN(next)) {
184 FP_PSR(next) = IA64_PSR_DFH;
185 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
186 } else {
187 PSCB(next, hpsr_dfh) = 1;
188 PSCB(next, hpsr_mfh) = 0;
189 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
190 }
191 }
192 }
194 void schedule_tail(struct vcpu *prev)
195 {
196 extern char ia64_ivt;
198 context_saved(prev);
199 ia64_disable_vhpt_walker();
201 if (VMX_DOMAIN(current)) {
202 vmx_do_launch(current);
203 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
204 current->processor);
205 } else {
206 ia64_set_iva(&ia64_ivt);
207 load_region_regs(current);
208 ia64_set_pta(vcpu_pta(current));
209 vcpu_load_kernel_regs(current);
210 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
211 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
212 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
213 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
214 migrate_timer(&current->arch.hlt_timer, current->processor);
215 }
216 flush_vtlb_for_context_switch(prev, current);
217 }
219 void context_switch(struct vcpu *prev, struct vcpu *next)
220 {
221 uint64_t spsr;
223 local_irq_save(spsr);
225 if (VMX_DOMAIN(prev)) {
226 vmx_save_state(prev);
227 if (!VMX_DOMAIN(next)) {
228 /* VMX domains can change the physical cr.dcr.
229 * Restore default to prevent leakage. */
230 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
231 }
232 }
233 if (VMX_DOMAIN(next))
234 vmx_load_state(next);
236 ia64_disable_vhpt_walker();
237 lazy_fp_switch(prev, current);
239 prev = ia64_switch_to(next);
241 /* Note: ia64_switch_to does not return here at vcpu initialization. */
243 if (VMX_DOMAIN(current)) {
244 vmx_load_all_rr(current);
245 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
246 current->processor);
247 } else {
248 struct domain *nd;
249 extern char ia64_ivt;
251 ia64_set_iva(&ia64_ivt);
253 nd = current->domain;
254 if (!is_idle_domain(nd)) {
255 load_region_regs(current);
256 ia64_set_pta(vcpu_pta(current));
257 vcpu_load_kernel_regs(current);
258 vcpu_set_next_timer(current);
259 if (vcpu_timer_expired(current))
260 vcpu_pend_timer(current);
261 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
262 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
263 __ia64_per_cpu_var(current_psr_ic_addr) =
264 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
265 } else {
266 /* When switching to the idle domain, we only need to disable the vhpt
267 * walker. All accesses that happen within the idle context will then
268 * be handled by TR mapping and identity mapping.
269 */
270 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
271 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
272 }
273 }
274 local_irq_restore(spsr);
276 /* lazy fp */
277 if (current->processor != current->arch.last_processor) {
278 unsigned long *addr;
279 addr = (unsigned long *)per_cpu_addr(fp_owner,
280 current->arch.last_processor);
281 ia64_cmpxchg(acq, addr, current, 0, 8);
282 }
284 flush_vtlb_for_context_switch(prev, current);
285 flush_cache_for_context_switch(current);
286 context_saved(prev);
287 }
289 void continue_running(struct vcpu *same)
290 {
291 /* nothing to do */
292 }
294 #ifdef CONFIG_PERFMON
295 static int pal_halt = 1;
296 static int can_do_pal_halt = 1;
298 static int __init nohalt_setup(char * str)
299 {
300 pal_halt = can_do_pal_halt = 0;
301 return 1;
302 }
303 __setup("nohalt", nohalt_setup);
305 void
306 update_pal_halt_status(int status)
307 {
308 can_do_pal_halt = pal_halt && status;
309 }
310 #else
311 #define can_do_pal_halt (1)
312 #endif
314 static void default_idle(void)
315 {
316 local_irq_disable();
317 if ( !softirq_pending(smp_processor_id()) ) {
318 if (can_do_pal_halt)
319 safe_halt();
320 else
321 cpu_relax();
322 }
323 local_irq_enable();
324 }
326 static void continue_cpu_idle_loop(void)
327 {
328 for ( ; ; )
329 {
330 #ifdef IA64
331 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
332 #else
333 irq_stat[cpu].idle_timestamp = jiffies;
334 #endif
335 while ( !softirq_pending(smp_processor_id()) )
336 default_idle();
337 raise_softirq(SCHEDULE_SOFTIRQ);
338 do_softirq();
339 }
340 }
342 void startup_cpu_idle_loop(void)
343 {
344 /* Just some sanity to ensure that the scheduler is set up okay. */
345 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
346 raise_softirq(SCHEDULE_SOFTIRQ);
348 continue_cpu_idle_loop();
349 }
351 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
352 * get_order_from_shift(XMAPPEDREGS_SHIFT))
353 */
354 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
355 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
356 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
357 #endif
359 void hlt_timer_fn(void *data)
360 {
361 struct vcpu *v = data;
362 vcpu_unblock(v);
363 }
365 void relinquish_vcpu_resources(struct vcpu *v)
366 {
367 if (HAS_PERVCPU_VHPT(v->domain))
368 pervcpu_vhpt_free(v);
369 if (v->arch.privregs != NULL) {
370 free_xenheap_pages(v->arch.privregs,
371 get_order_from_shift(XMAPPEDREGS_SHIFT));
372 v->arch.privregs = NULL;
373 }
374 kill_timer(&v->arch.hlt_timer);
375 }
377 struct vcpu *alloc_vcpu_struct(void)
378 {
379 struct vcpu *v;
380 struct thread_info *ti;
381 static int first_allocation = 1;
383 if (first_allocation) {
384 first_allocation = 0;
385 /* Still keep idle vcpu0 statically allocated at compile time,
386 * because some code from Linux still requires it in the early phase.
387 */
388 return idle_vcpu[0];
389 }
391 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
392 return NULL;
393 memset(v, 0, sizeof(*v));
395 ti = alloc_thread_info(v);
396 /* Clear thread_info to clear some important fields, like
397 * preempt_count
398 */
399 memset(ti, 0, sizeof(struct thread_info));
400 init_switch_stack(v);
402 return v;
403 }
405 void free_vcpu_struct(struct vcpu *v)
406 {
407 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
408 }
410 int vcpu_initialise(struct vcpu *v)
411 {
412 struct domain *d = v->domain;
414 if (!is_idle_domain(d)) {
415 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
416 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
417 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
418 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
420 /* Is it correct ?
421 It depends on the domain rid usage.
423 A domain may share rid among its processors (eg having a
424 global VHPT). In this case, we should also share rid
425 among vcpus and the rid range should be the same.
427 However a domain may have per cpu rid allocation. In
428 this case we don't want to share rid among vcpus, but we may
429 do it if two vcpus are on the same cpu... */
431 v->arch.starting_rid = d->arch.starting_rid;
432 v->arch.ending_rid = d->arch.ending_rid;
433 v->arch.breakimm = d->arch.breakimm;
434 v->arch.last_processor = INVALID_PROCESSOR;
435 }
437 if (!VMX_DOMAIN(v))
438 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
439 first_cpu(cpu_online_map));
441 return 0;
442 }
444 int vcpu_late_initialise(struct vcpu *v)
445 {
446 struct domain *d = v->domain;
447 int rc, order, i;
449 if (HAS_PERVCPU_VHPT(d)) {
450 rc = pervcpu_vhpt_alloc(v);
451 if (rc != 0)
452 return rc;
453 }
455 /* Create privregs page. */
456 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
457 v->arch.privregs = alloc_xenheap_pages(order);
458 BUG_ON(v->arch.privregs == NULL);
459 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
460 for (i = 0; i < (1 << order); i++)
461 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
462 d, XENSHARE_writable);
463 /*
464 * XXX IA64_XMAPPEDREGS_PADDR
465 * Assign these pages to the guest pseudo-physical address
466 * space so that dom0 can map them by gmfn.
467 * This is necessary for domain save, restore and dump-core.
468 */
469 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
470 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
471 virt_to_maddr(v->arch.privregs + i));
473 return 0;
474 }
476 void vcpu_destroy(struct vcpu *v)
477 {
478 if (v->domain->arch.is_vti)
479 vmx_relinquish_vcpu_resources(v);
480 else
481 relinquish_vcpu_resources(v);
482 }
484 static void init_switch_stack(struct vcpu *v)
485 {
486 struct pt_regs *regs = vcpu_regs (v);
487 struct switch_stack *sw = (struct switch_stack *) regs - 1;
488 extern void ia64_ret_from_clone;
490 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
491 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
492 sw->b0 = (unsigned long) &ia64_ret_from_clone;
493 sw->ar_fpsr = FPSR_DEFAULT;
494 v->arch._thread.ksp = (unsigned long) sw - 16;
495 // stay on the kernel stack because we may get interrupts!
496 // ia64_ret_from_clone switches to user stack
497 v->arch._thread.on_ustack = 0;
498 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
499 }
501 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
502 static int opt_pervcpu_vhpt = 1;
503 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
504 #endif
506 int arch_domain_create(struct domain *d)
507 {
508 int i;
510 // the following will eventually need to be negotiated dynamically
511 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
512 d->arch.breakimm = 0x1000;
513 for (i = 0; i < NR_CPUS; i++) {
514 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
515 }
517 if (is_idle_domain(d))
518 return 0;
520 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
521 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
522 dprintk(XENLOG_WARNING, "%s:%d domain %d pervcpu_vhpt %d\n",
523 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
524 #endif
525 if (tlb_track_create(d) < 0)
526 goto fail_nomem1;
527 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
528 if (d->shared_info == NULL)
529 goto fail_nomem;
530 memset(d->shared_info, 0, XSI_SIZE);
531 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
532 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
533 d, XENSHARE_writable);
535 /* We may also need an emulation rid for region4, though it's unlikely
536 * to see a guest issue uncacheable accesses in metaphysical mode. But
537 * keeping such info here may be saner.
538 */
539 if (!allocate_rid_range(d,0))
540 goto fail_nomem;
542 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
544 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
545 goto fail_nomem;
547 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
548 RANGESETF_prettyprint_hex);
550 printk ("arch_domain_create: domain=%p\n", d);
551 return 0;
553 fail_nomem:
554 tlb_track_destroy(d);
555 fail_nomem1:
556 if (d->arch.mm.pgd != NULL)
557 pgd_free(d->arch.mm.pgd);
558 if (d->shared_info != NULL)
559 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
560 return -ENOMEM;
561 }
563 void arch_domain_destroy(struct domain *d)
564 {
565 mm_final_teardown(d);
567 if (d->shared_info != NULL)
568 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
570 tlb_track_destroy(d);
572 /* Clear vTLB for the next domain. */
573 domain_flush_tlb_vhpt(d);
575 deallocate_rid_range(d);
576 }
578 int arch_vcpu_reset(struct vcpu *v)
579 {
580 /* FIXME: Stub for now */
581 return 0;
582 }
584 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
585 {
586 int i;
587 struct vcpu_extra_regs *er = &c.nat->extra_regs;
589 c.nat->user_regs = *vcpu_regs(v);
590 c.nat->privregs_pfn = get_gpfn_from_mfn(virt_to_maddr(v->arch.privregs) >>
591 PAGE_SHIFT);
593 /* Fill extra regs. */
594 for (i = 0; i < 8; i++) {
595 er->itrs[i].pte = v->arch.itrs[i].pte.val;
596 er->itrs[i].itir = v->arch.itrs[i].itir;
597 er->itrs[i].vadr = v->arch.itrs[i].vadr;
598 er->itrs[i].rid = v->arch.itrs[i].rid;
599 }
600 for (i = 0; i < 8; i++) {
601 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
602 er->dtrs[i].itir = v->arch.dtrs[i].itir;
603 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
604 er->dtrs[i].rid = v->arch.dtrs[i].rid;
605 }
606 er->event_callback_ip = v->arch.event_callback_ip;
607 er->dcr = v->arch.privregs ? PSCB(v,dcr) : 0;
608 er->iva = v->arch.iva;
609 }
611 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
612 {
613 struct pt_regs *regs = vcpu_regs (v);
614 struct domain *d = v->domain;
615 int rc;
617 *regs = c.nat->user_regs;
619 if (!d->arch.is_vti) {
620 /* domain runs at PL2/3 */
621 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
622 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
623 }
625 if (c.nat->flags & VGCF_EXTRA_REGS) {
626 int i;
627 struct vcpu_extra_regs *er = &c.nat->extra_regs;
629 for (i = 0; i < 8; i++) {
630 vcpu_set_itr(v, i, er->itrs[i].pte,
631 er->itrs[i].itir,
632 er->itrs[i].vadr,
633 er->itrs[i].rid);
634 }
635 for (i = 0; i < 8; i++) {
636 vcpu_set_dtr(v, i,
637 er->dtrs[i].pte,
638 er->dtrs[i].itir,
639 er->dtrs[i].vadr,
640 er->dtrs[i].rid);
641 }
642 v->arch.event_callback_ip = er->event_callback_ip;
643 v->arch.iva = er->iva;
644 }
646 if (test_bit(_VCPUF_initialised, &v->vcpu_flags))
647 return 0;
649 if (d->arch.is_vti) {
650 rc = vmx_final_setup_guest(v);
651 if (rc != 0)
652 return rc;
653 } else {
654 rc = vcpu_late_initialise(v);
655 if (rc != 0)
656 return rc;
657 VCPU(v, interrupt_mask_addr) =
658 (unsigned char *) d->arch.shared_info_va +
659 INT_ENABLE_OFFSET(v);
660 }
662 /* This overrides some registers. */
663 vcpu_init_regs(v);
665 /* Don't redo final setup */
666 set_bit(_VCPUF_initialised, &v->vcpu_flags);
667 return 0;
668 }
670 static void relinquish_memory(struct domain *d, struct list_head *list)
671 {
672 struct list_head *ent;
673 struct page_info *page;
674 #ifndef __ia64__
675 unsigned long x, y;
676 #endif
678 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
679 spin_lock_recursive(&d->page_alloc_lock);
680 ent = list->next;
681 while ( ent != list )
682 {
683 page = list_entry(ent, struct page_info, list);
684 /* Grab a reference to the page so it won't disappear from under us. */
685 if ( unlikely(!get_page(page, d)) )
686 {
687 /* Couldn't get a reference -- someone is freeing this page. */
688 ent = ent->next;
689 continue;
690 }
692 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
693 put_page_and_type(page);
695 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
696 put_page(page);
698 #ifndef __ia64__
699 /*
700 * Forcibly invalidate base page tables at this point to break circular
701 * 'linear page table' references. This is okay because MMU structures
702 * are not shared across domains and this domain is now dead. Thus base
703 * tables are not in use so a non-zero count means circular reference.
704 */
705 y = page->u.inuse.type_info;
706 for ( ; ; )
707 {
708 x = y;
709 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
710 (PGT_base_page_table|PGT_validated)) )
711 break;
713 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
714 if ( likely(y == x) )
715 {
716 free_page_type(page, PGT_base_page_table);
717 break;
718 }
719 }
720 #endif
722 /* Follow the list chain and /then/ potentially free the page. */
723 ent = ent->next;
724 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
725 put_page(page);
726 }
728 spin_unlock_recursive(&d->page_alloc_lock);
729 }
731 void domain_relinquish_resources(struct domain *d)
732 {
733 /* Relinquish guest resources for VT-i domain. */
734 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
735 vmx_relinquish_guest_resources(d);
737 /* Tear down shadow mode stuff. */
738 mm_teardown(d);
740 /* Relinquish every page of memory. */
741 relinquish_memory(d, &d->xenpage_list);
742 relinquish_memory(d, &d->page_list);
744 if (d->arch.is_vti && d->arch.sal_data)
745 xfree(d->arch.sal_data);
747 /* Free page used by xen oprofile buffer */
748 free_xenoprof_pages(d);
749 }
751 unsigned long
752 domain_set_shared_info_va (unsigned long va)
753 {
754 struct vcpu *v = current;
755 struct domain *d = v->domain;
757 /* Check virtual address:
758 must belong to region 7,
759 must be 64Kb aligned,
760 must not be within Xen virtual space. */
761 if ((va >> 61) != 7
762 || (va & 0xffffUL) != 0
763 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
764 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
766 /* Note: this doesn't work well if other cpus are already running.
767 However this is part of the spec :-) */
768 printk ("Domain set shared_info_va to 0x%016lx\n", va);
769 d->arch.shared_info_va = va;
771 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
772 INT_ENABLE_OFFSET(v);
774 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
776 /* Remap the shared pages. */
777 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
779 return 0;
780 }
782 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
783 #define SHADOW_COPY_CHUNK 1024
785 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
786 {
787 unsigned int op = sc->op;
788 int rc = 0;
789 int i;
790 //struct vcpu *v;
792 if (unlikely(d == current->domain)) {
793 gdprintk(XENLOG_INFO,
794 "Don't try to do a shadow op on yourself!\n");
795 return -EINVAL;
796 }
798 domain_pause(d);
800 switch (op)
801 {
802 case XEN_DOMCTL_SHADOW_OP_OFF:
803 if (shadow_mode_enabled (d)) {
804 u64 *bm = d->arch.shadow_bitmap;
806 /* Flush vhpt and tlb to restore dirty bit usage. */
807 domain_flush_tlb_vhpt(d);
809 /* Free bitmap. */
810 d->arch.shadow_bitmap_size = 0;
811 d->arch.shadow_bitmap = NULL;
812 xfree(bm);
813 }
814 break;
816 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
817 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
818 rc = -EINVAL;
819 break;
821 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
822 if (shadow_mode_enabled(d)) {
823 rc = -EINVAL;
824 break;
825 }
827 atomic64_set(&d->arch.shadow_fault_count, 0);
828 atomic64_set(&d->arch.shadow_dirty_count, 0);
830 d->arch.shadow_bitmap_size =
831 ((d->arch.convmem_end >> PAGE_SHIFT) +
832 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
833 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
834 d->arch.shadow_bitmap_size / BITS_PER_LONG);
835 if (d->arch.shadow_bitmap == NULL) {
836 d->arch.shadow_bitmap_size = 0;
837 rc = -ENOMEM;
838 }
839 else {
840 memset(d->arch.shadow_bitmap, 0,
841 d->arch.shadow_bitmap_size / 8);
843 /* Flush vhpt and tlb to enable dirty bit
844 virtualization. */
845 domain_flush_tlb_vhpt(d);
846 }
847 break;
849 case XEN_DOMCTL_SHADOW_OP_CLEAN:
850 {
851 int nbr_bytes;
853 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
854 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
856 atomic64_set(&d->arch.shadow_fault_count, 0);
857 atomic64_set(&d->arch.shadow_dirty_count, 0);
859 if (guest_handle_is_null(sc->dirty_bitmap) ||
860 (d->arch.shadow_bitmap == NULL)) {
861 rc = -EINVAL;
862 break;
863 }
865 if (sc->pages > d->arch.shadow_bitmap_size)
866 sc->pages = d->arch.shadow_bitmap_size;
868 nbr_bytes = (sc->pages + 7) / 8;
870 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
871 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
872 SHADOW_COPY_CHUNK : nbr_bytes - i;
874 if (copy_to_guest_offset(
875 sc->dirty_bitmap, i,
876 (uint8_t *)d->arch.shadow_bitmap + i,
877 size)) {
878 rc = -EFAULT;
879 break;
880 }
882 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
883 }
885 break;
886 }
888 case XEN_DOMCTL_SHADOW_OP_PEEK:
889 {
890 unsigned long size;
892 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
893 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
895 if (guest_handle_is_null(sc->dirty_bitmap) ||
896 (d->arch.shadow_bitmap == NULL)) {
897 rc = -EINVAL;
898 break;
899 }
901 if (sc->pages > d->arch.shadow_bitmap_size)
902 sc->pages = d->arch.shadow_bitmap_size;
904 size = (sc->pages + 7) / 8;
905 if (copy_to_guest(sc->dirty_bitmap,
906 (uint8_t *)d->arch.shadow_bitmap, size)) {
907 rc = -EFAULT;
908 break;
909 }
910 break;
911 }
912 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
913 sc->mb = 0;
914 break;
915 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
916 if (sc->mb > 0) {
917 BUG();
918 rc = -ENOMEM;
919 }
920 break;
921 default:
922 rc = -EINVAL;
923 break;
924 }
926 domain_unpause(d);
928 return rc;
929 }
931 // remove following line if not privifying in memory
932 //#define HAVE_PRIVIFY_MEMORY
933 #ifndef HAVE_PRIVIFY_MEMORY
934 #define privify_memory(x,y) do {} while(0)
935 #endif
937 static void loaddomainelfimage(struct domain *d, struct elf_binary *elf)
938 {
939 const elf_phdr *phdr;
940 int phnum, h, filesz, memsz;
941 unsigned long elfaddr, dom_mpaddr, dom_imva;
942 struct page_info *p;
944 phnum = elf_uval(elf, elf->ehdr, e_phnum);
945 for (h = 0; h < phnum; h++) {
946 phdr = elf_phdr_by_index(elf, h);
947 if (!elf_phdr_is_loadable(elf, phdr))
948 continue;
950 filesz = elf_uval(elf, phdr, p_filesz);
951 memsz = elf_uval(elf, phdr, p_memsz);
952 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
953 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
955 while (memsz > 0) {
956 p = assign_new_domain_page(d,dom_mpaddr);
957 BUG_ON (unlikely(p == NULL));
958 dom_imva = __va_ul(page_to_maddr(p));
959 if (filesz > 0) {
960 if (filesz >= PAGE_SIZE)
961 memcpy((void *) dom_imva,
962 (void *) elfaddr,
963 PAGE_SIZE);
964 else {
965 // copy partial page
966 memcpy((void *) dom_imva,
967 (void *) elfaddr, filesz);
968 // zero the rest of page
969 memset((void *) dom_imva+filesz, 0,
970 PAGE_SIZE-filesz);
971 }
972 //FIXME: This test for code seems to find a lot more than objdump -x does
973 if (elf_uval(elf, phdr, p_flags) & PF_X) {
974 privify_memory(dom_imva,PAGE_SIZE);
975 flush_icache_range(dom_imva,
976 dom_imva+PAGE_SIZE);
977 }
978 }
979 else if (memsz > 0) {
980 /* always zero out entire page */
981 memset((void *) dom_imva, 0, PAGE_SIZE);
982 }
983 memsz -= PAGE_SIZE;
984 filesz -= PAGE_SIZE;
985 elfaddr += PAGE_SIZE;
986 dom_mpaddr += PAGE_SIZE;
987 }
988 }
989 }
991 void alloc_dom0(void)
992 {
992 {
993 /* Check dom0 size. */
994 if (dom0_size < 4 * 1024 * 1024) {
995 panic("dom0_mem is too small, boot aborted"
996 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
997 }
999 if (running_on_sim) {
1000 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1001 }
1003 /* no need to allocate pages for now
1004 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1005 */
1006 }
1009 /*
1010 * Domain 0 has direct access to all devices. However, the major
1011 * point of this stub is to allow alloc_dom_mem to be handled
1012 * with order > 0 requests. Dom0 requires that bit set to
1013 * allocate memory for other domains.
1014 */
1015 static void physdev_init_dom0(struct domain *d)
1016 {
1017 if (iomem_permit_access(d, 0UL, ~0UL))
1018 BUG();
1019 if (irqs_permit_access(d, 0, NR_IRQS-1))
1020 BUG();
1021 if (ioports_permit_access(d, 0, 0xffff))
1022 BUG();
1023 }
1025 int construct_dom0(struct domain *d,
1026 unsigned long image_start, unsigned long image_len,
1027 unsigned long initrd_start, unsigned long initrd_len,
1028 char *cmdline)
1029 {
1030 int i, rc;
1031 start_info_t *si;
1032 dom0_vga_console_info_t *ci;
1033 struct vcpu *v = d->vcpu[0];
1034 unsigned long max_pages;
1036 struct elf_binary elf;
1037 struct elf_dom_parms parms;
1038 unsigned long p_start;
1039 unsigned long pkern_start;
1040 unsigned long pkern_entry;
1041 unsigned long pkern_end;
1042 unsigned long pinitrd_start = 0;
1043 unsigned long pstart_info;
1044 struct page_info *start_info_page;
1045 unsigned long bp_mpa;
1046 struct ia64_boot_param *bp;
1048 //printk("construct_dom0: starting\n");
1050 /* Sanity! */
1051 BUG_ON(d != dom0);
1052 BUG_ON(d->vcpu[0] == NULL);
1053 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
1055 printk("*** LOADING DOMAIN 0 ***\n");
1057 max_pages = dom0_size / PAGE_SIZE;
1058 d->max_pages = max_pages;
1059 d->tot_pages = 0;
1061 rc = elf_init(&elf, (void*)image_start, image_len);
1062 if ( rc != 0 )
1063 return rc;
1064 #ifdef VERBOSE
1065 elf_set_verbose(&elf);
1066 #endif
1067 elf_parse_binary(&elf);
1068 if (0 != (elf_xen_parse(&elf, &parms)))
1069 return rc;
1071 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1072 elf_64bit(&elf) ? "64-bit" : "32-bit",
1073 elf_msb(&elf) ? "msb" : "lsb",
1074 elf.pstart, elf.pend);
1075 if (!elf_64bit(&elf) ||
1076 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1077 printk("Incompatible kernel binary\n");
1078 return -1;
1079 }
1081 p_start = parms.virt_base;
1082 pkern_start = parms.virt_kstart;
1083 pkern_end = parms.virt_kend;
1084 pkern_entry = parms.virt_entry;
1086 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1088 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1089 {
1090 printk("Initial guest OS must load to a page boundary.\n");
1091 return -EINVAL;
1092 }
1094 pstart_info = PAGE_ALIGN(pkern_end);
1095 if(initrd_start && initrd_len){
1096 unsigned long offset;
1098 /* The next page aligned boundary after the start info.
1099 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1100 pinitrd_start = pstart_info + PAGE_SIZE;
1101 if (pinitrd_start + initrd_len >= dom0_size)
1102 panic("%s: not enough memory assigned to dom0", __func__);
1103 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1104 struct page_info *p;
1105 p = assign_new_domain_page(d, pinitrd_start + offset);
1106 if (p == NULL)
1107 panic("%s: can't allocate page for initrd image", __func__);
1108 if (initrd_len < offset + PAGE_SIZE)
1109 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1110 initrd_len - offset);
1111 else
1112 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1113 }
1114 }
1116 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1117 " Kernel image: %lx->%lx\n"
1118 " Entry address: %lx\n"
1119 " Init. ramdisk: %lx len %lx\n"
1120 " Start info.: %lx->%lx\n",
1121 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1122 pstart_info, pstart_info + PAGE_SIZE);
1124 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1125 {
1126 printk("Initial guest OS requires too much space\n"
1127 "(%luMB is greater than %luMB limit)\n",
1128 (pkern_end-pkern_start)>>20,
1129 (max_pages <<PAGE_SHIFT)>>20);
1130 return -ENOMEM;
1131 }
1133 // if high 3 bits of pkern start are non-zero, error
1135 // if pkern end is after end of metaphysical memory, error
1136 // (we should be able to deal with this... later)
1138 /* Mask all upcalls... */
1139 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1140 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1142 if (dom0_max_vcpus == 0)
1143 dom0_max_vcpus = MAX_VIRT_CPUS;
1144 if (dom0_max_vcpus > num_online_cpus())
1145 dom0_max_vcpus = num_online_cpus();
1146 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1147 dom0_max_vcpus = MAX_VIRT_CPUS;
1149 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1150 for ( i = 1; i < dom0_max_vcpus; i++ )
1151 if (alloc_vcpu(d, i, i) == NULL)
1152 panic("Cannot allocate dom0 vcpu %d\n", i);
1154 /* Copy the OS image. */
1155 loaddomainelfimage(d,&elf);
1157 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1158 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1160 /* Set up start info area. */
1161 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1162 start_info_page = assign_new_domain_page(d, pstart_info);
1163 if (start_info_page == NULL)
1164 panic("can't allocate start info page");
1165 si = page_to_virt(start_info_page);
1166 memset(si, 0, PAGE_SIZE);
1167 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1168 xen_major_version(), xen_minor_version());
1169 si->nr_pages = max_pages;
1170 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1172 printk("Dom0: 0x%lx\n", (u64)dom0);
1174 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1176 /* Build firmware.
1177 Note: the Linux kernel reserves the memory used by start_info, so there is
1178 no need to remove it from MDT. */
1179 bp_mpa = pstart_info + sizeof(struct start_info);
1180 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1182 /* Fill boot param. */
1183 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1185 bp = (struct ia64_boot_param *)((unsigned char *)si +
1186 sizeof(start_info_t));
1187 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1189 /* We assume console has reached the last line! */
1190 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1191 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1192 bp->console_info.orig_x = 0;
1193 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1194 0 : bp->console_info.num_rows - 1;
1196 bp->initrd_start = pinitrd_start;
1197 bp->initrd_size = ia64_boot_param->initrd_size;
1199 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1200 sizeof(start_info_t) +
1201 sizeof(struct ia64_boot_param));
1203 if (fill_console_start_info(ci)) {
1204 si->console.dom0.info_off = sizeof(start_info_t) +
1205 sizeof(struct ia64_boot_param);
1206 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1207 }
1209 vcpu_init_regs (v);
1211 vcpu_regs(v)->r28 = bp_mpa;
1213 vcpu_regs (v)->cr_iip = pkern_entry;
1215 physdev_init_dom0(d);
1217 return 0;
1218 }
1220 void machine_restart(char * __unused)
1221 {
1222 console_start_sync();
1223 if (running_on_sim)
1224 printk ("machine_restart called. spinning...\n");
1225 else
1226 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1227 while(1);
1228 }
1230 extern void cpu_halt(void);
1232 void machine_halt(void)
1233 {
1234 console_start_sync();
1235 if (running_on_sim)
1236 printk ("machine_halt called. spinning...\n");
1237 else
1238 cpu_halt();
1239 while(1);
1240 }
1242 void sync_vcpu_execstate(struct vcpu *v)
1243 {
1244 // __ia64_save_fpu(v->arch._thread.fph);
1245 // if (VMX_DOMAIN(v))
1246 // vmx_save_state(v);
1247 // FIXME SMP: Anything else needed here for SMP?
1248 }
1250 static void parse_dom0_mem(char *s)
1251 {
1252 dom0_size = parse_size_and_unit(s, NULL);
1253 }
1254 custom_param("dom0_mem", parse_dom0_mem);