direct-io.hg

view xen/arch/ia64/xen/domain.c @ 14350:f3f5f2756d75

x86: Add VGCF_online flag to vcpu_guest_context.
Change common Xen code to start all VCPUs (except idle ones)
offline. Change arch code to deal with this.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Mon Mar 12 13:53:43 2007 +0000 (2007-03-12)
parents d907467f08cd
children 1584263f9fc5
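For context on the description above: a minimal, illustrative sketch of how an arch set-info path could honour the new flag now that common code creates all non-idle VCPUs offline. VGCF_online is taken from the changeset description, the helper name is hypothetical, and the flags/bit names are those used in the file below; the ia64 code here instead auto-onlines VCPU0 from arch_set_info_guest().

/* Sketch only -- not part of domain.c. */
static void example_apply_online_flag(struct vcpu *v,
                                      const vcpu_guest_context_t *ctxt)
{
    if (ctxt->flags & VGCF_online)
        clear_bit(_VCPUF_down, &v->vcpu_flags);   /* bring the VCPU online */
    else
        set_bit(_VCPUF_down, &v->vcpu_flags);     /* leave it offline */
}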
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
54 unsigned long dom0_size = 512*1024*1024;
56 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
57 static unsigned int dom0_max_vcpus = 1;
58 integer_param("dom0_max_vcpus", dom0_max_vcpus);
60 extern unsigned long running_on_sim;
62 extern char dom0_command_line[];
64 /* forward declaration */
65 static void init_switch_stack(struct vcpu *v);
67 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
68 This is a Xen virtual address. */
69 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
70 DEFINE_PER_CPU(int *, current_psr_ic_addr);
72 DEFINE_PER_CPU(struct vcpu *, fp_owner);
74 #include <xen/sched-if.h>
76 static void
77 ia64_disable_vhpt_walker(void)
78 {
79 // disable VHPT. ia64_new_rr7() might cause VHPT
80 // fault without this because it flushes dtr[IA64_TR_VHPT]
81 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
82 // Reserved Register/Field fault.
83 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
84 }
86 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
87 {
88 int cpu = smp_processor_id();
89 int last_vcpu_id, last_processor;
91 if (!is_idle_domain(prev->domain))
92 tlbflush_update_time
93 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
94 tlbflush_current_time());
96 if (is_idle_domain(next->domain))
97 return;
99 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
100 last_processor = next->arch.last_processor;
102 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
103 next->arch.last_processor = cpu;
105 if ((last_vcpu_id != next->vcpu_id &&
106 last_vcpu_id != INVALID_VCPU_ID) ||
107 (last_vcpu_id == next->vcpu_id &&
108 last_processor != cpu &&
109 last_processor != INVALID_PROCESSOR)) {
110 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
111 u32 last_tlbflush_timestamp =
112 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
113 #endif
114 int vhpt_is_flushed = 0;
116 // if the vTLB implementation is changed,
117 // the following must be updated as well.
118 if (VMX_DOMAIN(next)) {
119 // currently the vTLB for a VT-i domain is per vcpu,
120 // so no flushing is needed.
121 } else if (HAS_PERVCPU_VHPT(next->domain)) {
122 // nothing to do
123 } else {
124 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
125 last_tlbflush_timestamp)) {
126 local_vhpt_flush();
127 vhpt_is_flushed = 1;
128 }
129 }
130 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
131 last_tlbflush_timestamp)) {
132 local_flush_tlb_all();
133 perfc_incrc(tlbflush_clock_cswitch_purge);
134 } else {
135 perfc_incrc(tlbflush_clock_cswitch_skip);
136 }
137 perfc_incrc(flush_vtlb_for_context_switch);
138 }
139 }
141 static void flush_cache_for_context_switch(struct vcpu *next)
142 {
143 extern cpumask_t cpu_cache_coherent_map;
144 int cpu = smp_processor_id();
146 if (is_idle_vcpu(next) ||
147 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
148 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
149 unsigned long flags;
150 u64 progress = 0;
151 s64 status;
153 local_irq_save(flags);
154 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
155 local_irq_restore(flags);
156 if (status != 0)
157 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
158 "cache_type=4 status %lx", status);
159 }
160 }
161 }
163 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
164 {
165 /*
166 * Implement eager save, lazy restore
167 */
168 if (!is_idle_vcpu(prev)) {
169 if (VMX_DOMAIN(prev)) {
170 if (FP_PSR(prev) & IA64_PSR_MFH) {
171 __ia64_save_fpu(prev->arch._thread.fph);
172 __ia64_per_cpu_var(fp_owner) = prev;
173 }
174 } else {
175 if (PSCB(prev, hpsr_mfh)) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 }
180 }
182 if (!is_idle_vcpu(next)) {
183 if (VMX_DOMAIN(next)) {
184 FP_PSR(next) = IA64_PSR_DFH;
185 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
186 } else {
187 PSCB(next, hpsr_dfh) = 1;
188 PSCB(next, hpsr_mfh) = 0;
189 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
190 }
191 }
192 }
194 void schedule_tail(struct vcpu *prev)
195 {
196 extern char ia64_ivt;
198 context_saved(prev);
199 ia64_disable_vhpt_walker();
201 if (VMX_DOMAIN(current)) {
202 vmx_do_launch(current);
203 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
204 current->processor);
205 } else {
206 ia64_set_iva(&ia64_ivt);
207 load_region_regs(current);
208 ia64_set_pta(vcpu_pta(current));
209 vcpu_load_kernel_regs(current);
210 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
211 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
212 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
213 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
214 migrate_timer(&current->arch.hlt_timer, current->processor);
215 }
216 flush_vtlb_for_context_switch(prev, current);
217 }
219 void context_switch(struct vcpu *prev, struct vcpu *next)
220 {
221 uint64_t spsr;
223 local_irq_save(spsr);
225 if (VMX_DOMAIN(prev)) {
226 vmx_save_state(prev);
227 if (!VMX_DOMAIN(next)) {
228 /* VMX domains can change the physical cr.dcr.
229 * Restore default to prevent leakage. */
230 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
231 }
232 }
233 if (VMX_DOMAIN(next))
234 vmx_load_state(next);
236 ia64_disable_vhpt_walker();
237 lazy_fp_switch(prev, current);
239 prev = ia64_switch_to(next);
241 /* Note: ia64_switch_to does not return here at vcpu initialization. */
243 if (VMX_DOMAIN(current)) {
244 vmx_load_all_rr(current);
245 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
246 current->processor);
247 } else {
248 struct domain *nd;
249 extern char ia64_ivt;
251 ia64_set_iva(&ia64_ivt);
253 nd = current->domain;
254 if (!is_idle_domain(nd)) {
255 load_region_regs(current);
256 ia64_set_pta(vcpu_pta(current));
257 vcpu_load_kernel_regs(current);
258 vcpu_set_next_timer(current);
259 if (vcpu_timer_expired(current))
260 vcpu_pend_timer(current);
261 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
262 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
263 __ia64_per_cpu_var(current_psr_ic_addr) =
264 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
265 } else {
266 /* When switching to the idle domain, we only need to disable the vhpt
267 * walker. All accesses that happen within the idle context will then
268 * be handled by TR mappings and the identity mapping.
269 */
270 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
271 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
272 }
273 }
274 local_irq_restore(spsr);
276 /* lazy fp */
277 if (current->processor != current->arch.last_processor) {
278 unsigned long *addr;
279 addr = (unsigned long *)per_cpu_addr(fp_owner,
280 current->arch.last_processor);
281 ia64_cmpxchg(acq, addr, current, 0, 8);
282 }
284 flush_vtlb_for_context_switch(prev, current);
285 flush_cache_for_context_switch(current);
286 context_saved(prev);
287 }
289 void continue_running(struct vcpu *same)
290 {
291 /* nothing to do */
292 }
294 #ifdef CONFIG_PERFMON
295 static int pal_halt = 1;
296 static int can_do_pal_halt = 1;
298 static int __init nohalt_setup(char * str)
299 {
300 pal_halt = can_do_pal_halt = 0;
301 return 1;
302 }
303 __setup("nohalt", nohalt_setup);
305 void
306 update_pal_halt_status(int status)
307 {
308 can_do_pal_halt = pal_halt && status;
309 }
310 #else
311 #define can_do_pal_halt (1)
312 #endif
314 static void default_idle(void)
315 {
316 local_irq_disable();
317 if ( !softirq_pending(smp_processor_id()) ) {
318 if (can_do_pal_halt)
319 safe_halt();
320 else
321 cpu_relax();
322 }
323 local_irq_enable();
324 }
326 static void continue_cpu_idle_loop(void)
327 {
328 for ( ; ; )
329 {
330 #ifdef IA64
331 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
332 #else
333 irq_stat[cpu].idle_timestamp = jiffies;
334 #endif
335 while ( !softirq_pending(smp_processor_id()) )
336 default_idle();
337 raise_softirq(SCHEDULE_SOFTIRQ);
338 do_softirq();
339 }
340 }
342 void startup_cpu_idle_loop(void)
343 {
344 /* Just some sanity to ensure that the scheduler is set up okay. */
345 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
346 raise_softirq(SCHEDULE_SOFTIRQ);
348 continue_cpu_idle_loop();
349 }
351 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
352 * get_order_from_shift(XMAPPEDREGS_SHIFT))
353 */
354 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
355 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
356 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
357 #endif
359 void hlt_timer_fn(void *data)
360 {
361 struct vcpu *v = data;
362 vcpu_unblock(v);
363 }
365 void relinquish_vcpu_resources(struct vcpu *v)
366 {
367 if (HAS_PERVCPU_VHPT(v->domain))
368 pervcpu_vhpt_free(v);
369 if (v->arch.privregs != NULL) {
370 free_xenheap_pages(v->arch.privregs,
371 get_order_from_shift(XMAPPEDREGS_SHIFT));
372 v->arch.privregs = NULL;
373 }
374 kill_timer(&v->arch.hlt_timer);
375 }
377 struct vcpu *alloc_vcpu_struct(void)
378 {
379 struct vcpu *v;
380 struct thread_info *ti;
381 static int first_allocation = 1;
383 if (first_allocation) {
384 first_allocation = 0;
385 /* Keep idle vcpu0 statically allocated at compile time, because
386 * some code inherited from Linux still requires it in the early phase.
387 */
388 return idle_vcpu[0];
389 }
391 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
392 return NULL;
393 memset(v, 0, sizeof(*v));
395 ti = alloc_thread_info(v);
396 /* Clear thread_info to clear some important fields, like
397 * preempt_count
398 */
399 memset(ti, 0, sizeof(struct thread_info));
400 init_switch_stack(v);
402 return v;
403 }
405 void free_vcpu_struct(struct vcpu *v)
406 {
407 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
408 }
410 int vcpu_initialise(struct vcpu *v)
411 {
412 struct domain *d = v->domain;
414 if (!is_idle_domain(d)) {
415 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
416 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
417 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
418 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
420 /* Is this correct?
421 It depends on the domain's rid usage.
423 A domain may share rids among its processors (e.g. when it has a
424 global VHPT). In this case, we should also share rids
425 among vcpus and the rid range should be the same.
427 However, a domain may have per-cpu rid allocation. In
428 that case we don't want to share rids among vcpus, but we may
429 do so if two vcpus are on the same cpu... */
431 v->arch.starting_rid = d->arch.starting_rid;
432 v->arch.ending_rid = d->arch.ending_rid;
433 v->arch.breakimm = d->arch.breakimm;
434 v->arch.last_processor = INVALID_PROCESSOR;
435 }
437 if (!VMX_DOMAIN(v))
438 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
439 first_cpu(cpu_online_map));
441 return 0;
442 }
444 int vcpu_late_initialise(struct vcpu *v)
445 {
446 struct domain *d = v->domain;
447 int rc, order, i;
449 if (HAS_PERVCPU_VHPT(d)) {
450 rc = pervcpu_vhpt_alloc(v);
451 if (rc != 0)
452 return rc;
453 }
455 /* Create privregs page. */
456 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
457 v->arch.privregs = alloc_xenheap_pages(order);
458 BUG_ON(v->arch.privregs == NULL);
459 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
460 for (i = 0; i < (1 << order); i++)
461 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
462 d, XENSHARE_writable);
463 /*
464 * XXX IA64_XMAPPEDREGS_PADDR
465 * assign these pages into the guest pseudo-physical address
466 * space so that dom0 can map them by gmfn.
467 * this is necessary for domain save, restore and dump-core.
468 */
469 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
470 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
471 virt_to_maddr(v->arch.privregs + i));
473 return 0;
474 }
476 void vcpu_destroy(struct vcpu *v)
477 {
478 if (v->domain->arch.is_vti)
479 vmx_relinquish_vcpu_resources(v);
480 else
481 relinquish_vcpu_resources(v);
482 }
484 static void init_switch_stack(struct vcpu *v)
485 {
486 struct pt_regs *regs = vcpu_regs (v);
487 struct switch_stack *sw = (struct switch_stack *) regs - 1;
488 extern void ia64_ret_from_clone;
490 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
491 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
492 sw->b0 = (unsigned long) &ia64_ret_from_clone;
493 sw->ar_fpsr = FPSR_DEFAULT;
494 v->arch._thread.ksp = (unsigned long) sw - 16;
495 // stay on the kernel stack because we may get interrupts!
496 // ia64_ret_from_clone switches to the user stack
497 v->arch._thread.on_ustack = 0;
498 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
499 }
501 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
502 static int opt_pervcpu_vhpt = 1;
503 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
504 #endif
506 int arch_domain_create(struct domain *d)
507 {
508 int i;
510 // the following will eventually need to be negotiated dynamically
511 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
512 d->arch.breakimm = 0x1000;
513 for (i = 0; i < NR_CPUS; i++) {
514 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
515 }
517 if (is_idle_domain(d))
518 return 0;
520 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
521 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
522 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
523 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
524 #endif
525 if (tlb_track_create(d) < 0)
526 goto fail_nomem1;
527 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
528 if (d->shared_info == NULL)
529 goto fail_nomem;
530 memset(d->shared_info, 0, XSI_SIZE);
531 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
532 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
533 d, XENSHARE_writable);
535 /* We may also need an emulation rid for region4, though it's unlikely
536 * that a guest will issue uncacheable accesses in metaphysical mode.
537 * But keeping such info here may be saner.
538 */
539 if (!allocate_rid_range(d,0))
540 goto fail_nomem;
542 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
544 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
545 goto fail_nomem;
547 /*
548 * grant_table_create() can't fully initialize grant table for domain
549 * because it is called before arch_domain_create().
550 * Here we complete the initialization which requires p2m table.
551 */
552 spin_lock(&d->grant_table->lock);
553 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
554 ia64_gnttab_create_shared_page(d, d->grant_table, i);
555 spin_unlock(&d->grant_table->lock);
557 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
558 RANGESETF_prettyprint_hex);
560 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
561 return 0;
563 fail_nomem:
564 tlb_track_destroy(d);
565 fail_nomem1:
566 if (d->arch.mm.pgd != NULL)
567 pgd_free(d->arch.mm.pgd);
568 if (d->shared_info != NULL)
569 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
570 return -ENOMEM;
571 }
573 void arch_domain_destroy(struct domain *d)
574 {
575 mm_final_teardown(d);
577 if (d->shared_info != NULL)
578 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
580 tlb_track_destroy(d);
582 /* Clear vTLB for the next domain. */
583 domain_flush_tlb_vhpt(d);
585 deallocate_rid_range(d);
586 }
588 int arch_vcpu_reset(struct vcpu *v)
589 {
590 /* FIXME: Stub for now */
591 return 0;
592 }
594 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
595 {
596 int i;
597 struct vcpu_extra_regs *er = &c.nat->extra_regs;
599 c.nat->user_regs = *vcpu_regs(v);
600 c.nat->privregs_pfn = get_gpfn_from_mfn(virt_to_maddr(v->arch.privregs) >>
601 PAGE_SHIFT);
603 /* Fill extra regs. */
604 for (i = 0; i < 8; i++) {
605 er->itrs[i].pte = v->arch.itrs[i].pte.val;
606 er->itrs[i].itir = v->arch.itrs[i].itir;
607 er->itrs[i].vadr = v->arch.itrs[i].vadr;
608 er->itrs[i].rid = v->arch.itrs[i].rid;
609 }
610 for (i = 0; i < 8; i++) {
611 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
612 er->dtrs[i].itir = v->arch.dtrs[i].itir;
613 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
614 er->dtrs[i].rid = v->arch.dtrs[i].rid;
615 }
616 er->event_callback_ip = v->arch.event_callback_ip;
617 er->dcr = v->arch.privregs ? PSCB(v,dcr) : 0;
618 er->iva = v->arch.iva;
619 }
621 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
622 {
623 struct pt_regs *regs = vcpu_regs (v);
624 struct domain *d = v->domain;
625 int rc;
627 *regs = c.nat->user_regs;
629 if (!d->arch.is_vti) {
630 /* domain runs at PL2/3 */
631 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
632 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
633 }
635 if (c.nat->flags & VGCF_EXTRA_REGS) {
636 int i;
637 struct vcpu_extra_regs *er = &c.nat->extra_regs;
639 for (i = 0; i < 8; i++) {
640 vcpu_set_itr(v, i, er->itrs[i].pte,
641 er->itrs[i].itir,
642 er->itrs[i].vadr,
643 er->itrs[i].rid);
644 }
645 for (i = 0; i < 8; i++) {
646 vcpu_set_dtr(v, i,
647 er->dtrs[i].pte,
648 er->dtrs[i].itir,
649 er->dtrs[i].vadr,
650 er->dtrs[i].rid);
651 }
652 v->arch.event_callback_ip = er->event_callback_ip;
653 v->arch.iva = er->iva;
654 }
656 if (test_bit(_VCPUF_initialised, &v->vcpu_flags))
657 return 0;
659 if (d->arch.is_vti) {
660 rc = vmx_final_setup_guest(v);
661 if (rc != 0)
662 return rc;
663 } else {
664 rc = vcpu_late_initialise(v);
665 if (rc != 0)
666 return rc;
667 VCPU(v, interrupt_mask_addr) =
668 (unsigned char *) d->arch.shared_info_va +
669 INT_ENABLE_OFFSET(v);
670 }
672 /* This overrides some registers. */
673 vcpu_init_regs(v);
675 /* Don't redo final setup. Auto-online VCPU0. */
676 if (!test_and_set_bit(_VCPUF_initialised, &v->vcpu_flags) &&
677 (v->vcpu_id == 0))
678 clear_bit(_VCPUF_down, &v->vcpu_flags);
680 return 0;
681 }
683 static void relinquish_memory(struct domain *d, struct list_head *list)
684 {
685 struct list_head *ent;
686 struct page_info *page;
687 #ifndef __ia64__
688 unsigned long x, y;
689 #endif
691 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
692 spin_lock_recursive(&d->page_alloc_lock);
693 ent = list->next;
694 while ( ent != list )
695 {
696 page = list_entry(ent, struct page_info, list);
697 /* Grab a reference to the page so it won't disappear from under us. */
698 if ( unlikely(!get_page(page, d)) )
699 {
700 /* Couldn't get a reference -- someone is freeing this page. */
701 ent = ent->next;
702 continue;
703 }
705 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
706 put_page_and_type(page);
708 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
709 put_page(page);
711 #ifndef __ia64__
712 /*
713 * Forcibly invalidate base page tables at this point to break circular
714 * 'linear page table' references. This is okay because MMU structures
715 * are not shared across domains and this domain is now dead. Thus base
716 * tables are not in use so a non-zero count means circular reference.
717 */
718 y = page->u.inuse.type_info;
719 for ( ; ; )
720 {
721 x = y;
722 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
723 (PGT_base_page_table|PGT_validated)) )
724 break;
726 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
727 if ( likely(y == x) )
728 {
729 free_page_type(page, PGT_base_page_table);
730 break;
731 }
732 }
733 #endif
735 /* Follow the list chain and /then/ potentially free the page. */
736 ent = ent->next;
737 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
738 put_page(page);
739 }
741 spin_unlock_recursive(&d->page_alloc_lock);
742 }
744 void domain_relinquish_resources(struct domain *d)
745 {
746 /* Relinquish guest resources for VT-i domain. */
747 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
748 vmx_relinquish_guest_resources(d);
750 /* Tear down shadow mode stuff. */
751 mm_teardown(d);
753 /* Relinquish every page of memory. */
754 relinquish_memory(d, &d->xenpage_list);
755 relinquish_memory(d, &d->page_list);
757 if (d->arch.is_vti && d->arch.sal_data)
758 xfree(d->arch.sal_data);
760 /* Free page used by xen oprofile buffer */
761 free_xenoprof_pages(d);
762 }
764 unsigned long
765 domain_set_shared_info_va (unsigned long va)
766 {
767 struct vcpu *v = current;
768 struct domain *d = v->domain;
770 /* Check virtual address:
771 must belong to region 7,
772 must be 64Kb aligned,
773 must not be within Xen virtual space. */
774 if ((va >> 61) != 7
775 || (va & 0xffffUL) != 0
776 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
777 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
779 /* Note: this doesn't work well if other cpus are already running.
780 However this is part of the spec :-) */
781 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
782 d->arch.shared_info_va = va;
784 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
785 INT_ENABLE_OFFSET(v);
787 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
789 /* Remap the shared pages. */
790 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
792 return 0;
793 }
795 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
796 #define SHADOW_COPY_CHUNK 1024
798 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
799 {
800 unsigned int op = sc->op;
801 int rc = 0;
802 int i;
803 //struct vcpu *v;
805 if (unlikely(d == current->domain)) {
806 gdprintk(XENLOG_INFO,
807 "Don't try to do a shadow op on yourself!\n");
808 return -EINVAL;
809 }
811 domain_pause(d);
813 switch (op)
814 {
815 case XEN_DOMCTL_SHADOW_OP_OFF:
816 if (shadow_mode_enabled (d)) {
817 u64 *bm = d->arch.shadow_bitmap;
819 /* Flush vhpt and tlb to restore dirty bit usage. */
820 domain_flush_tlb_vhpt(d);
822 /* Free bitmap. */
823 d->arch.shadow_bitmap_size = 0;
824 d->arch.shadow_bitmap = NULL;
825 xfree(bm);
826 }
827 break;
829 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
830 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
831 rc = -EINVAL;
832 break;
834 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
835 if (shadow_mode_enabled(d)) {
836 rc = -EINVAL;
837 break;
838 }
840 atomic64_set(&d->arch.shadow_fault_count, 0);
841 atomic64_set(&d->arch.shadow_dirty_count, 0);
843 d->arch.shadow_bitmap_size =
844 ((d->arch.convmem_end >> PAGE_SHIFT) +
845 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
846 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
847 d->arch.shadow_bitmap_size / BITS_PER_LONG);
848 if (d->arch.shadow_bitmap == NULL) {
849 d->arch.shadow_bitmap_size = 0;
850 rc = -ENOMEM;
851 }
852 else {
853 memset(d->arch.shadow_bitmap, 0,
854 d->arch.shadow_bitmap_size / 8);
856 /* Flush vhpt and tlb to enable dirty bit
857 virtualization. */
858 domain_flush_tlb_vhpt(d);
859 }
860 break;
862 case XEN_DOMCTL_SHADOW_OP_CLEAN:
863 {
864 int nbr_bytes;
866 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
867 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
869 atomic64_set(&d->arch.shadow_fault_count, 0);
870 atomic64_set(&d->arch.shadow_dirty_count, 0);
872 if (guest_handle_is_null(sc->dirty_bitmap) ||
873 (d->arch.shadow_bitmap == NULL)) {
874 rc = -EINVAL;
875 break;
876 }
878 if (sc->pages > d->arch.shadow_bitmap_size)
879 sc->pages = d->arch.shadow_bitmap_size;
881 nbr_bytes = (sc->pages + 7) / 8;
883 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
884 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
885 SHADOW_COPY_CHUNK : nbr_bytes - i;
887 if (copy_to_guest_offset(
888 sc->dirty_bitmap, i,
889 (uint8_t *)d->arch.shadow_bitmap + i,
890 size)) {
891 rc = -EFAULT;
892 break;
893 }
895 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
896 }
898 break;
899 }
901 case XEN_DOMCTL_SHADOW_OP_PEEK:
902 {
903 unsigned long size;
905 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
906 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
908 if (guest_handle_is_null(sc->dirty_bitmap) ||
909 (d->arch.shadow_bitmap == NULL)) {
910 rc = -EINVAL;
911 break;
912 }
914 if (sc->pages > d->arch.shadow_bitmap_size)
915 sc->pages = d->arch.shadow_bitmap_size;
917 size = (sc->pages + 7) / 8;
918 if (copy_to_guest(sc->dirty_bitmap,
919 (uint8_t *)d->arch.shadow_bitmap, size)) {
920 rc = -EFAULT;
921 break;
922 }
923 break;
924 }
925 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
926 sc->mb = 0;
927 break;
928 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
929 if (sc->mb > 0) {
930 BUG();
931 rc = -ENOMEM;
932 }
933 break;
934 default:
935 rc = -EINVAL;
936 break;
937 }
939 domain_unpause(d);
941 return rc;
942 }
944 // remove following line if not privifying in memory
945 //#define HAVE_PRIVIFY_MEMORY
946 #ifndef HAVE_PRIVIFY_MEMORY
947 #define privify_memory(x,y) do {} while(0)
948 #endif
950 static void loaddomainelfimage(struct domain *d, struct elf_binary *elf)
951 {
952 const elf_phdr *phdr;
953 int phnum, h, filesz, memsz;
954 unsigned long elfaddr, dom_mpaddr, dom_imva;
955 struct page_info *p;
957 phnum = elf_uval(elf, elf->ehdr, e_phnum);
958 for (h = 0; h < phnum; h++) {
959 phdr = elf_phdr_by_index(elf, h);
960 if (!elf_phdr_is_loadable(elf, phdr))
961 continue;
963 filesz = elf_uval(elf, phdr, p_filesz);
964 memsz = elf_uval(elf, phdr, p_memsz);
965 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
966 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
968 while (memsz > 0) {
969 p = assign_new_domain_page(d,dom_mpaddr);
970 BUG_ON (unlikely(p == NULL));
971 dom_imva = __va_ul(page_to_maddr(p));
972 if (filesz > 0) {
973 if (filesz >= PAGE_SIZE)
974 memcpy((void *) dom_imva,
975 (void *) elfaddr,
976 PAGE_SIZE);
977 else {
978 // copy partial page
979 memcpy((void *) dom_imva,
980 (void *) elfaddr, filesz);
981 // zero the rest of page
982 memset((void *) dom_imva+filesz, 0,
983 PAGE_SIZE-filesz);
984 }
985 //FIXME: This test for code seems to find a lot more than objdump -x does
986 if (elf_uval(elf, phdr, p_flags) & PF_X) {
987 privify_memory(dom_imva,PAGE_SIZE);
988 flush_icache_range(dom_imva,
989 dom_imva+PAGE_SIZE);
990 }
991 }
992 else if (memsz > 0) {
993 /* always zero out entire page */
994 memset((void *) dom_imva, 0, PAGE_SIZE);
995 }
996 memsz -= PAGE_SIZE;
997 filesz -= PAGE_SIZE;
998 elfaddr += PAGE_SIZE;
999 dom_mpaddr += PAGE_SIZE;
1000 }
1001 }
1002 }
1004 void alloc_dom0(void)
1005 {
1006 /* Check dom0 size. */
1007 if (dom0_size < 4 * 1024 * 1024) {
1008 panic("dom0_mem is too small, boot aborted"
1009 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1012 if (running_on_sim) {
1013 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1016 /* no need to allocate pages for now
1017 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1018 */
1019 }
1022 /*
1023 * Domain 0 has direct access to all devices. However, the major
1024 * point of this stub is to allow alloc_dom_mem to handle
1025 * order > 0 requests. Dom0 requires that bit set to
1026 * allocate memory for other domains.
1027 */
1028 static void physdev_init_dom0(struct domain *d)
1029 {
1030 if (iomem_permit_access(d, 0UL, ~0UL))
1031 BUG();
1032 if (irqs_permit_access(d, 0, NR_IRQS-1))
1033 BUG();
1034 if (ioports_permit_access(d, 0, 0xffff))
1035 BUG();
1036 }
1038 int construct_dom0(struct domain *d,
1039 unsigned long image_start, unsigned long image_len,
1040 unsigned long initrd_start, unsigned long initrd_len,
1041 char *cmdline)
1042 {
1043 int i, rc;
1044 start_info_t *si;
1045 dom0_vga_console_info_t *ci;
1046 struct vcpu *v = d->vcpu[0];
1047 unsigned long max_pages;
1049 struct elf_binary elf;
1050 struct elf_dom_parms parms;
1051 unsigned long p_start;
1052 unsigned long pkern_start;
1053 unsigned long pkern_entry;
1054 unsigned long pkern_end;
1055 unsigned long pinitrd_start = 0;
1056 unsigned long pstart_info;
1057 struct page_info *start_info_page;
1058 unsigned long bp_mpa;
1059 struct ia64_boot_param *bp;
1061 //printk("construct_dom0: starting\n");
1063 /* Sanity! */
1064 BUG_ON(d != dom0);
1065 BUG_ON(d->vcpu[0] == NULL);
1066 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
1068 printk("*** LOADING DOMAIN 0 ***\n");
1070 max_pages = dom0_size / PAGE_SIZE;
1071 d->max_pages = max_pages;
1072 d->tot_pages = 0;
1074 rc = elf_init(&elf, (void*)image_start, image_len);
1075 if ( rc != 0 )
1076 return rc;
1077 #ifdef VERBOSE
1078 elf_set_verbose(&elf);
1079 #endif
1080 elf_parse_binary(&elf);
1081 if (0 != (elf_xen_parse(&elf, &parms)))
1082 return rc;
1084 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1085 elf_64bit(&elf) ? "64-bit" : "32-bit",
1086 elf_msb(&elf) ? "msb" : "lsb",
1087 elf.pstart, elf.pend);
1088 if (!elf_64bit(&elf) ||
1089 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1090 printk("Incompatible kernel binary\n");
1091 return -1;
1092 }
1094 p_start = parms.virt_base;
1095 pkern_start = parms.virt_kstart;
1096 pkern_end = parms.virt_kend;
1097 pkern_entry = parms.virt_entry;
1099 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1101 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1102 {
1103 printk("Initial guest OS must load to a page boundary.\n");
1104 return -EINVAL;
1105 }
1107 pstart_info = PAGE_ALIGN(pkern_end);
1108 if(initrd_start && initrd_len){
1109 unsigned long offset;
1111 /* The next page aligned boundary after the start info.
1112 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1113 pinitrd_start = pstart_info + PAGE_SIZE;
1114 if (pinitrd_start + initrd_len >= dom0_size)
1115 panic("%s: not enough memory assigned to dom0", __func__);
1116 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1117 struct page_info *p;
1118 p = assign_new_domain_page(d, pinitrd_start + offset);
1119 if (p == NULL)
1120 panic("%s: can't allocate page for initrd image", __func__);
1121 if (initrd_len < offset + PAGE_SIZE)
1122 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1123 initrd_len - offset);
1124 else
1125 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1126 }
1127 }
1129 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1130 " Kernel image: %lx->%lx\n"
1131 " Entry address: %lx\n"
1132 " Init. ramdisk: %lx len %lx\n"
1133 " Start info.: %lx->%lx\n",
1134 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1135 pstart_info, pstart_info + PAGE_SIZE);
1137 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1138 {
1139 printk("Initial guest OS requires too much space\n"
1140 "(%luMB is greater than %luMB limit)\n",
1141 (pkern_end-pkern_start)>>20,
1142 (max_pages <<PAGE_SHIFT)>>20);
1143 return -ENOMEM;
1144 }
1146 // if high 3 bits of pkern start are non-zero, error
1148 // if pkern end is after end of metaphysical memory, error
1149 // (we should be able to deal with this... later)
1151 /* Mask all upcalls... */
1152 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1153 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1155 if (dom0_max_vcpus == 0)
1156 dom0_max_vcpus = MAX_VIRT_CPUS;
1157 if (dom0_max_vcpus > num_online_cpus())
1158 dom0_max_vcpus = num_online_cpus();
1159 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1160 dom0_max_vcpus = MAX_VIRT_CPUS;
1162 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1163 for ( i = 1; i < dom0_max_vcpus; i++ )
1164 if (alloc_vcpu(d, i, i) == NULL)
1165 panic("Cannot allocate dom0 vcpu %d\n", i);
1167 /* Copy the OS image. */
1168 loaddomainelfimage(d,&elf);
1170 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1171 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1173 /* Set up start info area. */
1174 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1175 start_info_page = assign_new_domain_page(d, pstart_info);
1176 if (start_info_page == NULL)
1177 panic("can't allocate start info page");
1178 si = page_to_virt(start_info_page);
1179 memset(si, 0, PAGE_SIZE);
1180 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1181 xen_major_version(), xen_minor_version());
1182 si->nr_pages = max_pages;
1183 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1185 printk("Dom0: 0x%lx\n", (u64)dom0);
1187 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1188 clear_bit(_VCPUF_down, &v->vcpu_flags);
1190 /* Build firmware.
1191 Note: the Linux kernel reserves the memory used by start_info, so there
1192 is no need to remove it from the MDT. */
1193 bp_mpa = pstart_info + sizeof(struct start_info);
1194 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1196 /* Fill boot param. */
1197 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1199 bp = (struct ia64_boot_param *)((unsigned char *)si +
1200 sizeof(start_info_t));
1201 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1203 /* We assume console has reached the last line! */
1204 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1205 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1206 bp->console_info.orig_x = 0;
1207 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1208 0 : bp->console_info.num_rows - 1;
1210 bp->initrd_start = pinitrd_start;
1211 bp->initrd_size = ia64_boot_param->initrd_size;
1213 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1214 sizeof(start_info_t) +
1215 sizeof(struct ia64_boot_param));
1217 if (fill_console_start_info(ci)) {
1218 si->console.dom0.info_off = sizeof(start_info_t) +
1219 sizeof(struct ia64_boot_param);
1220 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1221 }
1223 vcpu_init_regs (v);
1225 vcpu_regs(v)->r28 = bp_mpa;
1227 vcpu_regs (v)->cr_iip = pkern_entry;
1229 physdev_init_dom0(d);
1231 return 0;
1232 }
1234 void machine_restart(char * __unused)
1235 {
1236 console_start_sync();
1237 if (running_on_sim)
1238 printk ("machine_restart called. spinning...\n");
1239 else
1240 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1241 while(1);
1242 }
1244 extern void cpu_halt(void);
1246 void machine_halt(void)
1247 {
1248 console_start_sync();
1249 if (running_on_sim)
1250 printk ("machine_halt called. spinning...\n");
1251 else
1252 cpu_halt();
1253 while(1);
1254 }
1256 void sync_vcpu_execstate(struct vcpu *v)
1257 {
1258 // __ia64_save_fpu(v->arch._thread.fph);
1259 // if (VMX_DOMAIN(v))
1260 // vmx_save_state(v);
1261 // FIXME SMP: Anything else needed here for SMP?
1262 }
1264 static void parse_dom0_mem(char *s)
1265 {
1266 dom0_size = parse_size_and_unit(s, NULL);
1267 }
1268 custom_param("dom0_mem", parse_dom0_mem);