ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 10786:86e5d8458c08

[IA64] live migration

Shadow mode and live migration.

Virtualize Dirty bit.

Signed-off-by: Tristan Gingold <tristan.gingold@bull.net>
author awilliam@xenbuild.aw
date Wed Jul 26 09:36:36 2006 -0600 (2006-07-26)
parents b2abc70be89e
children 7be1cfe8345b
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/compile.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/arch-ia64.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <public/arch-ia64.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <asm/privop_stat.h>
52 #ifndef CONFIG_XEN_IA64_DOM0_VP
53 #define CONFIG_DOMAIN0_CONTIGUOUS
54 #endif
55 unsigned long dom0_start = -1L;
56 unsigned long dom0_size = 512*1024*1024;
57 unsigned long dom0_align = 64*1024*1024;
59 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
60 static unsigned int dom0_max_vcpus = 1;
61 integer_param("dom0_max_vcpus", dom0_max_vcpus);
63 extern unsigned long running_on_sim;
65 extern char dom0_command_line[];
67 /* FIXME: where should these declarations live? */
68 extern void serial_input_init(void);
69 static void init_switch_stack(struct vcpu *v);
70 extern void vmx_do_launch(struct vcpu *);
72 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
73 extern struct vcpu *ia64_switch_to (struct vcpu *next_task);
75 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
76 This is a Xen virtual address. */
77 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
78 DEFINE_PER_CPU(int *, current_psr_ic_addr);
80 #include <xen/sched-if.h>
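/* Flush stale guest translations on a context switch: if this physical CPU
 * last ran a different vcpu, or this vcpu last ran on a different physical
 * CPU, the VHPT (for non-VT-i domains) and the local TLB are flushed. */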
82 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
83 {
84 int cpu = smp_processor_id();
85 int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
86 int last_processor = vcpu->arch.last_processor;
88 if (is_idle_domain(vcpu->domain))
89 return;
91 vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
92 vcpu->arch.last_processor = cpu;
94 if ((last_vcpu_id != vcpu->vcpu_id &&
95 last_vcpu_id != INVALID_VCPU_ID) ||
96 (last_vcpu_id == vcpu->vcpu_id &&
97 last_processor != cpu &&
98 last_processor != INVALID_PROCESSOR)) {
100 // If the vTLB implementation is changed,
101 // the following must be updated as well.
102 if (VMX_DOMAIN(vcpu)) {
103 // Currently the vTLB for a VT-i domain is per-vcpu,
104 // so no flushing is needed.
105 } else {
106 vhpt_flush();
107 }
108 local_flush_tlb_all();
109 }
110 }
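/* Completion of a context switch for a newly resumed vcpu: VT-i domains are
 * launched via vmx_do_launch(); otherwise the IVA, PTA, region registers and
 * the per-CPU pointers to the guest's psr.i/psr.ic shadows are reloaded. */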
112 void schedule_tail(struct vcpu *prev)
113 {
114 extern char ia64_ivt;
115 context_saved(prev);
117 if (VMX_DOMAIN(current)) {
118 vmx_do_launch(current);
119 } else {
120 ia64_set_iva(&ia64_ivt);
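/* The PTA value below encodes, per the Itanium PTA register layout, the VHPT
 * base (VHPT_ADDR), the long-format bit (bit 8), the VHPT size field
 * (VHPT_SIZE_LOG2 in bits 2-7) and the walker enable bit (VHPT_ENABLED). */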
121 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
122 VHPT_ENABLED);
123 load_region_regs(current);
124 vcpu_load_kernel_regs(current);
125 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
126 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
127 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
128 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
129 }
130 flush_vtlb_for_context_switch(current);
131 }
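/* Full context switch between vcpus: save/restore the FPU high partition and
 * any VT-i state, switch stacks via ia64_switch_to(), then reload the MMU
 * context (IVA, PTA, region registers) for the new vcpu, or simply disable
 * the VHPT walker when switching to the idle domain. */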
133 void context_switch(struct vcpu *prev, struct vcpu *next)
134 {
135 uint64_t spsr;
136 uint64_t pta;
138 local_irq_save(spsr);
139 context_switch_count++;
141 __ia64_save_fpu(prev->arch._thread.fph);
142 __ia64_load_fpu(next->arch._thread.fph);
143 if (VMX_DOMAIN(prev))
144 vmx_save_state(prev);
145 if (VMX_DOMAIN(next))
146 vmx_load_state(next);
147 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
148 prev = ia64_switch_to(next);
150 /* Note: ia64_switch_to does not return here at vcpu initialization. */
152 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
154 // leave this debug for now: it acts as a heartbeat when more than
155 // one domain is active
156 {
157 static long cnt[16] = { 50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50};
158 static int i = 100;
159 int id = ((struct vcpu *)current)->domain->domain_id & 0xf;
160 if (!cnt[id]--) { cnt[id] = 500000; printk("%x",id); }
161 if (!i--) { i = 1000000; printk("+"); }
162 }
164 if (VMX_DOMAIN(current)){
165 vmx_load_all_rr(current);
166 } else {
167 struct domain *nd;
168 extern char ia64_ivt;
170 ia64_set_iva(&ia64_ivt);
172 nd = current->domain;
173 if (!is_idle_domain(nd)) {
174 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
175 VHPT_ENABLED);
176 load_region_regs(current);
177 vcpu_load_kernel_regs(current);
178 vcpu_set_next_timer(current);
179 if (vcpu_timer_expired(current))
180 vcpu_pend_timer(current);
181 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
182 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
183 __ia64_per_cpu_var(current_psr_ic_addr) =
184 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
185 } else {
186 /* When switching to the idle domain, we only need to disable the
187 * VHPT walker. All accesses that happen within the idle context
188 * will then be handled by TR mappings and the identity mapping.
189 */
190 pta = ia64_get_pta();
191 ia64_set_pta(pta & ~VHPT_ENABLED);
192 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
193 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
194 }
195 }
196 flush_vtlb_for_context_switch(current);
197 local_irq_restore(spsr);
198 context_saved(prev);
199 }
201 void continue_running(struct vcpu *same)
202 {
203 /* nothing to do */
204 }
206 static void default_idle(void)
207 {
208 local_irq_disable();
209 if ( !softirq_pending(smp_processor_id()) )
210 safe_halt();
211 local_irq_enable();
212 }
214 static void continue_cpu_idle_loop(void)
215 {
216 for ( ; ; )
217 {
218 #ifdef IA64
219 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
220 #else
221 irq_stat[cpu].idle_timestamp = jiffies;
222 #endif
223 while ( !softirq_pending(smp_processor_id()) )
224 default_idle();
225 raise_softirq(SCHEDULE_SOFTIRQ);
226 do_softirq();
227 }
228 }
230 void startup_cpu_idle_loop(void)
231 {
232 /* Just some sanity to ensure that the scheduler is set up okay. */
233 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
234 raise_softirq(SCHEDULE_SOFTIRQ);
236 continue_cpu_idle_loop();
237 }
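/* Allocate a vcpu together with its kernel stack (a single xenheap block of
 * KERNEL_STACK_SIZE_ORDER pages). For non-VT-i domains a privregs page is
 * also allocated and shared writable with the guest. */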
239 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
240 {
241 struct vcpu *v;
242 struct thread_info *ti;
244 /* Keep idle vcpu0 statically allocated at compile time, because some
245 * code inherited from Linux still requires it in the early boot phase.
246 */
247 if (is_idle_domain(d) && !vcpu_id)
248 v = idle_vcpu[0];
249 else {
250 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
251 return NULL;
252 memset(v, 0, sizeof(*v));
254 ti = alloc_thread_info(v);
255 /* Clear thread_info to clear some important fields, like
256 * preempt_count
257 */
258 memset(ti, 0, sizeof(struct thread_info));
259 init_switch_stack(v);
260 }
262 if (!is_idle_domain(d)) {
263 if (!d->arch.is_vti) {
264 /* Create privregs page only if not VTi. */
265 v->arch.privregs =
266 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
267 BUG_ON(v->arch.privregs == NULL);
268 memset(v->arch.privregs, 0, PAGE_SIZE);
269 share_xen_page_with_guest(virt_to_page(v->arch.privregs),
270 d, XENSHARE_writable);
271 }
273 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
274 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
275 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
276 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
278 /* Is this correct?
279 It depends on how the domain uses RIDs.
281 A domain may share RIDs among its processors (e.g. when using a
282 global VHPT). In that case we should also share RIDs among vcpus
283 and the RID range should be the same.
285 However, a domain may use per-cpu RID allocation. In that case we
286 don't want to share RIDs among vcpus, but we may do so if two
287 vcpus run on the same cpu... */
289 v->arch.starting_rid = d->arch.starting_rid;
290 v->arch.ending_rid = d->arch.ending_rid;
291 v->arch.breakimm = d->arch.breakimm;
292 v->arch.last_processor = INVALID_PROCESSOR;
293 }
295 return v;
296 }
298 void free_vcpu_struct(struct vcpu *v)
299 {
300 if (VMX_DOMAIN(v))
301 vmx_relinquish_vcpu_resources(v);
302 else {
303 if (v->arch.privregs != NULL)
304 free_xenheap_pages(v->arch.privregs,
305 get_order_from_shift(XMAPPEDREGS_SHIFT));
306 }
308 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
309 }
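/* Build the initial switch_stack/pt_regs frame at the top of the vcpu's
 * stack so that the first ia64_switch_to() to this vcpu "returns" into
 * ia64_ret_from_clone. */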
311 static void init_switch_stack(struct vcpu *v)
312 {
313 struct pt_regs *regs = vcpu_regs (v);
314 struct switch_stack *sw = (struct switch_stack *) regs - 1;
315 extern void ia64_ret_from_clone;
317 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
318 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
319 sw->b0 = (unsigned long) &ia64_ret_from_clone;
320 sw->ar_fpsr = FPSR_DEFAULT;
321 v->arch._thread.ksp = (unsigned long) sw - 16;
322 // stay on kernel stack because may get interrupts!
323 // ia64_ret_from_clone switches to user stack
324 v->arch._thread.on_ustack = 0;
325 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
326 }
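/* Architecture-specific domain creation: allocate and share the XSI
 * (shared_info) pages with the guest, reserve a RID range and allocate the
 * domain's pgd. The idle domain needs none of these. */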
328 int arch_domain_create(struct domain *d)
329 {
330 int i;
332 // the following will eventually need to be negotiated dynamically
333 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
334 d->arch.breakimm = 0x1000;
335 for (i = 0; i < NR_CPUS; i++) {
336 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
337 }
339 if (is_idle_domain(d))
340 return 0;
342 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
343 if (d->shared_info == NULL)
344 goto fail_nomem;
345 memset(d->shared_info, 0, XSI_SIZE);
346 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
347 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
348 d, XENSHARE_writable);
350 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
351 /* We may also need an emulation RID for region 4, though it is
352 * unlikely that a guest will issue uncacheable accesses in
353 * metaphysical mode. Still, keeping that information here seems saner.
354 */
355 if (!allocate_rid_range(d,0))
356 goto fail_nomem;
358 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
360 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
361 goto fail_nomem;
363 printf ("arch_domain_create: domain=%p\n", d);
364 return 0;
366 fail_nomem:
367 if (d->arch.mm.pgd != NULL)
368 pgd_free(d->arch.mm.pgd);
369 if (d->shared_info != NULL)
370 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
371 return -ENOMEM;
372 }
374 void arch_domain_destroy(struct domain *d)
375 {
376 BUG_ON(d->arch.mm.pgd != NULL);
377 if (d->shared_info != NULL)
378 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
379 if (d->arch.shadow_bitmap != NULL)
380 xfree(d->arch.shadow_bitmap);
382 /* Clear vTLB for the next domain. */
383 domain_flush_tlb_vhpt(d);
385 deallocate_rid_range(d);
386 }
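/* Export a vcpu's context (user registers plus the eight ITRs/DTRs, event
 * callback IP, DCR and IVA) to the toolstack; arch_set_info_guest() below is
 * the matching restore path. */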
388 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
389 {
390 int i;
391 struct vcpu_extra_regs *er = &c->extra_regs;
393 c->user_regs = *vcpu_regs (v);
394 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
396 /* Fill extra regs. */
397 for (i = 0; i < 8; i++) {
398 er->itrs[i].pte = v->arch.itrs[i].pte.val;
399 er->itrs[i].itir = v->arch.itrs[i].itir;
400 er->itrs[i].vadr = v->arch.itrs[i].vadr;
401 er->itrs[i].rid = v->arch.itrs[i].rid;
402 }
403 for (i = 0; i < 8; i++) {
404 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
405 er->dtrs[i].itir = v->arch.dtrs[i].itir;
406 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
407 er->dtrs[i].rid = v->arch.dtrs[i].rid;
408 }
409 er->event_callback_ip = v->arch.event_callback_ip;
410 er->dcr = v->arch.dcr;
411 er->iva = v->arch.iva;
412 }
414 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
415 {
416 struct pt_regs *regs = vcpu_regs (v);
417 struct domain *d = v->domain;
419 *regs = c->user_regs;
421 if (!d->arch.is_vti) {
422 /* domain runs at PL2/3 */
423 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
424 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
425 }
427 if (c->flags & VGCF_EXTRA_REGS) {
428 int i;
429 struct vcpu_extra_regs *er = &c->extra_regs;
431 for (i = 0; i < 8; i++) {
432 vcpu_set_itr(v, i, er->itrs[i].pte,
433 er->itrs[i].itir,
434 er->itrs[i].vadr,
435 er->itrs[i].rid);
436 }
437 for (i = 0; i < 8; i++) {
438 vcpu_set_dtr(v, i,
439 er->dtrs[i].pte,
440 er->dtrs[i].itir,
441 er->dtrs[i].vadr,
442 er->dtrs[i].rid);
443 }
444 v->arch.event_callback_ip = er->event_callback_ip;
445 v->arch.dcr = er->dcr;
446 v->arch.iva = er->iva;
447 }
449 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
450 return 0;
451 if (d->arch.is_vti)
452 vmx_final_setup_guest(v);
454 /* This overrides some registers. */
455 vcpu_init_regs(v);
457 /* Don't redo final setup */
458 set_bit(_VCPUF_initialised, &v->vcpu_flags);
459 return 0;
460 }
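/* Drop the domain's references on every page in the given list; the x86-only
 * section that breaks circular page-table references is compiled out on
 * ia64. */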
462 static void relinquish_memory(struct domain *d, struct list_head *list)
463 {
464 struct list_head *ent;
465 struct page_info *page;
466 #ifndef __ia64__
467 unsigned long x, y;
468 #endif
470 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
471 spin_lock_recursive(&d->page_alloc_lock);
472 ent = list->next;
473 while ( ent != list )
474 {
475 page = list_entry(ent, struct page_info, list);
476 /* Grab a reference to the page so it won't disappear from under us. */
477 if ( unlikely(!get_page(page, d)) )
478 {
479 /* Couldn't get a reference -- someone is freeing this page. */
480 ent = ent->next;
481 continue;
482 }
484 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
485 put_page_and_type(page);
487 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
488 put_page(page);
490 #ifndef __ia64__
491 /*
492 * Forcibly invalidate base page tables at this point to break circular
493 * 'linear page table' references. This is okay because MMU structures
494 * are not shared across domains and this domain is now dead. Thus base
495 * tables are not in use so a non-zero count means circular reference.
496 */
497 y = page->u.inuse.type_info;
498 for ( ; ; )
499 {
500 x = y;
501 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
502 (PGT_base_page_table|PGT_validated)) )
503 break;
505 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
506 if ( likely(y == x) )
507 {
508 free_page_type(page, PGT_base_page_table);
509 break;
510 }
511 }
512 #endif
514 /* Follow the list chain and /then/ potentially free the page. */
515 ent = ent->next;
516 #ifdef CONFIG_XEN_IA64_DOM0_VP
517 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
518 #endif
519 put_page(page);
520 }
522 spin_unlock_recursive(&d->page_alloc_lock);
523 }
525 void domain_relinquish_resources(struct domain *d)
526 {
527 /* Relinquish every page of memory. */
529 // Release pages by traversing d->arch.mm.
530 relinquish_mm(d);
532 relinquish_memory(d, &d->xenpage_list);
533 relinquish_memory(d, &d->page_list);
535 if (d->arch.is_vti && d->arch.sal_data)
536 xfree(d->arch.sal_data);
537 }
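/* Build a 1:1 physmap for the domain by walking d->page_list and assigning
 * consecutive guest-physical frames to the machine pages in list order. */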
539 void build_physmap_table(struct domain *d)
540 {
541 struct list_head *list_ent = d->page_list.next;
542 unsigned long mfn, i = 0;
544 while(list_ent != &d->page_list) {
545 mfn = page_to_mfn(list_entry(
546 list_ent, struct page_info, list));
547 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
549 i++;
550 list_ent = mfn_to_page(mfn)->list.next;
551 }
552 }
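/* Relocate the guest-visible shared_info area to the given region-7 virtual
 * address, updating each vcpu's interrupt mask pointer and the per-CPU
 * psr.ic shadow address, then remap region 7. */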
554 unsigned long
555 domain_set_shared_info_va (unsigned long va)
556 {
557 struct vcpu *v = current;
558 struct domain *d = v->domain;
559 struct vcpu *v1;
561 /* Check virtual address:
562 must belong to region 7,
563 must be 64Kb aligned,
564 must not be within Xen virtual space. */
565 if ((va >> 61) != 7
566 || (va & 0xffffUL) != 0
567 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
568 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
570 /* Note: this doesn't work well if other cpus are already running.
571 However this is part of the spec :-) */
572 printf ("Domain set shared_info_va to 0x%016lx\n", va);
573 d->arch.shared_info_va = va;
575 for_each_vcpu (d, v1) {
576 VCPU(v1, interrupt_mask_addr) =
577 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
578 }
580 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
582 /* Remap the shared pages. */
583 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
585 return 0;
586 }
588 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
589 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
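/* With 8-byte longs this is 128 longs per chunk, i.e. 1024 bytes covering
 * 8192 page-dirty bits per copy/clear iteration. shadow_mode_control()
 * below implements the log-dirty operations (enable, clean, peek, flush,
 * off) that the live migration support in this changeset relies on. */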
591 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
592 {
593 unsigned int op = sc->op;
594 int rc = 0;
595 int i;
596 //struct vcpu *v;
598 if (unlikely(d == current->domain)) {
599 DPRINTK("Don't try to do a shadow op on yourself!\n");
600 return -EINVAL;
601 }
603 domain_pause(d);
605 switch (op)
606 {
607 case DOM0_SHADOW_CONTROL_OP_OFF:
608 if (shadow_mode_enabled (d)) {
609 u64 *bm = d->arch.shadow_bitmap;
611 /* Flush vhpt and tlb to restore dirty bit usage. */
612 domain_flush_tlb_vhpt(d);
614 /* Free bitmap. */
615 d->arch.shadow_bitmap_size = 0;
616 d->arch.shadow_bitmap = NULL;
617 xfree(bm);
618 }
619 break;
621 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
622 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
623 rc = -EINVAL;
624 break;
626 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
627 if (shadow_mode_enabled(d)) {
628 rc = -EINVAL;
629 break;
630 }
632 atomic64_set(&d->arch.shadow_fault_count, 0);
633 atomic64_set(&d->arch.shadow_dirty_count, 0);
635 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
636 ~(BITS_PER_LONG-1);
637 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
638 d->arch.shadow_bitmap_size / BITS_PER_LONG);
639 if (d->arch.shadow_bitmap == NULL) {
640 d->arch.shadow_bitmap_size = 0;
641 rc = -ENOMEM;
642 }
643 else {
644 memset(d->arch.shadow_bitmap, 0,
645 d->arch.shadow_bitmap_size / 8);
647 /* Flush vhpt and tlb to enable dirty bit
648 virtualization. */
649 domain_flush_tlb_vhpt(d);
650 }
651 break;
653 case DOM0_SHADOW_CONTROL_OP_FLUSH:
654 atomic64_set(&d->arch.shadow_fault_count, 0);
655 atomic64_set(&d->arch.shadow_dirty_count, 0);
656 break;
658 case DOM0_SHADOW_CONTROL_OP_CLEAN:
659 {
660 int nbr_longs;
662 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
663 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
665 atomic64_set(&d->arch.shadow_fault_count, 0);
666 atomic64_set(&d->arch.shadow_dirty_count, 0);
668 if (guest_handle_is_null(sc->dirty_bitmap) ||
669 (d->arch.shadow_bitmap == NULL)) {
670 rc = -EINVAL;
671 break;
672 }
674 if (sc->pages > d->arch.shadow_bitmap_size)
675 sc->pages = d->arch.shadow_bitmap_size;
677 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
679 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
680 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
681 SHADOW_COPY_CHUNK : nbr_longs - i;
683 if (copy_to_guest_offset(sc->dirty_bitmap, i,
684 d->arch.shadow_bitmap + i,
685 size)) {
686 rc = -EFAULT;
687 break;
688 }
690 memset(d->arch.shadow_bitmap + i,
691 0, size * sizeof(unsigned long));
692 }
694 break;
695 }
697 case DOM0_SHADOW_CONTROL_OP_PEEK:
698 {
699 unsigned long size;
701 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
702 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
704 if (guest_handle_is_null(sc->dirty_bitmap) ||
705 (d->arch.shadow_bitmap == NULL)) {
706 rc = -EINVAL;
707 break;
708 }
710 if (sc->pages > d->arch.shadow_bitmap_size)
711 sc->pages = d->arch.shadow_bitmap_size;
713 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
714 if (copy_to_guest(sc->dirty_bitmap,
715 d->arch.shadow_bitmap, size)) {
716 rc = -EFAULT;
717 break;
718 }
719 break;
720 }
721 default:
722 rc = -EINVAL;
723 break;
724 }
726 domain_unpause(d);
728 return rc;
729 }
731 // remove following line if not privifying in memory
732 //#define HAVE_PRIVIFY_MEMORY
733 #ifndef HAVE_PRIVIFY_MEMORY
734 #define privify_memory(x,y) do {} while(0)
735 #endif
737 // see arch/x86/xxx/domain_build.c
738 int elf_sanity_check(Elf_Ehdr *ehdr)
739 {
740 if (!(IS_ELF(*ehdr)))
741 {
742 printk("DOM0 image is not a Xen-compatible Elf image.\n");
743 return 0;
744 }
745 return 1;
746 }
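/* Copy each PT_LOAD segment of the domain's ELF image into the domain, page
 * by page via assign_new_domain_page(); dom0 takes a contiguous fast path
 * when CONFIG_DOMAIN0_CONTIGUOUS is set. BSS beyond p_filesz is zeroed. */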
748 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
749 {
750 char *elfbase = (char *) image_start;
751 Elf_Ehdr ehdr;
752 Elf_Phdr phdr;
753 int h, filesz, memsz;
754 unsigned long elfaddr, dom_mpaddr, dom_imva;
755 struct page_info *p;
757 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
758 for ( h = 0; h < ehdr.e_phnum; h++ ) {
759 memcpy(&phdr,
760 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
761 sizeof(Elf_Phdr));
762 if ((phdr.p_type != PT_LOAD))
763 continue;
765 filesz = phdr.p_filesz;
766 memsz = phdr.p_memsz;
767 elfaddr = (unsigned long) elfbase + phdr.p_offset;
768 dom_mpaddr = phdr.p_paddr;
770 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
771 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
772 if (d == dom0) {
773 if (dom_mpaddr+memsz>dom0_size)
774 panic("Dom0 doesn't fit in memory space!\n");
775 dom_imva = __va_ul(dom_mpaddr + dom0_start);
776 memcpy((void *)dom_imva, (void *)elfaddr, filesz);
777 if (memsz > filesz)
778 memset((void *)dom_imva+filesz, 0,
779 memsz-filesz);
780 //FIXME: This test for code seems to find a lot more than objdump -x does
781 if (phdr.p_flags & PF_X) {
782 privify_memory(dom_imva,filesz);
783 flush_icache_range (dom_imva, dom_imva+filesz);
784 }
785 }
786 else
787 #endif
788 while (memsz > 0) {
789 p = assign_new_domain_page(d,dom_mpaddr);
790 BUG_ON (unlikely(p == NULL));
791 dom_imva = __va_ul(page_to_maddr(p));
792 if (filesz > 0) {
793 if (filesz >= PAGE_SIZE)
794 memcpy((void *) dom_imva,
795 (void *) elfaddr,
796 PAGE_SIZE);
797 else {
798 // copy partial page
799 memcpy((void *) dom_imva,
800 (void *) elfaddr, filesz);
801 // zero the rest of page
802 memset((void *) dom_imva+filesz, 0,
803 PAGE_SIZE-filesz);
804 }
805 //FIXME: This test for code seems to find a lot more than objdump -x does
806 if (phdr.p_flags & PF_X) {
807 privify_memory(dom_imva,PAGE_SIZE);
808 flush_icache_range(dom_imva,
809 dom_imva+PAGE_SIZE);
810 }
811 }
812 else if (memsz > 0) {
813 /* always zero out entire page */
814 memset((void *) dom_imva, 0, PAGE_SIZE);
815 }
816 memsz -= PAGE_SIZE;
817 filesz -= PAGE_SIZE;
818 elfaddr += PAGE_SIZE;
819 dom_mpaddr += PAGE_SIZE;
820 }
821 }
822 }
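/* Validate and, if necessary, round up the dom0_mem/dom0_align parameters,
 * then (in the contiguous configuration) reserve dom0's memory from the boot
 * allocator. */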
824 void alloc_dom0(void)
825 {
826 /* Check dom0 size. */
827 if (dom0_size < 4 * 1024 * 1024) {
828 panic("dom0_mem is too small, boot aborted"
829 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
830 }
832 /* Check dom0 align. */
833 if ((dom0_align - 1) & dom0_align) { /* not a power of two */
834 panic("dom0_align (%lx) must be power of two, boot aborted"
835 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
836 dom0_align);
837 }
838 if (dom0_align < PAGE_SIZE) {
839 panic("dom0_align must be >= %ld, boot aborted"
840 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
841 PAGE_SIZE);
842 }
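/* Round dom0_size up to a multiple of dom0_align; e.g. dom0_mem=300M with
 * dom0_align=64M becomes 320M, while the 512M default is already aligned. */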
843 if (dom0_size % dom0_align) {
844 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
845 printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
846 dom0_size,dom0_align);
847 }
849 if (running_on_sim) {
850 dom0_size = 128*1024*1024; //FIXME: Should be configurable
851 }
852 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
853 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
855 /* FIXME: The first chunk (say 256M) should always be assigned to
856 * Dom0, since Dom0's physical addresses equal machine addresses for
857 * DMA purposes. Some old Linux versions, such as 2.4, assume physical
858 * memory exists in the second 64M of the address space.
859 */
860 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
861 dom0_start <<= PAGE_SHIFT;
862 if (!dom0_start) {
863 panic("alloc_dom0: can't allocate contiguous memory size=%lu\n",
864 dom0_size);
865 }
866 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
867 #else
868 // no need to allocate pages for now
869 // pages are allocated by map_new_domain_page() via loaddomainelfimage()
870 dom0_start = 0;
871 #endif
873 }
876 /*
877 * Domain 0 has unrestricted direct access to all devices. However, the
878 * main point of this stub is to allow alloc_dom_mem to handle requests
879 * with order > 0. Dom0 requires that in order to allocate memory for
880 * other domains.
881 */
882 static void physdev_init_dom0(struct domain *d)
883 {
884 if (iomem_permit_access(d, 0UL, ~0UL))
885 BUG();
886 if (irqs_permit_access(d, 0, NR_IRQS-1))
887 BUG();
888 }
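/* Build domain 0: load the kernel ELF image, place the initrd near the top
 * of dom0 memory and the start_info page after the kernel, set up the
 * firmware tables and ia64_boot_param, and initialise vcpu0's registers. */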
890 int construct_dom0(struct domain *d,
891 unsigned long image_start, unsigned long image_len,
892 unsigned long initrd_start, unsigned long initrd_len,
893 char *cmdline)
894 {
895 int i, rc;
896 unsigned long alloc_start, alloc_end;
897 start_info_t *si;
898 struct vcpu *v = d->vcpu[0];
899 unsigned long max_pages;
901 struct domain_setup_info dsi;
902 unsigned long p_start;
903 unsigned long pkern_start;
904 unsigned long pkern_entry;
905 unsigned long pkern_end;
906 unsigned long pinitrd_start = 0;
907 unsigned long pstart_info;
908 struct page_info *start_info_page;
909 unsigned long bp_mpa;
910 struct ia64_boot_param *bp;
912 #ifdef VALIDATE_VT
913 unsigned int vmx_dom0 = 0;
914 unsigned long mfn;
915 struct page_info *page = NULL;
916 #endif
918 //printf("construct_dom0: starting\n");
920 /* Sanity! */
921 BUG_ON(d != dom0);
922 BUG_ON(d->vcpu[0] == NULL);
923 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
925 memset(&dsi, 0, sizeof(struct domain_setup_info));
927 printk("*** LOADING DOMAIN 0 ***\n");
929 alloc_start = dom0_start;
930 alloc_end = dom0_start + dom0_size;
931 max_pages = dom0_size / PAGE_SIZE;
932 d->max_pages = max_pages;
933 #ifndef CONFIG_XEN_IA64_DOM0_VP
934 d->tot_pages = d->max_pages;
935 #else
936 d->tot_pages = 0;
937 #endif
938 dsi.image_addr = (unsigned long)image_start;
939 dsi.image_len = image_len;
940 rc = parseelfimage(&dsi);
941 if ( rc != 0 )
942 return rc;
944 #ifdef VALIDATE_VT
945 /* Temp workaround */
946 if (running_on_sim)
947 dsi.xen_section_string = (char *)1;
949 /* Check whether dom0 is vti domain */
950 if ((!vmx_enabled) && !dsi.xen_section_string) {
951 printk("Lack of hardware support for unmodified vmx dom0\n");
952 panic("");
953 }
955 if (vmx_enabled && !dsi.xen_section_string) {
956 printk("Dom0 is vmx domain!\n");
957 vmx_dom0 = 1;
958 }
959 #endif
961 p_start = dsi.v_start;
962 pkern_start = dsi.v_kernstart;
963 pkern_end = dsi.v_kernend;
964 pkern_entry = dsi.v_kernentry;
966 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
968 if ( (p_start & (PAGE_SIZE-1)) != 0 )
969 {
970 printk("Initial guest OS must load to a page boundary.\n");
971 return -EINVAL;
972 }
974 pstart_info = PAGE_ALIGN(pkern_end);
975 if(initrd_start && initrd_len){
976 unsigned long offset;
978 pinitrd_start= (dom0_start + dom0_size) -
979 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
980 if (pinitrd_start <= pstart_info)
981 panic("%s:enough memory is not assigned to dom0", __func__);
983 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
984 struct page_info *p;
985 p = assign_new_domain_page(d, pinitrd_start + offset);
986 if (p == NULL)
987 panic("%s: can't allocate page for initrd image", __func__);
988 if (initrd_len < offset + PAGE_SIZE)
989 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
990 initrd_len - offset);
991 else
992 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
993 }
994 }
996 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
997 " Kernel image: %lx->%lx\n"
998 " Entry address: %lx\n"
999 " Init. ramdisk: %lx len %lx\n"
1000 " Start info.: %lx->%lx\n",
1001 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1002 pstart_info, pstart_info + PAGE_SIZE);
1004 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1005 {
1006 printk("Initial guest OS requires too much space\n"
1007 "(%luMB is greater than %luMB limit)\n",
1008 (pkern_end-pkern_start)>>20,
1009 (max_pages <<PAGE_SHIFT)>>20);
1010 return -ENOMEM;
1011 }
1013 // if high 3 bits of pkern start are non-zero, error
1015 // if pkern end is after end of metaphysical memory, error
1016 // (we should be able to deal with this... later)
1018 /* Mask all upcalls... */
1019 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1020 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1022 if (dom0_max_vcpus == 0)
1023 dom0_max_vcpus = MAX_VIRT_CPUS;
1024 if (dom0_max_vcpus > num_online_cpus())
1025 dom0_max_vcpus = num_online_cpus();
1026 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1027 dom0_max_vcpus = MAX_VIRT_CPUS;
1029 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1030 for ( i = 1; i < dom0_max_vcpus; i++ )
1031 if (alloc_vcpu(d, i, i) == NULL)
1032 printf ("Cannot allocate dom0 vcpu %d\n", i);
1034 #if defined(VALIDATE_VT) && !defined(CONFIG_XEN_IA64_DOM0_VP)
1035 /* Construct a frame-allocation list for the initial domain, since these
1036 * pages are allocated by boot allocator and pfns are not set properly
1037 */
1038 for ( mfn = (alloc_start>>PAGE_SHIFT);
1039 mfn < (alloc_end>>PAGE_SHIFT);
1040 mfn++ )
1042 page = mfn_to_page(mfn);
1043 page_set_owner(page, d);
1044 page->u.inuse.type_info = 0;
1045 page->count_info = PGC_allocated | 1;
1046 list_add_tail(&page->list, &d->page_list);
1048 /* Construct 1:1 mapping */
1049 set_gpfn_from_mfn(mfn, mfn);
1051 #endif
1053 /* Copy the OS image. */
1054 loaddomainelfimage(d,image_start);
1056 /* Copy the initial ramdisk. */
1057 //if ( initrd_len != 0 )
1058 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1060 /* Set up start info area. */
1061 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1062 start_info_page = assign_new_domain_page(d, pstart_info);
1063 if (start_info_page == NULL)
1064 panic("can't allocate start info page");
1065 si = page_to_virt(start_info_page);
1066 memset(si, 0, PAGE_SIZE);
1067 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
1068 si->nr_pages = max_pages;
1069 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1071 printk("Dom0: 0x%lx\n", (u64)dom0);
1073 #ifdef VALIDATE_VT
1074 /* VMX specific construction for Dom0, if hardware supports VMX
1075 * and Dom0 is unmodified image
1076 */
1077 if (vmx_dom0)
1078 vmx_final_setup_guest(v);
1079 #endif
1081 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1083 /* Build firmware.
1084 Note: the Linux kernel reserves the memory used by start_info, so
1085 there is no need to remove it from the MDT. */
1086 bp_mpa = pstart_info + sizeof(struct start_info);
1087 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1089 /* Fill boot param. */
1090 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1091 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1093 bp = (struct ia64_boot_param *)(si + 1);
1094 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1096 /* We assume console has reached the last line! */
1097 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1098 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1099 bp->console_info.orig_x = 0;
1100 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1101 0 : bp->console_info.num_rows - 1;
1103 bp->initrd_start = (dom0_start+dom0_size) -
1104 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1105 bp->initrd_size = ia64_boot_param->initrd_size;
1107 vcpu_init_regs (v);
1109 vcpu_regs(v)->r28 = bp_mpa;
1111 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1112 pkern_entry += dom0_start;
1113 #endif
1114 vcpu_regs (v)->cr_iip = pkern_entry;
1116 physdev_init_dom0(d);
1118 // FIXME: Hack for keyboard input
1119 //serial_input_init();
1121 return 0;
1122 }
1124 void machine_restart(char * __unused)
1125 {
1126 console_start_sync();
1127 if (running_on_sim)
1128 printf ("machine_restart called. spinning...\n");
1129 else
1130 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1131 while(1);
1132 }
1134 void machine_halt(void)
1135 {
1136 console_start_sync();
1137 if (running_on_sim)
1138 printf ("machine_halt called. spinning...\n");
1139 else
1140 (*efi.reset_system)(EFI_RESET_SHUTDOWN,0,0,NULL);
1141 while(1);
1142 }
1144 void sync_vcpu_execstate(struct vcpu *v)
1145 {
1146 // __ia64_save_fpu(v->arch._thread.fph);
1147 // if (VMX_DOMAIN(v))
1148 // vmx_save_state(v);
1149 // FIXME SMP: Anything else needed here for SMP?
1150 }
1152 static void parse_dom0_mem(char *s)
1153 {
1154 dom0_size = parse_size_and_unit(s);
1155 }
1156 custom_param("dom0_mem", parse_dom0_mem);
1159 static void parse_dom0_align(char *s)
1160 {
1161 dom0_align = parse_size_and_unit(s);
1162 }
1163 custom_param("dom0_align", parse_dom0_align);