ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 10890:b786bfb058eb

[XEN] Hide compile-time information values behind a
functional interface, so that the values are contained
within a single object file.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Aug 01 15:28:09 2006 +0100 (2006-08-01)
parents 7be1cfe8345b
children 47a5dfd1bcd6
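
The "functional interface" referred to above is what this file consumes further
down: construct_dom0() builds the start_info magic string via xen_major_version()
and xen_minor_version() rather than reading version macros directly. A minimal
sketch of such an accessor file, assuming macro names like XEN_VERSION and
XEN_SUBVERSION from the generated compile.h (the real implementation lives in
Xen's common code and may differ):

    /* version.c sketch: the only object file that needs the generated
     * compile-time values; every other file links against these accessors. */
    #include <xen/compile.h>   /* assumed to define XEN_VERSION, XEN_SUBVERSION */
    #include <xen/version.h>

    unsigned int xen_major_version(void)
    {
        return XEN_VERSION;
    }

    unsigned int xen_minor_version(void)
    {
        return XEN_SUBVERSION;
    }
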
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/arch-ia64.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <public/arch-ia64.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <asm/privop_stat.h>
52 #ifndef CONFIG_XEN_IA64_DOM0_VP
53 #define CONFIG_DOMAIN0_CONTIGUOUS
54 #endif
55 unsigned long dom0_start = -1L;
56 unsigned long dom0_size = 512*1024*1024;
57 unsigned long dom0_align = 64*1024*1024;
59 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
60 static unsigned int dom0_max_vcpus = 1;
61 integer_param("dom0_max_vcpus", dom0_max_vcpus);
63 extern unsigned long running_on_sim;
65 extern char dom0_command_line[];
67 /* FIXME: where should these declarations live? */
68 extern void serial_input_init(void);
69 static void init_switch_stack(struct vcpu *v);
70 extern void vmx_do_launch(struct vcpu *);
72 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
73 extern struct vcpu *ia64_switch_to (struct vcpu *next_task);
75 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
76 This is a Xen virtual address. */
77 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
78 DEFINE_PER_CPU(int *, current_psr_ic_addr);
80 #include <xen/sched-if.h>
82 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
83 {
84 int cpu = smp_processor_id();
85 int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
86 int last_processor = vcpu->arch.last_processor;
88 if (is_idle_domain(vcpu->domain))
89 return;
91 vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
92 vcpu->arch.last_processor = cpu;
94 if ((last_vcpu_id != vcpu->vcpu_id &&
95 last_vcpu_id != INVALID_VCPU_ID) ||
96 (last_vcpu_id == vcpu->vcpu_id &&
97 last_processor != cpu &&
98 last_processor != INVALID_PROCESSOR)) {
100 // if the vTLB implementation is changed,
101 // the following must be updated as well.
102 if (VMX_DOMAIN(vcpu)) {
103 // currently the vTLB for a VT-i domain is per vcpu,
104 // so no flushing is needed.
105 } else {
106 vhpt_flush();
107 }
108 local_flush_tlb_all();
109 }
110 }
112 void schedule_tail(struct vcpu *prev)
113 {
114 extern char ia64_ivt;
115 context_saved(prev);
117 if (VMX_DOMAIN(current)) {
118 vmx_do_launch(current);
119 } else {
120 ia64_set_iva(&ia64_ivt);
121 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
122 VHPT_ENABLED);
123 load_region_regs(current);
124 vcpu_load_kernel_regs(current);
125 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
126 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
127 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
128 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
129 }
130 flush_vtlb_for_context_switch(current);
131 }
133 void context_switch(struct vcpu *prev, struct vcpu *next)
134 {
135 uint64_t spsr;
136 uint64_t pta;
138 local_irq_save(spsr);
139 context_switch_count++;
141 __ia64_save_fpu(prev->arch._thread.fph);
142 __ia64_load_fpu(next->arch._thread.fph);
143 if (VMX_DOMAIN(prev))
144 vmx_save_state(prev);
145 if (VMX_DOMAIN(next))
146 vmx_load_state(next);
147 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
148 prev = ia64_switch_to(next);
150 /* Note: ia64_switch_to does not return here at vcpu initialization. */
152 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
154 // leave this debug for now: it acts as a heartbeat when more than
155 // one domain is active
156 {
157 static long cnt[16] = { 50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50};
158 static int i = 100;
159 int id = ((struct vcpu *)current)->domain->domain_id & 0xf;
160 if (!cnt[id]--) { cnt[id] = 500000; printk("%x",id); }
161 if (!i--) { i = 1000000; printk("+"); }
162 }
164 if (VMX_DOMAIN(current)){
165 vmx_load_all_rr(current);
166 } else {
167 struct domain *nd;
168 extern char ia64_ivt;
170 ia64_set_iva(&ia64_ivt);
172 nd = current->domain;
173 if (!is_idle_domain(nd)) {
174 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
175 VHPT_ENABLED);
176 load_region_regs(current);
177 vcpu_load_kernel_regs(current);
178 vcpu_set_next_timer(current);
179 if (vcpu_timer_expired(current))
180 vcpu_pend_timer(current);
181 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
182 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
183 __ia64_per_cpu_var(current_psr_ic_addr) =
184 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
185 } else {
186 /* When switching to the idle domain, we only need to disable the vhpt
187 * walker. All accesses within the idle context will then
188 * be handled by TR mappings and the identity mapping.
189 */
190 pta = ia64_get_pta();
191 ia64_set_pta(pta & ~VHPT_ENABLED);
192 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
193 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
194 }
195 }
196 flush_vtlb_for_context_switch(current);
197 local_irq_restore(spsr);
198 context_saved(prev);
199 }
201 void continue_running(struct vcpu *same)
202 {
203 /* nothing to do */
204 }
206 static void default_idle(void)
207 {
208 local_irq_disable();
209 if ( !softirq_pending(smp_processor_id()) )
210 safe_halt();
211 local_irq_enable();
212 }
214 static void continue_cpu_idle_loop(void)
215 {
216 for ( ; ; )
217 {
218 #ifdef IA64
219 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
220 #else
221 irq_stat[cpu].idle_timestamp = jiffies;
222 #endif
223 while ( !softirq_pending(smp_processor_id()) )
224 default_idle();
225 raise_softirq(SCHEDULE_SOFTIRQ);
226 do_softirq();
227 }
228 }
230 void startup_cpu_idle_loop(void)
231 {
232 /* Just some sanity to ensure that the scheduler is set up okay. */
233 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
234 raise_softirq(SCHEDULE_SOFTIRQ);
236 continue_cpu_idle_loop();
237 }
239 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
240 {
241 struct vcpu *v;
242 struct thread_info *ti;
244 /* Keep idle vcpu0 statically allocated at compile time, because
245 * some code inherited from Linux still requires it in the early phase.
246 */
247 if (is_idle_domain(d) && !vcpu_id)
248 v = idle_vcpu[0];
249 else {
250 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
251 return NULL;
252 memset(v, 0, sizeof(*v));
254 ti = alloc_thread_info(v);
255 /* Clear thread_info to reset important fields, such as
256 * preempt_count.
257 */
258 memset(ti, 0, sizeof(struct thread_info));
259 init_switch_stack(v);
260 }
262 if (!is_idle_domain(d)) {
263 if (!d->arch.is_vti) {
264 /* Create privregs page only if not VTi. */
265 v->arch.privregs =
266 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
267 BUG_ON(v->arch.privregs == NULL);
268 memset(v->arch.privregs, 0, PAGE_SIZE);
269 share_xen_page_with_guest(virt_to_page(v->arch.privregs),
270 d, XENSHARE_writable);
271 }
273 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
274 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
275 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
276 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
278 /* Is this correct?
279 It depends on how the domain uses rids.
281 A domain may share rids among its processors (e.g. when it has a
282 global VHPT). In this case, we should also share rids
283 among vcpus, and the rid ranges should be the same.
285 However, a domain may instead allocate rids per cpu. In
286 this case we don't want to share rids among vcpus, but we may
287 do so if two vcpus are on the same cpu... */
289 v->arch.starting_rid = d->arch.starting_rid;
290 v->arch.ending_rid = d->arch.ending_rid;
291 v->arch.breakimm = d->arch.breakimm;
292 v->arch.last_processor = INVALID_PROCESSOR;
293 }
295 return v;
296 }
298 void free_vcpu_struct(struct vcpu *v)
299 {
300 if (VMX_DOMAIN(v))
301 vmx_relinquish_vcpu_resources(v);
302 else {
303 if (v->arch.privregs != NULL)
304 free_xenheap_pages(v->arch.privregs,
305 get_order_from_shift(XMAPPEDREGS_SHIFT));
306 }
308 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
309 }
311 static void init_switch_stack(struct vcpu *v)
312 {
313 struct pt_regs *regs = vcpu_regs (v);
314 struct switch_stack *sw = (struct switch_stack *) regs - 1;
315 extern void ia64_ret_from_clone;
317 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
318 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
319 sw->b0 = (unsigned long) &ia64_ret_from_clone;
320 sw->ar_fpsr = FPSR_DEFAULT;
321 v->arch._thread.ksp = (unsigned long) sw - 16;
322 // stay on the kernel stack because we may get interrupts!
323 // ia64_ret_from_clone switches to the user stack
324 v->arch._thread.on_ustack = 0;
325 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
326 }
328 int arch_domain_create(struct domain *d)
329 {
330 int i;
332 // the following will eventually need to be negotiated dynamically
333 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
334 d->arch.breakimm = 0x1000;
335 for (i = 0; i < NR_CPUS; i++) {
336 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
337 }
339 if (is_idle_domain(d))
340 return 0;
342 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
343 if (d->shared_info == NULL)
344 goto fail_nomem;
345 memset(d->shared_info, 0, XSI_SIZE);
346 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
347 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
348 d, XENSHARE_writable);
350 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
351 /* We may also need an emulation rid for region 4, though it's unlikely
352 * that a guest will issue uncacheable accesses in metaphysical mode.
353 * Still, it may be saner to keep such info here.
354 */
355 if (!allocate_rid_range(d,0))
356 goto fail_nomem;
358 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
360 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
361 goto fail_nomem;
363 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
364 RANGESETF_prettyprint_hex);
366 printf ("arch_domain_create: domain=%p\n", d);
367 return 0;
369 fail_nomem:
370 if (d->arch.mm.pgd != NULL)
371 pgd_free(d->arch.mm.pgd);
372 if (d->shared_info != NULL)
373 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
374 return -ENOMEM;
375 }
377 void arch_domain_destroy(struct domain *d)
378 {
379 BUG_ON(d->arch.mm.pgd != NULL);
380 if (d->shared_info != NULL)
381 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
382 if (d->arch.shadow_bitmap != NULL)
383 xfree(d->arch.shadow_bitmap);
385 /* Clear vTLB for the next domain. */
386 domain_flush_tlb_vhpt(d);
388 deallocate_rid_range(d);
389 }
391 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
392 {
393 int i;
394 struct vcpu_extra_regs *er = &c->extra_regs;
396 c->user_regs = *vcpu_regs (v);
397 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
399 /* Fill extra regs. */
400 for (i = 0; i < 8; i++) {
401 er->itrs[i].pte = v->arch.itrs[i].pte.val;
402 er->itrs[i].itir = v->arch.itrs[i].itir;
403 er->itrs[i].vadr = v->arch.itrs[i].vadr;
404 er->itrs[i].rid = v->arch.itrs[i].rid;
405 }
406 for (i = 0; i < 8; i++) {
407 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
408 er->dtrs[i].itir = v->arch.dtrs[i].itir;
409 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
410 er->dtrs[i].rid = v->arch.dtrs[i].rid;
411 }
412 er->event_callback_ip = v->arch.event_callback_ip;
413 er->dcr = v->arch.dcr;
414 er->iva = v->arch.iva;
415 }
417 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
418 {
419 struct pt_regs *regs = vcpu_regs (v);
420 struct domain *d = v->domain;
422 *regs = c->user_regs;
424 if (!d->arch.is_vti) {
425 /* domain runs at PL2/3 */
426 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
427 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
428 }
430 if (c->flags & VGCF_EXTRA_REGS) {
431 int i;
432 struct vcpu_extra_regs *er = &c->extra_regs;
434 for (i = 0; i < 8; i++) {
435 vcpu_set_itr(v, i, er->itrs[i].pte,
436 er->itrs[i].itir,
437 er->itrs[i].vadr,
438 er->itrs[i].rid);
439 }
440 for (i = 0; i < 8; i++) {
441 vcpu_set_dtr(v, i,
442 er->dtrs[i].pte,
443 er->dtrs[i].itir,
444 er->dtrs[i].vadr,
445 er->dtrs[i].rid);
446 }
447 v->arch.event_callback_ip = er->event_callback_ip;
448 v->arch.dcr = er->dcr;
449 v->arch.iva = er->iva;
450 }
452 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
453 return 0;
454 if (d->arch.is_vti)
455 vmx_final_setup_guest(v);
457 /* This overrides some registers. */
458 vcpu_init_regs(v);
460 /* Don't redo final setup */
461 set_bit(_VCPUF_initialised, &v->vcpu_flags);
462 return 0;
463 }
465 static void relinquish_memory(struct domain *d, struct list_head *list)
466 {
467 struct list_head *ent;
468 struct page_info *page;
469 #ifndef __ia64__
470 unsigned long x, y;
471 #endif
473 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
474 spin_lock_recursive(&d->page_alloc_lock);
475 ent = list->next;
476 while ( ent != list )
477 {
478 page = list_entry(ent, struct page_info, list);
479 /* Grab a reference to the page so it won't disappear from under us. */
480 if ( unlikely(!get_page(page, d)) )
481 {
482 /* Couldn't get a reference -- someone is freeing this page. */
483 ent = ent->next;
484 continue;
485 }
487 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
488 put_page_and_type(page);
490 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
491 put_page(page);
493 #ifndef __ia64__
494 /*
495 * Forcibly invalidate base page tables at this point to break circular
496 * 'linear page table' references. This is okay because MMU structures
497 * are not shared across domains and this domain is now dead. Thus base
498 * tables are not in use so a non-zero count means circular reference.
499 */
500 y = page->u.inuse.type_info;
501 for ( ; ; )
502 {
503 x = y;
504 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
505 (PGT_base_page_table|PGT_validated)) )
506 break;
508 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
509 if ( likely(y == x) )
510 {
511 free_page_type(page, PGT_base_page_table);
512 break;
513 }
514 }
515 #endif
517 /* Follow the list chain and /then/ potentially free the page. */
518 ent = ent->next;
519 #ifdef CONFIG_XEN_IA64_DOM0_VP
520 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
521 #endif
522 put_page(page);
523 }
525 spin_unlock_recursive(&d->page_alloc_lock);
526 }
528 void domain_relinquish_resources(struct domain *d)
529 {
530 /* Relinquish every page of memory. */
532 // release pages by traversing d->arch.mm.
533 relinquish_mm(d);
535 relinquish_memory(d, &d->xenpage_list);
536 relinquish_memory(d, &d->page_list);
538 if (d->arch.is_vti && d->arch.sal_data)
539 xfree(d->arch.sal_data);
540 }
542 void build_physmap_table(struct domain *d)
543 {
544 struct list_head *list_ent = d->page_list.next;
545 unsigned long mfn, i = 0;
547 while(list_ent != &d->page_list) {
548 mfn = page_to_mfn(list_entry(
549 list_ent, struct page_info, list));
550 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
552 i++;
553 list_ent = mfn_to_page(mfn)->list.next;
554 }
555 }
557 unsigned long
558 domain_set_shared_info_va (unsigned long va)
559 {
560 struct vcpu *v = current;
561 struct domain *d = v->domain;
562 struct vcpu *v1;
564 /* Check virtual address:
565 must belong to region 7,
566 must be 64Kb aligned,
567 must not be within Xen virtual space. */
568 if ((va >> 61) != 7
569 || (va & 0xffffUL) != 0
570 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
571 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
573 /* Note: this doesn't work well if other cpus are already running.
574 However this is part of the spec :-) */
575 printf ("Domain set shared_info_va to 0x%016lx\n", va);
576 d->arch.shared_info_va = va;
578 for_each_vcpu (d, v1) {
579 VCPU(v1, interrupt_mask_addr) =
580 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
581 }
583 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
585 /* Remap the shared pages. */
586 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
588 return 0;
589 }
591 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
592 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
594 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
595 {
596 unsigned int op = sc->op;
597 int rc = 0;
598 int i;
599 //struct vcpu *v;
601 if (unlikely(d == current->domain)) {
602 DPRINTK("Don't try to do a shadow op on yourself!\n");
603 return -EINVAL;
604 }
606 domain_pause(d);
608 switch (op)
609 {
610 case DOM0_SHADOW_CONTROL_OP_OFF:
611 if (shadow_mode_enabled (d)) {
612 u64 *bm = d->arch.shadow_bitmap;
614 /* Flush vhpt and tlb to restore dirty bit usage. */
615 domain_flush_tlb_vhpt(d);
617 /* Free bitmap. */
618 d->arch.shadow_bitmap_size = 0;
619 d->arch.shadow_bitmap = NULL;
620 xfree(bm);
621 }
622 break;
624 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
625 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
626 rc = -EINVAL;
627 break;
629 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
630 if (shadow_mode_enabled(d)) {
631 rc = -EINVAL;
632 break;
633 }
635 atomic64_set(&d->arch.shadow_fault_count, 0);
636 atomic64_set(&d->arch.shadow_dirty_count, 0);
638 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
639 ~(BITS_PER_LONG-1);
640 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
641 d->arch.shadow_bitmap_size / BITS_PER_LONG);
642 if (d->arch.shadow_bitmap == NULL) {
643 d->arch.shadow_bitmap_size = 0;
644 rc = -ENOMEM;
645 }
646 else {
647 memset(d->arch.shadow_bitmap, 0,
648 d->arch.shadow_bitmap_size / 8);
650 /* Flush vhpt and tlb to enable dirty bit
651 virtualization. */
652 domain_flush_tlb_vhpt(d);
653 }
654 break;
656 case DOM0_SHADOW_CONTROL_OP_FLUSH:
657 atomic64_set(&d->arch.shadow_fault_count, 0);
658 atomic64_set(&d->arch.shadow_dirty_count, 0);
659 break;
661 case DOM0_SHADOW_CONTROL_OP_CLEAN:
662 {
663 int nbr_longs;
665 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
666 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
668 atomic64_set(&d->arch.shadow_fault_count, 0);
669 atomic64_set(&d->arch.shadow_dirty_count, 0);
671 if (guest_handle_is_null(sc->dirty_bitmap) ||
672 (d->arch.shadow_bitmap == NULL)) {
673 rc = -EINVAL;
674 break;
675 }
677 if (sc->pages > d->arch.shadow_bitmap_size)
678 sc->pages = d->arch.shadow_bitmap_size;
680 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
682 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
683 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
684 SHADOW_COPY_CHUNK : nbr_longs - i;
686 if (copy_to_guest_offset(sc->dirty_bitmap, i,
687 d->arch.shadow_bitmap + i,
688 size)) {
689 rc = -EFAULT;
690 break;
691 }
693 memset(d->arch.shadow_bitmap + i,
694 0, size * sizeof(unsigned long));
695 }
697 break;
698 }
700 case DOM0_SHADOW_CONTROL_OP_PEEK:
701 {
702 unsigned long size;
704 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
705 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
707 if (guest_handle_is_null(sc->dirty_bitmap) ||
708 (d->arch.shadow_bitmap == NULL)) {
709 rc = -EINVAL;
710 break;
711 }
713 if (sc->pages > d->arch.shadow_bitmap_size)
714 sc->pages = d->arch.shadow_bitmap_size;
716 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
717 if (copy_to_guest(sc->dirty_bitmap,
718 d->arch.shadow_bitmap, size)) {
719 rc = -EFAULT;
720 break;
721 }
722 break;
723 }
724 default:
725 rc = -EINVAL;
726 break;
727 }
729 domain_unpause(d);
731 return rc;
732 }
734 // remove the following line if not privifying in memory
735 //#define HAVE_PRIVIFY_MEMORY
736 #ifndef HAVE_PRIVIFY_MEMORY
737 #define privify_memory(x,y) do {} while(0)
738 #endif
740 // see arch/x86/xxx/domain_build.c
741 int elf_sanity_check(Elf_Ehdr *ehdr)
742 {
743 if (!(IS_ELF(*ehdr)))
744 {
745 printk("DOM0 image is not a Xen-compatible Elf image.\n");
746 return 0;
747 }
748 return 1;
749 }
751 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
752 {
753 char *elfbase = (char *) image_start;
754 Elf_Ehdr ehdr;
755 Elf_Phdr phdr;
756 int h, filesz, memsz;
757 unsigned long elfaddr, dom_mpaddr, dom_imva;
758 struct page_info *p;
760 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
761 for ( h = 0; h < ehdr.e_phnum; h++ ) {
762 memcpy(&phdr,
763 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
764 sizeof(Elf_Phdr));
765 if ((phdr.p_type != PT_LOAD))
766 continue;
768 filesz = phdr.p_filesz;
769 memsz = phdr.p_memsz;
770 elfaddr = (unsigned long) elfbase + phdr.p_offset;
771 dom_mpaddr = phdr.p_paddr;
773 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
774 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
775 if (d == dom0) {
776 if (dom_mpaddr+memsz>dom0_size)
777 panic("Dom0 doesn't fit in memory space!\n");
778 dom_imva = __va_ul(dom_mpaddr + dom0_start);
779 memcpy((void *)dom_imva, (void *)elfaddr, filesz);
780 if (memsz > filesz)
781 memset((void *)dom_imva+filesz, 0,
782 memsz-filesz);
783 //FIXME: This test for code seems to find a lot more than objdump -x does
784 if (phdr.p_flags & PF_X) {
785 privify_memory(dom_imva,filesz);
786 flush_icache_range (dom_imva, dom_imva+filesz);
787 }
788 }
789 else
790 #endif
791 while (memsz > 0) {
792 p = assign_new_domain_page(d,dom_mpaddr);
793 BUG_ON (unlikely(p == NULL));
794 dom_imva = __va_ul(page_to_maddr(p));
795 if (filesz > 0) {
796 if (filesz >= PAGE_SIZE)
797 memcpy((void *) dom_imva,
798 (void *) elfaddr,
799 PAGE_SIZE);
800 else {
801 // copy partial page
802 memcpy((void *) dom_imva,
803 (void *) elfaddr, filesz);
804 // zero the rest of page
805 memset((void *) dom_imva+filesz, 0,
806 PAGE_SIZE-filesz);
807 }
808 //FIXME: This test for code seems to find a lot more than objdump -x does
809 if (phdr.p_flags & PF_X) {
810 privify_memory(dom_imva,PAGE_SIZE);
811 flush_icache_range(dom_imva,
812 dom_imva+PAGE_SIZE);
813 }
814 }
815 else if (memsz > 0) {
816 /* always zero out entire page */
817 memset((void *) dom_imva, 0, PAGE_SIZE);
818 }
819 memsz -= PAGE_SIZE;
820 filesz -= PAGE_SIZE;
821 elfaddr += PAGE_SIZE;
822 dom_mpaddr += PAGE_SIZE;
823 }
824 }
825 }
827 void alloc_dom0(void)
828 {
829 /* Check dom0 size. */
830 if (dom0_size < 4 * 1024 * 1024) {
831 panic("dom0_mem is too small, boot aborted"
832 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
833 }
835 /* Check dom0 align. */
836 if ((dom0_align - 1) & dom0_align) { /* not a power of two */
837 panic("dom0_align (%lx) must be power of two, boot aborted"
838 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
839 dom0_align);
840 }
841 if (dom0_align < PAGE_SIZE) {
842 panic("dom0_align must be >= %ld, boot aborted"
843 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
844 PAGE_SIZE);
845 }
846 if (dom0_size % dom0_align) {
847 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
848 printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
849 dom0_size,dom0_align);
850 }
852 if (running_on_sim) {
853 dom0_size = 128*1024*1024; //FIXME: Should be configurable
854 }
855 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
856 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
858 /* FIXME: The first chunk (say 256M) should always be assigned to
859 * Dom0, since Dom0's physical address == machine address for DMA purposes.
860 * Some older Linux versions, like 2.4, assume physical memory exists
861 * in the 2nd 64M of the address space.
862 */
863 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
864 dom0_start <<= PAGE_SHIFT;
865 if (!dom0_start) {
866 panic("alloc_dom0: can't allocate contiguous memory size=%lu\n",
867 dom0_size);
868 }
869 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
870 #else
871 // no need to allocate pages for now
872 // pages are allocated by map_new_domain_page() via loaddomainelfimage()
873 dom0_start = 0;
874 #endif
876 }
879 /*
880 * Domain 0 has direct access to all devices. However, the major
881 * point of this stub is to allow alloc_dom_mem to handle requests
882 * with order > 0. Dom0 needs that capability in order to
883 * allocate memory for other domains.
884 */
885 static void physdev_init_dom0(struct domain *d)
886 {
887 if (iomem_permit_access(d, 0UL, ~0UL))
888 BUG();
889 if (irqs_permit_access(d, 0, NR_IRQS-1))
890 BUG();
891 if (ioports_permit_access(d, 0, 0xffff))
892 BUG();
893 }
895 int construct_dom0(struct domain *d,
896 unsigned long image_start, unsigned long image_len,
897 unsigned long initrd_start, unsigned long initrd_len,
898 char *cmdline)
899 {
900 int i, rc;
901 unsigned long alloc_start, alloc_end;
902 start_info_t *si;
903 struct vcpu *v = d->vcpu[0];
904 unsigned long max_pages;
906 struct domain_setup_info dsi;
907 unsigned long p_start;
908 unsigned long pkern_start;
909 unsigned long pkern_entry;
910 unsigned long pkern_end;
911 unsigned long pinitrd_start = 0;
912 unsigned long pstart_info;
913 struct page_info *start_info_page;
914 unsigned long bp_mpa;
915 struct ia64_boot_param *bp;
917 #ifdef VALIDATE_VT
918 unsigned int vmx_dom0 = 0;
919 unsigned long mfn;
920 struct page_info *page = NULL;
921 #endif
923 //printf("construct_dom0: starting\n");
925 /* Sanity! */
926 BUG_ON(d != dom0);
927 BUG_ON(d->vcpu[0] == NULL);
928 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
930 memset(&dsi, 0, sizeof(struct domain_setup_info));
932 printk("*** LOADING DOMAIN 0 ***\n");
934 alloc_start = dom0_start;
935 alloc_end = dom0_start + dom0_size;
936 max_pages = dom0_size / PAGE_SIZE;
937 d->max_pages = max_pages;
938 #ifndef CONFIG_XEN_IA64_DOM0_VP
939 d->tot_pages = d->max_pages;
940 #else
941 d->tot_pages = 0;
942 #endif
943 dsi.image_addr = (unsigned long)image_start;
944 dsi.image_len = image_len;
945 rc = parseelfimage(&dsi);
946 if ( rc != 0 )
947 return rc;
949 #ifdef VALIDATE_VT
950 /* Temp workaround */
951 if (running_on_sim)
952 dsi.xen_section_string = (char *)1;
954 /* Check whether dom0 is a VT-i domain */
955 if ((!vmx_enabled) && !dsi.xen_section_string) {
956 printk("Lack of hardware support for unmodified vmx dom0\n");
957 panic("");
958 }
960 if (vmx_enabled && !dsi.xen_section_string) {
961 printk("Dom0 is vmx domain!\n");
962 vmx_dom0 = 1;
963 }
964 #endif
966 p_start = dsi.v_start;
967 pkern_start = dsi.v_kernstart;
968 pkern_end = dsi.v_kernend;
969 pkern_entry = dsi.v_kernentry;
971 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
973 if ( (p_start & (PAGE_SIZE-1)) != 0 )
974 {
975 printk("Initial guest OS must load to a page boundary.\n");
976 return -EINVAL;
977 }
979 pstart_info = PAGE_ALIGN(pkern_end);
980 if(initrd_start && initrd_len){
981 unsigned long offset;
983 pinitrd_start= (dom0_start + dom0_size) -
984 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
985 if (pinitrd_start <= pstart_info)
986 panic("%s: not enough memory assigned to dom0", __func__);
988 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
989 struct page_info *p;
990 p = assign_new_domain_page(d, pinitrd_start + offset);
991 if (p == NULL)
992 panic("%s: can't allocate page for initrd image", __func__);
993 if (initrd_len < offset + PAGE_SIZE)
994 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
995 initrd_len - offset);
996 else
997 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
998 }
999 }
1001 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1002 " Kernel image: %lx->%lx\n"
1003 " Entry address: %lx\n"
1004 " Init. ramdisk: %lx len %lx\n"
1005 " Start info.: %lx->%lx\n",
1006 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1007 pstart_info, pstart_info + PAGE_SIZE);
1009 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1010 {
1011 printk("Initial guest OS requires too much space\n"
1012 "(%luMB is greater than %luMB limit)\n",
1013 (pkern_end-pkern_start)>>20,
1014 (max_pages <<PAGE_SHIFT)>>20);
1015 return -ENOMEM;
1016 }
1018 // if high 3 bits of pkern start are non-zero, error
1020 // if pkern end is after end of metaphysical memory, error
1021 // (we should be able to deal with this... later)
1023 /* Mask all upcalls... */
1024 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1025 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1027 if (dom0_max_vcpus == 0)
1028 dom0_max_vcpus = MAX_VIRT_CPUS;
1029 if (dom0_max_vcpus > num_online_cpus())
1030 dom0_max_vcpus = num_online_cpus();
1031 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1032 dom0_max_vcpus = MAX_VIRT_CPUS;
1034 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1035 for ( i = 1; i < dom0_max_vcpus; i++ )
1036 if (alloc_vcpu(d, i, i) == NULL)
1037 printf ("Cannot allocate dom0 vcpu %d\n", i);
1039 #if defined(VALIDATE_VT) && !defined(CONFIG_XEN_IA64_DOM0_VP)
1040 /* Construct a frame-allocation list for the initial domain, since these
1041 * pages are allocated by the boot allocator and their pfns are not set properly.
1042 */
1043 for ( mfn = (alloc_start>>PAGE_SHIFT);
1044 mfn < (alloc_end>>PAGE_SHIFT);
1045 mfn++ )
1046 {
1047 page = mfn_to_page(mfn);
1048 page_set_owner(page, d);
1049 page->u.inuse.type_info = 0;
1050 page->count_info = PGC_allocated | 1;
1051 list_add_tail(&page->list, &d->page_list);
1053 /* Construct 1:1 mapping */
1054 set_gpfn_from_mfn(mfn, mfn);
1055 }
1056 #endif
1058 /* Copy the OS image. */
1059 loaddomainelfimage(d,image_start);
1061 /* Copy the initial ramdisk. */
1062 //if ( initrd_len != 0 )
1063 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1065 /* Set up start info area. */
1066 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1067 start_info_page = assign_new_domain_page(d, pstart_info);
1068 if (start_info_page == NULL)
1069 panic("can't allocate start info page");
1070 si = page_to_virt(start_info_page);
1071 memset(si, 0, PAGE_SIZE);
1072 sprintf(si->magic, "xen-%i.%i-ia64",
1073 xen_major_version(), xen_minor_version());
1074 si->nr_pages = max_pages;
1075 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1077 printk("Dom0: 0x%lx\n", (u64)dom0);
1079 #ifdef VALIDATE_VT
1080 /* VMX specific construction for Dom0, if hardware supports VMX
1081 * and Dom0 is an unmodified image.
1082 */
1083 if (vmx_dom0)
1084 vmx_final_setup_guest(v);
1085 #endif
1087 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1089 /* Build firmware.
1090 Note: the Linux kernel reserves the memory used by start_info, so there is
1091 no need to remove it from MDT. */
1092 bp_mpa = pstart_info + sizeof(struct start_info);
1093 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1095 /* Fill boot param. */
1096 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1097 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1099 bp = (struct ia64_boot_param *)(si + 1);
1100 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1102 /* We assume console has reached the last line! */
1103 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1104 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1105 bp->console_info.orig_x = 0;
1106 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1107 0 : bp->console_info.num_rows - 1;
1109 bp->initrd_start = (dom0_start+dom0_size) -
1110 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1111 bp->initrd_size = ia64_boot_param->initrd_size;
1113 vcpu_init_regs (v);
1115 vcpu_regs(v)->r28 = bp_mpa;
1117 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1118 pkern_entry += dom0_start;
1119 #endif
1120 vcpu_regs (v)->cr_iip = pkern_entry;
1122 physdev_init_dom0(d);
1124 // FIXME: Hack for keyboard input
1125 //serial_input_init();
1127 return 0;
1128 }
1130 void machine_restart(char * __unused)
1131 {
1132 console_start_sync();
1133 if (running_on_sim)
1134 printf ("machine_restart called. spinning...\n");
1135 else
1136 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1137 while(1);
1138 }
1140 void machine_halt(void)
1141 {
1142 console_start_sync();
1143 if (running_on_sim)
1144 printf ("machine_halt called. spinning...\n");
1145 else
1146 (*efi.reset_system)(EFI_RESET_SHUTDOWN,0,0,NULL);
1147 while(1);
1148 }
1150 void sync_vcpu_execstate(struct vcpu *v)
1151 {
1152 // __ia64_save_fpu(v->arch._thread.fph);
1153 // if (VMX_DOMAIN(v))
1154 // vmx_save_state(v);
1155 // FIXME SMP: Anything else needed here for SMP?
1156 }
1158 static void parse_dom0_mem(char *s)
1159 {
1160 dom0_size = parse_size_and_unit(s);
1161 }
1162 custom_param("dom0_mem", parse_dom0_mem);
1165 static void parse_dom0_align(char *s)
1166 {
1167 dom0_align = parse_size_and_unit(s);
1168 }
1169 custom_param("dom0_align", parse_dom0_align);