ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 10929:7cde0d938ef4

[IA64] convert more privop_stat to perfc

Convert most privop stats to perfc.

Signed-off-by: Tristan Gingold <tristan.gingold@bull.net>
author awilliam@xenbuild.aw
date Fri Aug 04 09:02:43 2006 -0600 (2006-08-04)
parents 3d6c1af609bf
children c3e20511c745
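Note: the pattern behind this conversion is replacing the ad-hoc counters kept in privop_stat.c with Xen's generic performance counters. Below is a minimal sketch of that pattern, not code from this changeset; the counter name and description are hypothetical, and it assumes the perfc_defn.h / perfc_incrc() interface used by Xen trees of this era.

    /* In the arch perfc_defn.h: declare a per-cpu performance counter. */
    PERFCOUNTER_CPU(emulated_privop, "emulated privileged op")

    /* At the emulation site (which pulls in xen/perfc.h), bump the generic
     * counter instead of a private privop_stat array entry. */
    perfc_incrc(emulated_privop);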
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/arch-ia64.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <public/arch-ia64.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <asm/privop_stat.h>
52 #ifndef CONFIG_XEN_IA64_DOM0_VP
53 #define CONFIG_DOMAIN0_CONTIGUOUS
54 #endif
55 unsigned long dom0_start = -1L;
56 unsigned long dom0_size = 512*1024*1024;
57 unsigned long dom0_align = 64*1024*1024;
59 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
60 static unsigned int dom0_max_vcpus = 1;
61 integer_param("dom0_max_vcpus", dom0_max_vcpus);
63 extern unsigned long running_on_sim;
65 extern char dom0_command_line[];
67 /* FIXME: where should these declarations go? */
68 extern void serial_input_init(void);
69 static void init_switch_stack(struct vcpu *v);
70 extern void vmx_do_launch(struct vcpu *);
72 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
73 extern struct vcpu *ia64_switch_to (struct vcpu *next_task);
75 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
76 This is a Xen virtual address. */
77 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
78 DEFINE_PER_CPU(int *, current_psr_ic_addr);
80 #include <xen/sched-if.h>
82 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
83 {
84 int cpu = smp_processor_id();
85 int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
86 int last_processor = vcpu->arch.last_processor;
88 if (is_idle_domain(vcpu->domain))
89 return;
91 vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
92 vcpu->arch.last_processor = cpu;
94 if ((last_vcpu_id != vcpu->vcpu_id &&
95 last_vcpu_id != INVALID_VCPU_ID) ||
96 (last_vcpu_id == vcpu->vcpu_id &&
97 last_processor != cpu &&
98 last_processor != INVALID_PROCESSOR)) {
100 // If the vTLB implementation is changed,
101 // the following must be updated as well.
102 if (VMX_DOMAIN(vcpu)) {
103 // Currently the vTLB for a VT-i domain is per vcpu,
104 // so no flushing is needed.
105 } else {
106 vhpt_flush();
107 }
108 local_flush_tlb_all();
109 }
110 }
112 void schedule_tail(struct vcpu *prev)
113 {
114 extern char ia64_ivt;
115 context_saved(prev);
117 if (VMX_DOMAIN(current)) {
118 vmx_do_launch(current);
119 } else {
120 ia64_set_iva(&ia64_ivt);
121 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
122 VHPT_ENABLED);
123 load_region_regs(current);
124 vcpu_load_kernel_regs(current);
125 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
126 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
127 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
128 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
129 }
130 flush_vtlb_for_context_switch(current);
131 }
133 void context_switch(struct vcpu *prev, struct vcpu *next)
134 {
135 uint64_t spsr;
136 uint64_t pta;
138 local_irq_save(spsr);
140 __ia64_save_fpu(prev->arch._thread.fph);
141 __ia64_load_fpu(next->arch._thread.fph);
142 if (VMX_DOMAIN(prev))
143 vmx_save_state(prev);
144 if (VMX_DOMAIN(next))
145 vmx_load_state(next);
146 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
147 prev = ia64_switch_to(next);
149 /* Note: ia64_switch_to does not return here at vcpu initialization. */
151 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
153 // leave this debug for now: it acts as a heartbeat when more than
154 // one domain is active
155 {
156 static long cnt[16] = { 50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50};
157 static int i = 100;
158 int id = ((struct vcpu *)current)->domain->domain_id & 0xf;
159 if (!cnt[id]--) { cnt[id] = 500000; printk("%x",id); }
160 if (!i--) { i = 1000000; printk("+"); }
161 }
163 if (VMX_DOMAIN(current)){
164 vmx_load_all_rr(current);
165 } else {
166 struct domain *nd;
167 extern char ia64_ivt;
169 ia64_set_iva(&ia64_ivt);
171 nd = current->domain;
172 if (!is_idle_domain(nd)) {
173 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
174 VHPT_ENABLED);
175 load_region_regs(current);
176 vcpu_load_kernel_regs(current);
177 vcpu_set_next_timer(current);
178 if (vcpu_timer_expired(current))
179 vcpu_pend_timer(current);
180 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
181 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
182 __ia64_per_cpu_var(current_psr_ic_addr) =
183 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
184 } else {
185 /* When switching to the idle domain, we only need to disable the
186 * VHPT walker. All accesses that happen within the idle context
187 * will then be handled by TR mappings and the identity mapping.
188 */
189 pta = ia64_get_pta();
190 ia64_set_pta(pta & ~VHPT_ENABLED);
191 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
192 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
193 }
194 }
195 flush_vtlb_for_context_switch(current);
196 local_irq_restore(spsr);
197 context_saved(prev);
198 }
200 void continue_running(struct vcpu *same)
201 {
202 /* nothing to do */
203 }
205 static void default_idle(void)
206 {
207 local_irq_disable();
208 if ( !softirq_pending(smp_processor_id()) )
209 safe_halt();
210 local_irq_enable();
211 }
213 static void continue_cpu_idle_loop(void)
214 {
215 for ( ; ; )
216 {
217 #ifdef IA64
218 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
219 #else
220 irq_stat[cpu].idle_timestamp = jiffies;
221 #endif
222 while ( !softirq_pending(smp_processor_id()) )
223 default_idle();
224 raise_softirq(SCHEDULE_SOFTIRQ);
225 do_softirq();
226 }
227 }
229 void startup_cpu_idle_loop(void)
230 {
231 /* Just some sanity to ensure that the scheduler is set up okay. */
232 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
233 raise_softirq(SCHEDULE_SOFTIRQ);
235 continue_cpu_idle_loop();
236 }
238 /* Compile-time check that get_order(sizeof(mapped_regs_t)) matches
239 * get_order_from_shift(XMAPPEDREGS_SHIFT).
240 */
241 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
242 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
243 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
244 #endif
246 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
247 {
248 struct vcpu *v;
249 struct thread_info *ti;
251 /* Keep idle vcpu0 statically allocated at compile time, because some
252 * code inherited from Linux still requires it in the early boot phase.
253 */
254 if (is_idle_domain(d) && !vcpu_id)
255 v = idle_vcpu[0];
256 else {
257 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
258 return NULL;
259 memset(v, 0, sizeof(*v));
261 ti = alloc_thread_info(v);
262 /* Clear thread_info to clear some important fields, like
263 * preempt_count
264 */
265 memset(ti, 0, sizeof(struct thread_info));
266 init_switch_stack(v);
267 }
269 if (!is_idle_domain(d)) {
270 if (!d->arch.is_vti) {
271 int order;
272 int i;
274 /* Create privregs page only if not VTi. */
275 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
276 v->arch.privregs = alloc_xenheap_pages(order);
277 BUG_ON(v->arch.privregs == NULL);
278 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
279 for (i = 0; i < (1 << order); i++)
280 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
281 i, d, XENSHARE_writable);
282 }
284 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
285 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
286 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
287 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
289 /* Is this correct?
290 It depends on how the domain uses RIDs.
292 A domain may share RIDs among its processors (e.g. when it has a
293 global VHPT). In that case we should also share RIDs among
294 vcpus and the RID range should be the same.
296 However, a domain may have per-cpu RID allocation. In that
297 case we do not want to share RIDs among vcpus, but we may
298 still do so if two vcpus are on the same cpu... */
300 v->arch.starting_rid = d->arch.starting_rid;
301 v->arch.ending_rid = d->arch.ending_rid;
302 v->arch.breakimm = d->arch.breakimm;
303 v->arch.last_processor = INVALID_PROCESSOR;
304 }
306 return v;
307 }
309 void relinquish_vcpu_resources(struct vcpu *v)
310 {
311 if (v->arch.privregs != NULL) {
312 free_xenheap_pages(v->arch.privregs,
313 get_order_from_shift(XMAPPEDREGS_SHIFT));
314 v->arch.privregs = NULL;
315 }
316 }
318 void free_vcpu_struct(struct vcpu *v)
319 {
320 if (VMX_DOMAIN(v))
321 vmx_relinquish_vcpu_resources(v);
322 else
323 relinquish_vcpu_resources(v);
325 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
326 }
328 static void init_switch_stack(struct vcpu *v)
329 {
330 struct pt_regs *regs = vcpu_regs (v);
331 struct switch_stack *sw = (struct switch_stack *) regs - 1;
332 extern void ia64_ret_from_clone;
334 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
335 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
336 sw->b0 = (unsigned long) &ia64_ret_from_clone;
337 sw->ar_fpsr = FPSR_DEFAULT;
338 v->arch._thread.ksp = (unsigned long) sw - 16;
339 // Stay on the kernel stack because we may get interrupts!
340 // ia64_ret_from_clone switches to the user stack.
341 v->arch._thread.on_ustack = 0;
342 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
343 }
345 int arch_domain_create(struct domain *d)
346 {
347 int i;
349 // the following will eventually need to be negotiated dynamically
350 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
351 d->arch.breakimm = 0x1000;
352 for (i = 0; i < NR_CPUS; i++) {
353 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
354 }
356 if (is_idle_domain(d))
357 return 0;
359 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
360 if (d->shared_info == NULL)
361 goto fail_nomem;
362 memset(d->shared_info, 0, XSI_SIZE);
363 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
364 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
365 d, XENSHARE_writable);
367 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
368 /* We may also need an emulation RID for region 4, though a guest is
369 * unlikely to issue uncacheable accesses in metaphysical mode. Still,
370 * keeping that information here may be saner.
371 */
372 if (!allocate_rid_range(d,0))
373 goto fail_nomem;
375 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
377 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
378 goto fail_nomem;
380 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
381 RANGESETF_prettyprint_hex);
383 printf ("arch_domain_create: domain=%p\n", d);
384 return 0;
386 fail_nomem:
387 if (d->arch.mm.pgd != NULL)
388 pgd_free(d->arch.mm.pgd);
389 if (d->shared_info != NULL)
390 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
391 return -ENOMEM;
392 }
394 void arch_domain_destroy(struct domain *d)
395 {
396 BUG_ON(d->arch.mm.pgd != NULL);
397 if (d->shared_info != NULL)
398 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
399 if (d->arch.shadow_bitmap != NULL)
400 xfree(d->arch.shadow_bitmap);
402 /* Clear vTLB for the next domain. */
403 domain_flush_tlb_vhpt(d);
405 deallocate_rid_range(d);
406 }
408 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
409 {
410 int i;
411 struct vcpu_extra_regs *er = &c->extra_regs;
413 c->user_regs = *vcpu_regs (v);
414 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
416 /* Fill extra regs. */
417 for (i = 0; i < 8; i++) {
418 er->itrs[i].pte = v->arch.itrs[i].pte.val;
419 er->itrs[i].itir = v->arch.itrs[i].itir;
420 er->itrs[i].vadr = v->arch.itrs[i].vadr;
421 er->itrs[i].rid = v->arch.itrs[i].rid;
422 }
423 for (i = 0; i < 8; i++) {
424 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
425 er->dtrs[i].itir = v->arch.dtrs[i].itir;
426 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
427 er->dtrs[i].rid = v->arch.dtrs[i].rid;
428 }
429 er->event_callback_ip = v->arch.event_callback_ip;
430 er->dcr = v->arch.dcr;
431 er->iva = v->arch.iva;
432 }
434 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
435 {
436 struct pt_regs *regs = vcpu_regs (v);
437 struct domain *d = v->domain;
439 *regs = c->user_regs;
441 if (!d->arch.is_vti) {
442 /* domain runs at PL2/3 */
443 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
444 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
445 }
447 if (c->flags & VGCF_EXTRA_REGS) {
448 int i;
449 struct vcpu_extra_regs *er = &c->extra_regs;
451 for (i = 0; i < 8; i++) {
452 vcpu_set_itr(v, i, er->itrs[i].pte,
453 er->itrs[i].itir,
454 er->itrs[i].vadr,
455 er->itrs[i].rid);
456 }
457 for (i = 0; i < 8; i++) {
458 vcpu_set_dtr(v, i,
459 er->dtrs[i].pte,
460 er->dtrs[i].itir,
461 er->dtrs[i].vadr,
462 er->dtrs[i].rid);
463 }
464 v->arch.event_callback_ip = er->event_callback_ip;
465 v->arch.dcr = er->dcr;
466 v->arch.iva = er->iva;
467 }
469 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
470 return 0;
471 if (d->arch.is_vti)
472 vmx_final_setup_guest(v);
474 /* This overrides some registers. */
475 vcpu_init_regs(v);
477 /* Don't redo final setup */
478 set_bit(_VCPUF_initialised, &v->vcpu_flags);
479 return 0;
480 }
482 static void relinquish_memory(struct domain *d, struct list_head *list)
483 {
484 struct list_head *ent;
485 struct page_info *page;
486 #ifndef __ia64__
487 unsigned long x, y;
488 #endif
490 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
491 spin_lock_recursive(&d->page_alloc_lock);
492 ent = list->next;
493 while ( ent != list )
494 {
495 page = list_entry(ent, struct page_info, list);
496 /* Grab a reference to the page so it won't disappear from under us. */
497 if ( unlikely(!get_page(page, d)) )
498 {
499 /* Couldn't get a reference -- someone is freeing this page. */
500 ent = ent->next;
501 continue;
502 }
504 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
505 put_page_and_type(page);
507 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
508 put_page(page);
510 #ifndef __ia64__
511 /*
512 * Forcibly invalidate base page tables at this point to break circular
513 * 'linear page table' references. This is okay because MMU structures
514 * are not shared across domains and this domain is now dead. Thus base
515 * tables are not in use so a non-zero count means circular reference.
516 */
517 y = page->u.inuse.type_info;
518 for ( ; ; )
519 {
520 x = y;
521 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
522 (PGT_base_page_table|PGT_validated)) )
523 break;
525 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
526 if ( likely(y == x) )
527 {
528 free_page_type(page, PGT_base_page_table);
529 break;
530 }
531 }
532 #endif
534 /* Follow the list chain and /then/ potentially free the page. */
535 ent = ent->next;
536 #ifdef CONFIG_XEN_IA64_DOM0_VP
537 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
538 #endif
539 put_page(page);
540 }
542 spin_unlock_recursive(&d->page_alloc_lock);
543 }
545 void domain_relinquish_resources(struct domain *d)
546 {
547 /* Relinquish every page of memory. */
549 // Release pages by traversing d->arch.mm.
550 relinquish_mm(d);
552 relinquish_memory(d, &d->xenpage_list);
553 relinquish_memory(d, &d->page_list);
555 if (d->arch.is_vti && d->arch.sal_data)
556 xfree(d->arch.sal_data);
557 }
559 void build_physmap_table(struct domain *d)
560 {
561 struct list_head *list_ent = d->page_list.next;
562 unsigned long mfn, i = 0;
564 while(list_ent != &d->page_list) {
565 mfn = page_to_mfn(list_entry(
566 list_ent, struct page_info, list));
567 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
569 i++;
570 list_ent = mfn_to_page(mfn)->list.next;
571 }
572 }
574 unsigned long
575 domain_set_shared_info_va (unsigned long va)
576 {
577 struct vcpu *v = current;
578 struct domain *d = v->domain;
579 struct vcpu *v1;
581 /* Check virtual address:
582 must belong to region 7,
583 must be 64Kb aligned,
584 must not be within Xen virtual space. */
585 if ((va >> 61) != 7
586 || (va & 0xffffUL) != 0
587 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
588 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
590 /* Note: this doesn't work well if other cpus are already running.
591 However this is part of the spec :-) */
592 printf ("Domain set shared_info_va to 0x%016lx\n", va);
593 d->arch.shared_info_va = va;
595 for_each_vcpu (d, v1) {
596 VCPU(v1, interrupt_mask_addr) =
597 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
598 }
600 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
602 /* Remap the shared pages. */
603 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
605 return 0;
606 }
608 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
609 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
611 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
612 {
613 unsigned int op = sc->op;
614 int rc = 0;
615 int i;
616 //struct vcpu *v;
618 if (unlikely(d == current->domain)) {
619 DPRINTK("Don't try to do a shadow op on yourself!\n");
620 return -EINVAL;
621 }
623 domain_pause(d);
625 switch (op)
626 {
627 case DOM0_SHADOW_CONTROL_OP_OFF:
628 if (shadow_mode_enabled (d)) {
629 u64 *bm = d->arch.shadow_bitmap;
631 /* Flush vhpt and tlb to restore dirty bit usage. */
632 domain_flush_tlb_vhpt(d);
634 /* Free bitmap. */
635 d->arch.shadow_bitmap_size = 0;
636 d->arch.shadow_bitmap = NULL;
637 xfree(bm);
638 }
639 break;
641 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
642 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
643 rc = -EINVAL;
644 break;
646 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
647 if (shadow_mode_enabled(d)) {
648 rc = -EINVAL;
649 break;
650 }
652 atomic64_set(&d->arch.shadow_fault_count, 0);
653 atomic64_set(&d->arch.shadow_dirty_count, 0);
655 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
656 ~(BITS_PER_LONG-1);
657 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
658 d->arch.shadow_bitmap_size / BITS_PER_LONG);
659 if (d->arch.shadow_bitmap == NULL) {
660 d->arch.shadow_bitmap_size = 0;
661 rc = -ENOMEM;
662 }
663 else {
664 memset(d->arch.shadow_bitmap, 0,
665 d->arch.shadow_bitmap_size / 8);
667 /* Flush vhpt and tlb to enable dirty bit
668 virtualization. */
669 domain_flush_tlb_vhpt(d);
670 }
671 break;
673 case DOM0_SHADOW_CONTROL_OP_FLUSH:
674 atomic64_set(&d->arch.shadow_fault_count, 0);
675 atomic64_set(&d->arch.shadow_dirty_count, 0);
676 break;
678 case DOM0_SHADOW_CONTROL_OP_CLEAN:
679 {
680 int nbr_longs;
682 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
683 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
685 atomic64_set(&d->arch.shadow_fault_count, 0);
686 atomic64_set(&d->arch.shadow_dirty_count, 0);
688 if (guest_handle_is_null(sc->dirty_bitmap) ||
689 (d->arch.shadow_bitmap == NULL)) {
690 rc = -EINVAL;
691 break;
692 }
694 if (sc->pages > d->arch.shadow_bitmap_size)
695 sc->pages = d->arch.shadow_bitmap_size;
697 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
699 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
700 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
701 SHADOW_COPY_CHUNK : nbr_longs - i;
703 if (copy_to_guest_offset(sc->dirty_bitmap, i,
704 d->arch.shadow_bitmap + i,
705 size)) {
706 rc = -EFAULT;
707 break;
708 }
710 memset(d->arch.shadow_bitmap + i,
711 0, size * sizeof(unsigned long));
712 }
714 break;
715 }
717 case DOM0_SHADOW_CONTROL_OP_PEEK:
718 {
719 unsigned long size;
721 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
722 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
724 if (guest_handle_is_null(sc->dirty_bitmap) ||
725 (d->arch.shadow_bitmap == NULL)) {
726 rc = -EINVAL;
727 break;
728 }
730 if (sc->pages > d->arch.shadow_bitmap_size)
731 sc->pages = d->arch.shadow_bitmap_size;
733 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
734 if (copy_to_guest(sc->dirty_bitmap,
735 d->arch.shadow_bitmap, size)) {
736 rc = -EFAULT;
737 break;
738 }
739 break;
740 }
741 default:
742 rc = -EINVAL;
743 break;
744 }
746 domain_unpause(d);
748 return rc;
749 }
751 // remove following line if not privifying in memory
752 //#define HAVE_PRIVIFY_MEMORY
753 #ifndef HAVE_PRIVIFY_MEMORY
754 #define privify_memory(x,y) do {} while(0)
755 #endif
757 // see arch/x86/xxx/domain_build.c
758 int elf_sanity_check(Elf_Ehdr *ehdr)
759 {
760 if (!(IS_ELF(*ehdr)))
761 {
762 printk("DOM0 image is not a Xen-compatible Elf image.\n");
763 return 0;
764 }
765 return 1;
766 }
768 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
769 {
770 char *elfbase = (char *) image_start;
771 Elf_Ehdr ehdr;
772 Elf_Phdr phdr;
773 int h, filesz, memsz;
774 unsigned long elfaddr, dom_mpaddr, dom_imva;
775 struct page_info *p;
777 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
778 for ( h = 0; h < ehdr.e_phnum; h++ ) {
779 memcpy(&phdr,
780 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
781 sizeof(Elf_Phdr));
782 if ((phdr.p_type != PT_LOAD))
783 continue;
785 filesz = phdr.p_filesz;
786 memsz = phdr.p_memsz;
787 elfaddr = (unsigned long) elfbase + phdr.p_offset;
788 dom_mpaddr = phdr.p_paddr;
790 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
791 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
792 if (d == dom0) {
793 if (dom_mpaddr+memsz>dom0_size)
794 panic("Dom0 doesn't fit in memory space!\n");
795 dom_imva = __va_ul(dom_mpaddr + dom0_start);
796 memcpy((void *)dom_imva, (void *)elfaddr, filesz);
797 if (memsz > filesz)
798 memset((void *)dom_imva+filesz, 0,
799 memsz-filesz);
800 //FIXME: This test for code seems to find a lot more than objdump -x does
801 if (phdr.p_flags & PF_X) {
802 privify_memory(dom_imva,filesz);
803 flush_icache_range (dom_imva, dom_imva+filesz);
804 }
805 }
806 else
807 #endif
808 while (memsz > 0) {
809 p = assign_new_domain_page(d,dom_mpaddr);
810 BUG_ON (unlikely(p == NULL));
811 dom_imva = __va_ul(page_to_maddr(p));
812 if (filesz > 0) {
813 if (filesz >= PAGE_SIZE)
814 memcpy((void *) dom_imva,
815 (void *) elfaddr,
816 PAGE_SIZE);
817 else {
818 // copy partial page
819 memcpy((void *) dom_imva,
820 (void *) elfaddr, filesz);
821 // zero the rest of page
822 memset((void *) dom_imva+filesz, 0,
823 PAGE_SIZE-filesz);
824 }
825 //FIXME: This test for code seems to find a lot more than objdump -x does
826 if (phdr.p_flags & PF_X) {
827 privify_memory(dom_imva,PAGE_SIZE);
828 flush_icache_range(dom_imva,
829 dom_imva+PAGE_SIZE);
830 }
831 }
832 else if (memsz > 0) {
833 /* always zero out entire page */
834 memset((void *) dom_imva, 0, PAGE_SIZE);
835 }
836 memsz -= PAGE_SIZE;
837 filesz -= PAGE_SIZE;
838 elfaddr += PAGE_SIZE;
839 dom_mpaddr += PAGE_SIZE;
840 }
841 }
842 }
844 void alloc_dom0(void)
845 {
846 /* Check dom0 size. */
847 if (dom0_size < 4 * 1024 * 1024) {
848 panic("dom0_mem is too small, boot aborted"
849 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
850 }
852 /* Check dom0 align. */
853 if ((dom0_align - 1) & dom0_align) { /* not a power of two */
854 panic("dom0_align (%lx) must be power of two, boot aborted"
855 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
856 dom0_align);
857 }
858 if (dom0_align < PAGE_SIZE) {
859 panic("dom0_align must be >= %ld, boot aborted"
860 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
861 PAGE_SIZE);
862 }
863 if (dom0_size % dom0_align) {
864 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
865 printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
866 dom0_size,dom0_align);
867 }
869 if (running_on_sim) {
870 dom0_size = 128*1024*1024; //FIXME: Should be configurable
871 }
872 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
873 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
875 /* FIXME: The first chunk (say 256M) should always be assigned to
876 * Dom0, since Dom0's physical addresses equal machine addresses for
877 * DMA purposes. Some older Linux versions, such as 2.4, assume that
878 * physical memory exists in the second 64M of the address space.
879 */
880 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
881 dom0_start <<= PAGE_SHIFT;
882 if (!dom0_start) {
883 panic("alloc_dom0: can't allocate contiguous memory size=%lu\n",
884 dom0_size);
885 }
886 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
887 #else
888 // no need to allocate pages for now
889 // pages are allocated by map_new_domain_page() via loaddomainelfimage()
890 dom0_start = 0;
891 #endif
893 }
896 /*
897 * Domain 0 has direct access to all devices. However, the main point
898 * of this stub is to allow alloc_dom_mem to handle requests with
899 * order > 0. Dom0 needs that capability in order to allocate memory
900 * for other domains.
901 */
902 static void physdev_init_dom0(struct domain *d)
903 {
904 if (iomem_permit_access(d, 0UL, ~0UL))
905 BUG();
906 if (irqs_permit_access(d, 0, NR_IRQS-1))
907 BUG();
908 if (ioports_permit_access(d, 0, 0xffff))
909 BUG();
910 }
912 int construct_dom0(struct domain *d,
913 unsigned long image_start, unsigned long image_len,
914 unsigned long initrd_start, unsigned long initrd_len,
915 char *cmdline)
916 {
917 int i, rc;
918 unsigned long alloc_start, alloc_end;
919 start_info_t *si;
920 struct vcpu *v = d->vcpu[0];
921 unsigned long max_pages;
923 struct domain_setup_info dsi;
924 unsigned long p_start;
925 unsigned long pkern_start;
926 unsigned long pkern_entry;
927 unsigned long pkern_end;
928 unsigned long pinitrd_start = 0;
929 unsigned long pstart_info;
930 struct page_info *start_info_page;
931 unsigned long bp_mpa;
932 struct ia64_boot_param *bp;
934 #ifdef VALIDATE_VT
935 unsigned int vmx_dom0 = 0;
936 unsigned long mfn;
937 struct page_info *page = NULL;
938 #endif
940 //printf("construct_dom0: starting\n");
942 /* Sanity! */
943 BUG_ON(d != dom0);
944 BUG_ON(d->vcpu[0] == NULL);
945 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
947 memset(&dsi, 0, sizeof(struct domain_setup_info));
949 printk("*** LOADING DOMAIN 0 ***\n");
951 alloc_start = dom0_start;
952 alloc_end = dom0_start + dom0_size;
953 max_pages = dom0_size / PAGE_SIZE;
954 d->max_pages = max_pages;
955 #ifndef CONFIG_XEN_IA64_DOM0_VP
956 d->tot_pages = d->max_pages;
957 #else
958 d->tot_pages = 0;
959 #endif
960 dsi.image_addr = (unsigned long)image_start;
961 dsi.image_len = image_len;
962 rc = parseelfimage(&dsi);
963 if ( rc != 0 )
964 return rc;
966 #ifdef VALIDATE_VT
967 /* Temp workaround */
968 if (running_on_sim)
969 dsi.xen_section_string = (char *)1;
971 /* Check whether dom0 is a VT-i domain */
972 if ((!vmx_enabled) && !dsi.xen_section_string) {
973 printk("Lack of hardware support for unmodified vmx dom0\n");
974 panic("");
975 }
977 if (vmx_enabled && !dsi.xen_section_string) {
978 printk("Dom0 is vmx domain!\n");
979 vmx_dom0 = 1;
980 }
981 #endif
983 p_start = dsi.v_start;
984 pkern_start = dsi.v_kernstart;
985 pkern_end = dsi.v_kernend;
986 pkern_entry = dsi.v_kernentry;
988 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
990 if ( (p_start & (PAGE_SIZE-1)) != 0 )
991 {
992 printk("Initial guest OS must load to a page boundary.\n");
993 return -EINVAL;
994 }
996 pstart_info = PAGE_ALIGN(pkern_end);
997 if(initrd_start && initrd_len){
998 unsigned long offset;
1000 pinitrd_start= (dom0_start + dom0_size) -
1001 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
1002 if (pinitrd_start <= pstart_info)
1003 panic("%s: not enough memory assigned to dom0", __func__);
1005 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1006 struct page_info *p;
1007 p = assign_new_domain_page(d, pinitrd_start + offset);
1008 if (p == NULL)
1009 panic("%s: can't allocate page for initrd image", __func__);
1010 if (initrd_len < offset + PAGE_SIZE)
1011 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1012 initrd_len - offset);
1013 else
1014 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1015 }
1016 }
1018 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1019 " Kernel image: %lx->%lx\n"
1020 " Entry address: %lx\n"
1021 " Init. ramdisk: %lx len %lx\n"
1022 " Start info.: %lx->%lx\n",
1023 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1024 pstart_info, pstart_info + PAGE_SIZE);
1026 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1027 {
1028 printk("Initial guest OS requires too much space\n"
1029 "(%luMB is greater than %luMB limit)\n",
1030 (pkern_end-pkern_start)>>20,
1031 (max_pages <<PAGE_SHIFT)>>20);
1032 return -ENOMEM;
1033 }
1035 // if high 3 bits of pkern start are non-zero, error
1037 // if pkern end is after end of metaphysical memory, error
1038 // (we should be able to deal with this... later)
1040 /* Mask all upcalls... */
1041 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1042 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1044 if (dom0_max_vcpus == 0)
1045 dom0_max_vcpus = MAX_VIRT_CPUS;
1046 if (dom0_max_vcpus > num_online_cpus())
1047 dom0_max_vcpus = num_online_cpus();
1048 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1049 dom0_max_vcpus = MAX_VIRT_CPUS;
1051 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1052 for ( i = 1; i < dom0_max_vcpus; i++ )
1053 if (alloc_vcpu(d, i, i) == NULL)
1054 printf ("Cannot allocate dom0 vcpu %d\n", i);
1056 #if defined(VALIDATE_VT) && !defined(CONFIG_XEN_IA64_DOM0_VP)
1057 /* Construct a frame-allocation list for the initial domain, since these
1058 * pages are allocated by the boot allocator and their pfns are not set properly.
1059 */
1060 for ( mfn = (alloc_start>>PAGE_SHIFT);
1061 mfn < (alloc_end>>PAGE_SHIFT);
1062 mfn++ )
1063 {
1064 page = mfn_to_page(mfn);
1065 page_set_owner(page, d);
1066 page->u.inuse.type_info = 0;
1067 page->count_info = PGC_allocated | 1;
1068 list_add_tail(&page->list, &d->page_list);
1070 /* Construct 1:1 mapping */
1071 set_gpfn_from_mfn(mfn, mfn);
1072 }
1073 #endif
1075 /* Copy the OS image. */
1076 loaddomainelfimage(d,image_start);
1078 /* Copy the initial ramdisk. */
1079 //if ( initrd_len != 0 )
1080 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1082 /* Set up start info area. */
1083 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1084 start_info_page = assign_new_domain_page(d, pstart_info);
1085 if (start_info_page == NULL)
1086 panic("can't allocate start info page");
1087 si = page_to_virt(start_info_page);
1088 memset(si, 0, PAGE_SIZE);
1089 sprintf(si->magic, "xen-%i.%i-ia64",
1090 xen_major_version(), xen_minor_version());
1091 si->nr_pages = max_pages;
1092 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1094 printk("Dom0: 0x%lx\n", (u64)dom0);
1096 #ifdef VALIDATE_VT
1097 /* VMX-specific construction for Dom0, if the hardware supports VMX
1098 * and Dom0 is an unmodified image.
1099 */
1100 if (vmx_dom0)
1101 vmx_final_setup_guest(v);
1102 #endif
1104 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1106 /* Build firmware.
1107 Note: the Linux kernel reserves the memory used by start_info, so
1108 there is no need to remove it from the MDT. */
1109 bp_mpa = pstart_info + sizeof(struct start_info);
1110 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1112 /* Fill boot param. */
1113 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1114 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1116 bp = (struct ia64_boot_param *)(si + 1);
1117 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1119 /* We assume console has reached the last line! */
1120 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1121 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1122 bp->console_info.orig_x = 0;
1123 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1124 0 : bp->console_info.num_rows - 1;
1126 bp->initrd_start = (dom0_start+dom0_size) -
1127 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1128 bp->initrd_size = ia64_boot_param->initrd_size;
1130 vcpu_init_regs (v);
1132 vcpu_regs(v)->r28 = bp_mpa;
1134 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1135 pkern_entry += dom0_start;
1136 #endif
1137 vcpu_regs (v)->cr_iip = pkern_entry;
1139 physdev_init_dom0(d);
1141 // FIXME: Hack for keyboard input
1142 //serial_input_init();
1144 return 0;
1145 }
1147 void machine_restart(char * __unused)
1148 {
1149 console_start_sync();
1150 if (running_on_sim)
1151 printf ("machine_restart called. spinning...\n");
1152 else
1153 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1154 while(1);
1155 }
1157 void machine_halt(void)
1158 {
1159 console_start_sync();
1160 if (running_on_sim)
1161 printf ("machine_halt called. spinning...\n");
1162 else
1163 (*efi.reset_system)(EFI_RESET_SHUTDOWN,0,0,NULL);
1164 while(1);
1165 }
1167 void sync_vcpu_execstate(struct vcpu *v)
1168 {
1169 // __ia64_save_fpu(v->arch._thread.fph);
1170 // if (VMX_DOMAIN(v))
1171 // vmx_save_state(v);
1172 // FIXME SMP: Anything else needed here for SMP?
1173 }
1175 static void parse_dom0_mem(char *s)
1176 {
1177 dom0_size = parse_size_and_unit(s);
1178 }
1179 custom_param("dom0_mem", parse_dom0_mem);
1182 static void parse_dom0_align(char *s)
1183 {
1184 dom0_align = parse_size_and_unit(s);
1185 }
1186 custom_param("dom0_align", parse_dom0_align);