ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 11707:a3a079af0e92

[IA64] avoid long time interrupt masking.

flush_vtlb_for_context_switch() can be executed with interrupts enabled.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author awilliam@xenbuild.aw
date Mon Oct 02 21:32:46 2006 -0600 (2006-10-02)
parents 5c97ef4c7147
children 3f28ffed6fff
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
49 #include <xen/guest_access.h>
51 unsigned long dom0_size = 512*1024*1024;
52 unsigned long dom0_align = 64*1024*1024;
54 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
55 static unsigned int dom0_max_vcpus = 1;
56 integer_param("dom0_max_vcpus", dom0_max_vcpus);
58 extern unsigned long running_on_sim;
60 extern char dom0_command_line[];
62 /* FIXME: where should these declarations go? */
63 extern void serial_input_init(void);
64 static void init_switch_stack(struct vcpu *v);
65 extern void vmx_do_launch(struct vcpu *);
67 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
68 extern struct vcpu *ia64_switch_to (struct vcpu *next_task);
70 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
71 This is a Xen virtual address. */
72 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
73 DEFINE_PER_CPU(int *, current_psr_ic_addr);
75 #include <xen/sched-if.h>
77 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
78 {
79 int cpu = smp_processor_id();
80 int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
81 int last_processor = vcpu->arch.last_processor;
83 if (is_idle_domain(vcpu->domain))
84 return;
86 vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
87 vcpu->arch.last_processor = cpu;
89 if ((last_vcpu_id != vcpu->vcpu_id &&
90 last_vcpu_id != INVALID_VCPU_ID) ||
91 (last_vcpu_id == vcpu->vcpu_id &&
92 last_processor != cpu &&
93 last_processor != INVALID_PROCESSOR)) {
95 // If the vTLB implementation is changed,
96 // the following must be updated as well.
97 if (VMX_DOMAIN(vcpu)) {
98 // Currently the vTLB for a VT-i domain is per vcpu,
99 // so no flushing is needed.
100 } else {
101 vhpt_flush();
102 }
103 local_flush_tlb_all();
104 }
105 }
107 void schedule_tail(struct vcpu *prev)
108 {
109 extern char ia64_ivt;
110 context_saved(prev);
112 if (VMX_DOMAIN(current)) {
113 vmx_do_launch(current);
114 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
115 current->processor);
116 } else {
117 ia64_set_iva(&ia64_ivt);
118 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
119 VHPT_ENABLED);
120 load_region_regs(current);
121 vcpu_load_kernel_regs(current);
122 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
123 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
124 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
125 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
126 migrate_timer(&current->arch.hlt_timer, current->processor);
127 }
128 flush_vtlb_for_context_switch(current);
129 }
131 void context_switch(struct vcpu *prev, struct vcpu *next)
132 {
133 uint64_t spsr;
134 uint64_t pta;
136 local_irq_save(spsr);
138 __ia64_save_fpu(prev->arch._thread.fph);
139 __ia64_load_fpu(next->arch._thread.fph);
140 if (VMX_DOMAIN(prev)) {
141 vmx_save_state(prev);
142 if (!VMX_DOMAIN(next)) {
143 /* VMX domains can change the physical cr.dcr.
144 * Restore default to prevent leakage. */
145 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
146 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
147 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
148 }
149 }
150 if (VMX_DOMAIN(next))
151 vmx_load_state(next);
152 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
153 prev = ia64_switch_to(next);
155 /* Note: ia64_switch_to does not return here at vcpu initialization. */
157 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
159 if (VMX_DOMAIN(current)){
160 vmx_load_all_rr(current);
161 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
162 current->processor);
163 } else {
164 struct domain *nd;
165 extern char ia64_ivt;
167 ia64_set_iva(&ia64_ivt);
169 nd = current->domain;
170 if (!is_idle_domain(nd)) {
171 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
172 VHPT_ENABLED);
173 load_region_regs(current);
174 vcpu_load_kernel_regs(current);
175 vcpu_set_next_timer(current);
176 if (vcpu_timer_expired(current))
177 vcpu_pend_timer(current);
178 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
179 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
180 __ia64_per_cpu_var(current_psr_ic_addr) =
181 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
182 } else {
183 /* When switching to the idle domain, we only need to disable the vhpt
184 * walker. All accesses that happen within the idle context will then
185 * be handled by TR mappings and the identity mapping.
186 */
187 pta = ia64_get_pta();
188 ia64_set_pta(pta & ~VHPT_ENABLED);
189 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
190 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
191 }
192 }
193 local_irq_restore(spsr);
194 flush_vtlb_for_context_switch(current);
195 context_saved(prev);
196 }
198 void continue_running(struct vcpu *same)
199 {
200 /* nothing to do */
201 }
203 static void default_idle(void)
204 {
205 local_irq_disable();
206 if ( !softirq_pending(smp_processor_id()) )
207 safe_halt();
208 local_irq_enable();
209 }
211 static void continue_cpu_idle_loop(void)
212 {
213 for ( ; ; )
214 {
215 #ifdef IA64
216 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
217 #else
218 irq_stat[cpu].idle_timestamp = jiffies;
219 #endif
220 while ( !softirq_pending(smp_processor_id()) )
221 default_idle();
222 raise_softirq(SCHEDULE_SOFTIRQ);
223 do_softirq();
224 }
225 }
227 void startup_cpu_idle_loop(void)
228 {
229 /* Just some sanity to ensure that the scheduler is set up okay. */
230 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
231 raise_softirq(SCHEDULE_SOFTIRQ);
233 continue_cpu_idle_loop();
234 }
236 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
237 * get_order_from_shift(XMAPPEDREGS_SHIFT)
238 */
239 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
240 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
241 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
242 #endif
244 void hlt_timer_fn(void *data)
245 {
246 struct vcpu *v = data;
247 vcpu_unblock(v);
248 }
250 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
251 {
252 struct vcpu *v;
253 struct thread_info *ti;
255 /* Still keep idle vcpu0 statically allocated at compile time, because
256 * some code from Linux still requires it in the early phase.
257 */
258 if (is_idle_domain(d) && !vcpu_id)
259 v = idle_vcpu[0];
260 else {
261 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
262 return NULL;
263 memset(v, 0, sizeof(*v));
265 ti = alloc_thread_info(v);
266 /* Clear thread_info to clear some important fields, like
267 * preempt_count
268 */
269 memset(ti, 0, sizeof(struct thread_info));
270 init_switch_stack(v);
271 }
273 if (!is_idle_domain(d)) {
274 if (!d->arch.is_vti) {
275 int order;
276 int i;
278 /* Create privregs page only if not VTi. */
279 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
280 v->arch.privregs = alloc_xenheap_pages(order);
281 BUG_ON(v->arch.privregs == NULL);
282 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
283 for (i = 0; i < (1 << order); i++)
284 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
285 i, d, XENSHARE_writable);
286 }
288 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
289 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
290 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
291 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
293 /* Is this correct?
294 It depends on the domain's rid usage.
296 A domain may share rids among its processors (eg when using a
297 global VHPT). In this case, we should also share rids
298 among vcpus and the rid ranges should be the same.
300 However, a domain may instead use per-cpu rid allocation. In
301 this case we don't want to share rids among vcpus, but we may
302 do so if two vcpus are on the same cpu... */
304 v->arch.starting_rid = d->arch.starting_rid;
305 v->arch.ending_rid = d->arch.ending_rid;
306 v->arch.breakimm = d->arch.breakimm;
307 v->arch.last_processor = INVALID_PROCESSOR;
308 }
309 if (!VMX_DOMAIN(v)){
310 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
311 first_cpu(cpu_online_map));
312 }
314 return v;
315 }
317 void relinquish_vcpu_resources(struct vcpu *v)
318 {
319 if (v->arch.privregs != NULL) {
320 free_xenheap_pages(v->arch.privregs,
321 get_order_from_shift(XMAPPEDREGS_SHIFT));
322 v->arch.privregs = NULL;
323 }
324 kill_timer(&v->arch.hlt_timer);
325 }
327 void free_vcpu_struct(struct vcpu *v)
328 {
329 if (VMX_DOMAIN(v))
330 vmx_relinquish_vcpu_resources(v);
331 else
332 relinquish_vcpu_resources(v);
334 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
335 }
337 static void init_switch_stack(struct vcpu *v)
338 {
339 struct pt_regs *regs = vcpu_regs (v);
340 struct switch_stack *sw = (struct switch_stack *) regs - 1;
341 extern void ia64_ret_from_clone;
343 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
344 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
345 sw->b0 = (unsigned long) &ia64_ret_from_clone;
346 sw->ar_fpsr = FPSR_DEFAULT;
347 v->arch._thread.ksp = (unsigned long) sw - 16;
348 // stay on kernel stack because may get interrupts!
349 // ia64_ret_from_clone switches to user stack
350 v->arch._thread.on_ustack = 0;
351 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
352 }
354 int arch_domain_create(struct domain *d)
355 {
356 int i;
358 // the following will eventually need to be negotiated dynamically
359 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
360 d->arch.breakimm = 0x1000;
361 for (i = 0; i < NR_CPUS; i++) {
362 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
363 }
365 if (is_idle_domain(d))
366 return 0;
368 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
369 if (d->shared_info == NULL)
370 goto fail_nomem;
371 memset(d->shared_info, 0, XSI_SIZE);
372 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
373 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
374 d, XENSHARE_writable);
376 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
377 /* We may also need an emulation rid for region4, though it's unlikely
378 * that a guest will issue uncacheable accesses in metaphysical mode. But
379 * keeping such info here may be more sane.
380 */
381 if (!allocate_rid_range(d,0))
382 goto fail_nomem;
384 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
386 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
387 goto fail_nomem;
389 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
390 RANGESETF_prettyprint_hex);
392 printf ("arch_domain_create: domain=%p\n", d);
393 return 0;
395 fail_nomem:
396 if (d->arch.mm.pgd != NULL)
397 pgd_free(d->arch.mm.pgd);
398 if (d->shared_info != NULL)
399 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
400 return -ENOMEM;
401 }
403 void arch_domain_destroy(struct domain *d)
404 {
405 BUG_ON(d->arch.mm.pgd != NULL);
406 if (d->shared_info != NULL)
407 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
408 if (d->arch.shadow_bitmap != NULL)
409 xfree(d->arch.shadow_bitmap);
411 /* Clear vTLB for the next domain. */
412 domain_flush_tlb_vhpt(d);
414 deallocate_rid_range(d);
415 }
417 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
418 {
419 int i;
420 struct vcpu_extra_regs *er = &c->extra_regs;
422 c->user_regs = *vcpu_regs (v);
423 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
425 /* Fill extra regs. */
426 for (i = 0; i < 8; i++) {
427 er->itrs[i].pte = v->arch.itrs[i].pte.val;
428 er->itrs[i].itir = v->arch.itrs[i].itir;
429 er->itrs[i].vadr = v->arch.itrs[i].vadr;
430 er->itrs[i].rid = v->arch.itrs[i].rid;
431 }
432 for (i = 0; i < 8; i++) {
433 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
434 er->dtrs[i].itir = v->arch.dtrs[i].itir;
435 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
436 er->dtrs[i].rid = v->arch.dtrs[i].rid;
437 }
438 er->event_callback_ip = v->arch.event_callback_ip;
439 er->dcr = v->arch.dcr;
440 er->iva = v->arch.iva;
441 }
443 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
444 {
445 struct pt_regs *regs = vcpu_regs (v);
446 struct domain *d = v->domain;
448 *regs = c->user_regs;
450 if (!d->arch.is_vti) {
451 /* domain runs at PL2/3 */
452 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
453 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
454 }
456 if (c->flags & VGCF_EXTRA_REGS) {
457 int i;
458 struct vcpu_extra_regs *er = &c->extra_regs;
460 for (i = 0; i < 8; i++) {
461 vcpu_set_itr(v, i, er->itrs[i].pte,
462 er->itrs[i].itir,
463 er->itrs[i].vadr,
464 er->itrs[i].rid);
465 }
466 for (i = 0; i < 8; i++) {
467 vcpu_set_dtr(v, i,
468 er->dtrs[i].pte,
469 er->dtrs[i].itir,
470 er->dtrs[i].vadr,
471 er->dtrs[i].rid);
472 }
473 v->arch.event_callback_ip = er->event_callback_ip;
474 v->arch.dcr = er->dcr;
475 v->arch.iva = er->iva;
476 }
478 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
479 return 0;
480 if (d->arch.is_vti)
481 vmx_final_setup_guest(v);
483 /* This overrides some registers. */
484 vcpu_init_regs(v);
486 /* Don't redo final setup */
487 set_bit(_VCPUF_initialised, &v->vcpu_flags);
488 return 0;
489 }
491 static void relinquish_memory(struct domain *d, struct list_head *list)
492 {
493 struct list_head *ent;
494 struct page_info *page;
495 #ifndef __ia64__
496 unsigned long x, y;
497 #endif
499 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
500 spin_lock_recursive(&d->page_alloc_lock);
501 ent = list->next;
502 while ( ent != list )
503 {
504 page = list_entry(ent, struct page_info, list);
505 /* Grab a reference to the page so it won't disappear from under us. */
506 if ( unlikely(!get_page(page, d)) )
507 {
508 /* Couldn't get a reference -- someone is freeing this page. */
509 ent = ent->next;
510 continue;
511 }
513 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
514 put_page_and_type(page);
516 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
517 put_page(page);
519 #ifndef __ia64__
520 /*
521 * Forcibly invalidate base page tables at this point to break circular
522 * 'linear page table' references. This is okay because MMU structures
523 * are not shared across domains and this domain is now dead. Thus base
524 * tables are not in use so a non-zero count means circular reference.
525 */
526 y = page->u.inuse.type_info;
527 for ( ; ; )
528 {
529 x = y;
530 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
531 (PGT_base_page_table|PGT_validated)) )
532 break;
534 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
535 if ( likely(y == x) )
536 {
537 free_page_type(page, PGT_base_page_table);
538 break;
539 }
540 }
541 #endif
543 /* Follow the list chain and /then/ potentially free the page. */
544 ent = ent->next;
545 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
546 put_page(page);
547 }
549 spin_unlock_recursive(&d->page_alloc_lock);
550 }
552 void domain_relinquish_resources(struct domain *d)
553 {
554 /* Relinquish every page of memory. */
556 // release pages by traversing d->arch.mm.
557 relinquish_mm(d);
559 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
560 vmx_relinquish_guest_resources(d);
562 relinquish_memory(d, &d->xenpage_list);
563 relinquish_memory(d, &d->page_list);
565 if (d->arch.is_vti && d->arch.sal_data)
566 xfree(d->arch.sal_data);
567 }
569 void build_physmap_table(struct domain *d)
570 {
571 struct list_head *list_ent = d->page_list.next;
572 unsigned long mfn, i = 0;
574 while(list_ent != &d->page_list) {
575 mfn = page_to_mfn(list_entry(
576 list_ent, struct page_info, list));
577 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
579 i++;
580 list_ent = mfn_to_page(mfn)->list.next;
581 }
582 }
584 unsigned long
585 domain_set_shared_info_va (unsigned long va)
586 {
587 struct vcpu *v = current;
588 struct domain *d = v->domain;
589 struct vcpu *v1;
591 /* Check virtual address:
592 must belong to region 7,
593 must be 64KB aligned,
594 must not be within Xen virtual space. */
595 if ((va >> 61) != 7
596 || (va & 0xffffUL) != 0
597 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
598 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
600 /* Note: this doesn't work well if other cpus are already running.
601 However this is part of the spec :-) */
602 printf ("Domain set shared_info_va to 0x%016lx\n", va);
603 d->arch.shared_info_va = va;
605 for_each_vcpu (d, v1) {
606 VCPU(v1, interrupt_mask_addr) =
607 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
608 }
610 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
612 /* Remap the shared pages. */
613 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
615 return 0;
616 }
618 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
619 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
621 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
622 {
623 unsigned int op = sc->op;
624 int rc = 0;
625 int i;
626 //struct vcpu *v;
628 if (unlikely(d == current->domain)) {
629 DPRINTK("Don't try to do a shadow op on yourself!\n");
630 return -EINVAL;
631 }
633 domain_pause(d);
635 switch (op)
636 {
637 case XEN_DOMCTL_SHADOW_OP_OFF:
638 if (shadow_mode_enabled (d)) {
639 u64 *bm = d->arch.shadow_bitmap;
641 /* Flush vhpt and tlb to restore dirty bit usage. */
642 domain_flush_tlb_vhpt(d);
644 /* Free bitmap. */
645 d->arch.shadow_bitmap_size = 0;
646 d->arch.shadow_bitmap = NULL;
647 xfree(bm);
648 }
649 break;
651 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
652 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
653 rc = -EINVAL;
654 break;
656 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
657 if (shadow_mode_enabled(d)) {
658 rc = -EINVAL;
659 break;
660 }
662 atomic64_set(&d->arch.shadow_fault_count, 0);
663 atomic64_set(&d->arch.shadow_dirty_count, 0);
665 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
666 ~(BITS_PER_LONG-1);
667 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
668 d->arch.shadow_bitmap_size / BITS_PER_LONG);
669 if (d->arch.shadow_bitmap == NULL) {
670 d->arch.shadow_bitmap_size = 0;
671 rc = -ENOMEM;
672 }
673 else {
674 memset(d->arch.shadow_bitmap, 0,
675 d->arch.shadow_bitmap_size / 8);
677 /* Flush vhpt and tlb to enable dirty bit
678 virtualization. */
679 domain_flush_tlb_vhpt(d);
680 }
681 break;
683 case XEN_DOMCTL_SHADOW_OP_CLEAN:
684 {
685 int nbr_longs;
687 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
688 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
690 atomic64_set(&d->arch.shadow_fault_count, 0);
691 atomic64_set(&d->arch.shadow_dirty_count, 0);
693 if (guest_handle_is_null(sc->dirty_bitmap) ||
694 (d->arch.shadow_bitmap == NULL)) {
695 rc = -EINVAL;
696 break;
697 }
699 if (sc->pages > d->arch.shadow_bitmap_size)
700 sc->pages = d->arch.shadow_bitmap_size;
702 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
704 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
705 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
706 SHADOW_COPY_CHUNK : nbr_longs - i;
708 if (copy_to_guest_offset(sc->dirty_bitmap, i,
709 d->arch.shadow_bitmap + i,
710 size)) {
711 rc = -EFAULT;
712 break;
713 }
715 memset(d->arch.shadow_bitmap + i,
716 0, size * sizeof(unsigned long));
717 }
719 break;
720 }
722 case XEN_DOMCTL_SHADOW_OP_PEEK:
723 {
724 unsigned long size;
726 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
727 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
729 if (guest_handle_is_null(sc->dirty_bitmap) ||
730 (d->arch.shadow_bitmap == NULL)) {
731 rc = -EINVAL;
732 break;
733 }
735 if (sc->pages > d->arch.shadow_bitmap_size)
736 sc->pages = d->arch.shadow_bitmap_size;
738 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
739 if (copy_to_guest(sc->dirty_bitmap,
740 d->arch.shadow_bitmap, size)) {
741 rc = -EFAULT;
742 break;
743 }
744 break;
745 }
746 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
747 sc->mb = 0;
748 break;
749 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
750 if (sc->mb > 0) {
751 BUG();
752 rc = -ENOMEM;
753 }
754 break;
755 default:
756 rc = -EINVAL;
757 break;
758 }
760 domain_unpause(d);
762 return rc;
763 }
765 // remove following line if not privifying in memory
766 //#define HAVE_PRIVIFY_MEMORY
767 #ifndef HAVE_PRIVIFY_MEMORY
768 #define privify_memory(x,y) do {} while(0)
769 #endif
771 // see arch/x86/xxx/domain_build.c
772 int elf_sanity_check(Elf_Ehdr *ehdr)
773 {
774 if (!(IS_ELF(*ehdr)))
775 {
776 printk("DOM0 image is not a Xen-compatible Elf image.\n");
777 return 0;
778 }
779 return 1;
780 }
782 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
783 {
784 char *elfbase = (char *) image_start;
785 Elf_Ehdr ehdr;
786 Elf_Phdr phdr;
787 int h, filesz, memsz;
788 unsigned long elfaddr, dom_mpaddr, dom_imva;
789 struct page_info *p;
791 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
792 for ( h = 0; h < ehdr.e_phnum; h++ ) {
793 memcpy(&phdr,
794 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
795 sizeof(Elf_Phdr));
796 if ((phdr.p_type != PT_LOAD))
797 continue;
799 filesz = phdr.p_filesz;
800 memsz = phdr.p_memsz;
801 elfaddr = (unsigned long) elfbase + phdr.p_offset;
802 dom_mpaddr = phdr.p_paddr;
804 while (memsz > 0) {
805 p = assign_new_domain_page(d,dom_mpaddr);
806 BUG_ON (unlikely(p == NULL));
807 dom_imva = __va_ul(page_to_maddr(p));
808 if (filesz > 0) {
809 if (filesz >= PAGE_SIZE)
810 memcpy((void *) dom_imva,
811 (void *) elfaddr,
812 PAGE_SIZE);
813 else {
814 // copy partial page
815 memcpy((void *) dom_imva,
816 (void *) elfaddr, filesz);
817 // zero the rest of page
818 memset((void *) dom_imva+filesz, 0,
819 PAGE_SIZE-filesz);
820 }
821 //FIXME: This test for code seems to find a lot more than objdump -x does
822 if (phdr.p_flags & PF_X) {
823 privify_memory(dom_imva,PAGE_SIZE);
824 flush_icache_range(dom_imva,
825 dom_imva+PAGE_SIZE);
826 }
827 }
828 else if (memsz > 0) {
829 /* always zero out entire page */
830 memset((void *) dom_imva, 0, PAGE_SIZE);
831 }
832 memsz -= PAGE_SIZE;
833 filesz -= PAGE_SIZE;
834 elfaddr += PAGE_SIZE;
835 dom_mpaddr += PAGE_SIZE;
836 }
837 }
838 }
840 void alloc_dom0(void)
841 {
842 /* Check dom0 size. */
843 if (dom0_size < 4 * 1024 * 1024) {
844 panic("dom0_mem is too small, boot aborted"
845 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
846 }
848 /* Check dom0 align. */
849 if ((dom0_align - 1) & dom0_align) { /* not a power of two */
850 panic("dom0_align (%lx) must be power of two, boot aborted"
851 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
852 dom0_align);
853 }
854 if (dom0_align < PAGE_SIZE) {
855 panic("dom0_align must be >= %ld, boot aborted"
856 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
857 PAGE_SIZE);
858 }
859 if (dom0_size % dom0_align) {
860 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
861 printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
862 dom0_size,dom0_align);
863 }
865 if (running_on_sim) {
866 dom0_size = 128*1024*1024; //FIXME: Should be configurable
867 }
869 /* no need to allocate pages for now
870 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
871 */
872 }
875 /*
876 * Domain 0 has direct access to all devices. However,
877 * the major point of this stub is to allow alloc_dom_mem
878 * to handle order > 0 requests. Dom0 requires that bit set to
879 * allocate memory for other domains.
880 */
881 static void physdev_init_dom0(struct domain *d)
882 {
883 if (iomem_permit_access(d, 0UL, ~0UL))
884 BUG();
885 if (irqs_permit_access(d, 0, NR_IRQS-1))
886 BUG();
887 if (ioports_permit_access(d, 0, 0xffff))
888 BUG();
889 }
891 int construct_dom0(struct domain *d,
892 unsigned long image_start, unsigned long image_len,
893 unsigned long initrd_start, unsigned long initrd_len,
894 char *cmdline)
895 {
896 int i, rc;
897 start_info_t *si;
898 dom0_vga_console_info_t *ci;
899 struct vcpu *v = d->vcpu[0];
900 unsigned long max_pages;
902 struct domain_setup_info dsi;
903 unsigned long p_start;
904 unsigned long pkern_start;
905 unsigned long pkern_entry;
906 unsigned long pkern_end;
907 unsigned long pinitrd_start = 0;
908 unsigned long pstart_info;
909 struct page_info *start_info_page;
910 unsigned long bp_mpa;
911 struct ia64_boot_param *bp;
913 #ifdef VALIDATE_VT
914 unsigned int vmx_dom0 = 0;
915 unsigned long mfn;
916 struct page_info *page = NULL;
917 #endif
919 //printf("construct_dom0: starting\n");
921 /* Sanity! */
922 BUG_ON(d != dom0);
923 BUG_ON(d->vcpu[0] == NULL);
924 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
926 memset(&dsi, 0, sizeof(struct domain_setup_info));
928 printk("*** LOADING DOMAIN 0 ***\n");
930 max_pages = dom0_size / PAGE_SIZE;
931 d->max_pages = max_pages;
932 d->tot_pages = 0;
933 dsi.image_addr = (unsigned long)image_start;
934 dsi.image_len = image_len;
935 rc = parseelfimage(&dsi);
936 if ( rc != 0 )
937 return rc;
939 #ifdef VALIDATE_VT
940 /* Temp workaround */
941 if (running_on_sim)
942 dsi.xen_section_string = (char *)1;
944 /* Check whether dom0 is vti domain */
945 if ((!vmx_enabled) && !dsi.xen_section_string) {
946 printk("Lack of hardware support for unmodified vmx dom0\n");
947 panic("");
948 }
950 if (vmx_enabled && !dsi.xen_section_string) {
951 printk("Dom0 is vmx domain!\n");
952 vmx_dom0 = 1;
953 }
954 #endif
956 p_start = dsi.v_start;
957 pkern_start = dsi.v_kernstart;
958 pkern_end = dsi.v_kernend;
959 pkern_entry = dsi.v_kernentry;
961 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
963 if ( (p_start & (PAGE_SIZE-1)) != 0 )
964 {
965 printk("Initial guest OS must load to a page boundary.\n");
966 return -EINVAL;
967 }
969 pstart_info = PAGE_ALIGN(pkern_end);
970 if(initrd_start && initrd_len){
971 unsigned long offset;
973 pinitrd_start= dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
974 if (pinitrd_start <= pstart_info)
975 panic("%s: not enough memory assigned to dom0", __func__);
977 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
978 struct page_info *p;
979 p = assign_new_domain_page(d, pinitrd_start + offset);
980 if (p == NULL)
981 panic("%s: can't allocate page for initrd image", __func__);
982 if (initrd_len < offset + PAGE_SIZE)
983 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
984 initrd_len - offset);
985 else
986 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
987 }
988 }
990 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
991 " Kernel image: %lx->%lx\n"
992 " Entry address: %lx\n"
993 " Init. ramdisk: %lx len %lx\n"
994 " Start info.: %lx->%lx\n",
995 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
996 pstart_info, pstart_info + PAGE_SIZE);
998 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
999 {
1000 printk("Initial guest OS requires too much space\n"
1001 "(%luMB is greater than %luMB limit)\n",
1002 (pkern_end-pkern_start)>>20,
1003 (max_pages <<PAGE_SHIFT)>>20);
1004 return -ENOMEM;
1005 }
1007 // if high 3 bits of pkern start are non-zero, error
1009 // if pkern end is after end of metaphysical memory, error
1010 // (we should be able to deal with this... later)
1012 /* Mask all upcalls... */
1013 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1014 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1016 if (dom0_max_vcpus == 0)
1017 dom0_max_vcpus = MAX_VIRT_CPUS;
1018 if (dom0_max_vcpus > num_online_cpus())
1019 dom0_max_vcpus = num_online_cpus();
1020 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1021 dom0_max_vcpus = MAX_VIRT_CPUS;
1023 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1024 for ( i = 1; i < dom0_max_vcpus; i++ )
1025 if (alloc_vcpu(d, i, i) == NULL)
1026 printf ("Cannot allocate dom0 vcpu %d\n", i);
1028 /* Copy the OS image. */
1029 loaddomainelfimage(d,image_start);
1031 /* Copy the initial ramdisk. */
1032 //if ( initrd_len != 0 )
1033 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1035 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1036 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1038 /* Set up start info area. */
1039 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1040 start_info_page = assign_new_domain_page(d, pstart_info);
1041 if (start_info_page == NULL)
1042 panic("can't allocate start info page");
1043 si = page_to_virt(start_info_page);
1044 memset(si, 0, PAGE_SIZE);
1045 sprintf(si->magic, "xen-%i.%i-ia64",
1046 xen_major_version(), xen_minor_version());
1047 si->nr_pages = max_pages;
1048 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1050 printk("Dom0: 0x%lx\n", (u64)dom0);
1052 #ifdef VALIDATE_VT
1053 /* VMX specific construction for Dom0, if hardware supports VMX
1054 * and Dom0 is an unmodified image
1055 */
1056 if (vmx_dom0)
1057 vmx_final_setup_guest(v);
1058 #endif
1060 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1062 /* Build firmware.
1063 Note: the Linux kernel reserves the memory used by start_info, so there is
1064 no need to remove it from the MDT. */
1065 bp_mpa = pstart_info + sizeof(struct start_info);
1066 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1068 /* Fill boot param. */
1069 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1070 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1072 bp = (struct ia64_boot_param *)((unsigned char *)si +
1073 sizeof(start_info_t));
1074 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1076 /* We assume the console has reached the last line! */
1077 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1078 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1079 bp->console_info.orig_x = 0;
1080 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1081 0 : bp->console_info.num_rows - 1;
1083 bp->initrd_start = dom0_size -
1084 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1085 bp->initrd_size = ia64_boot_param->initrd_size;
1087 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1088 sizeof(start_info_t) +
1089 sizeof(struct ia64_boot_param));
1091 if (fill_console_start_info(ci)) {
1092 si->console.dom0.info_off = sizeof(start_info_t) +
1093 sizeof(struct ia64_boot_param);
1094 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1095 }
1097 vcpu_init_regs (v);
1099 vcpu_regs(v)->r28 = bp_mpa;
1101 vcpu_regs (v)->cr_iip = pkern_entry;
1103 physdev_init_dom0(d);
1105 // FIXME: Hack for keyboard input
1106 //serial_input_init();
1108 return 0;
1109 }
1111 void machine_restart(char * __unused)
1112 {
1113 console_start_sync();
1114 if (running_on_sim)
1115 printf ("machine_restart called. spinning...\n");
1116 else
1117 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1118 while(1);
1119 }
1121 extern void cpu_halt(void);
1123 void machine_halt(void)
1124 {
1125 console_start_sync();
1126 if (running_on_sim)
1127 printf ("machine_halt called. spinning...\n");
1128 else
1129 cpu_halt();
1130 while(1);
1131 }
1133 void sync_vcpu_execstate(struct vcpu *v)
1134 {
1135 // __ia64_save_fpu(v->arch._thread.fph);
1136 // if (VMX_DOMAIN(v))
1137 // vmx_save_state(v);
1138 // FIXME SMP: Anything else needed here for SMP?
1139 }
1141 static void parse_dom0_mem(char *s)
1142 {
1143 dom0_size = parse_size_and_unit(s);
1144 }
1145 custom_param("dom0_mem", parse_dom0_mem);
1148 static void parse_dom0_align(char *s)
1149 {
1150 dom0_align = parse_size_and_unit(s);
1151 }
1152 custom_param("dom0_align", parse_dom0_align);