direct-io.hg

view xen/arch/ia64/xen/domain.c @ 11340:40f6fdb68fa9

[IA64] Fixes for dom0_ops changes

Fix typo and update op names

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author awilliam@xenbuild.aw
date Sun Aug 27 10:25:39 2006 -0600 (2006-08-27)
parents c4ea8d4d2ae1
children a19dbbe4cff5
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
50 unsigned long dom0_size = 512*1024*1024;
51 unsigned long dom0_align = 64*1024*1024;
53 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
54 static unsigned int dom0_max_vcpus = 1;
55 integer_param("dom0_max_vcpus", dom0_max_vcpus);
57 extern unsigned long running_on_sim;
59 extern char dom0_command_line[];
61 /* FIXME: where should these declarations be? */
62 extern void serial_input_init(void);
63 static void init_switch_stack(struct vcpu *v);
64 extern void vmx_do_launch(struct vcpu *);
66 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
67 extern struct vcpu *ia64_switch_to (struct vcpu *next_task);
69 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
70 This is a Xen virtual address. */
71 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
72 DEFINE_PER_CPU(int *, current_psr_ic_addr);
74 #include <xen/sched-if.h>
76 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
77 {
78 int cpu = smp_processor_id();
79 int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
80 int last_processor = vcpu->arch.last_processor;
82 if (is_idle_domain(vcpu->domain))
83 return;
85 vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
86 vcpu->arch.last_processor = cpu;
88 if ((last_vcpu_id != vcpu->vcpu_id &&
89 last_vcpu_id != INVALID_VCPU_ID) ||
90 (last_vcpu_id == vcpu->vcpu_id &&
91 last_processor != cpu &&
92 last_processor != INVALID_PROCESSOR)) {
94 // if the vTLB implementation is changed,
95 // the following must be updated as well.
96 if (VMX_DOMAIN(vcpu)) {
97 // currently the vTLB for a VT-i domain is per vcpu,
98 // so no flushing is needed.
99 } else {
100 vhpt_flush();
101 }
102 local_flush_tlb_all();
103 }
104 }
106 void schedule_tail(struct vcpu *prev)
107 {
108 extern char ia64_ivt;
109 context_saved(prev);
111 if (VMX_DOMAIN(current)) {
112 vmx_do_launch(current);
113 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
114 current->processor);
115 } else {
116 ia64_set_iva(&ia64_ivt);
117 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
118 VHPT_ENABLED);
119 load_region_regs(current);
120 vcpu_load_kernel_regs(current);
121 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
122 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
123 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
124 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
125 }
126 flush_vtlb_for_context_switch(current);
127 }
129 void context_switch(struct vcpu *prev, struct vcpu *next)
130 {
131 uint64_t spsr;
132 uint64_t pta;
134 local_irq_save(spsr);
136 __ia64_save_fpu(prev->arch._thread.fph);
137 __ia64_load_fpu(next->arch._thread.fph);
138 if (VMX_DOMAIN(prev)) {
139 vmx_save_state(prev);
140 if (!VMX_DOMAIN(next)) {
141 /* VMX domains can change the physical cr.dcr.
142 * Restore default to prevent leakage. */
143 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
144 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
145 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
146 }
147 }
148 if (VMX_DOMAIN(next))
149 vmx_load_state(next);
150 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
151 prev = ia64_switch_to(next);
153 /* Note: ia64_switch_to does not return here at vcpu initialization. */
155 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
157 if (VMX_DOMAIN(current)){
158 vmx_load_all_rr(current);
159 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
160 current->processor);
161 } else {
162 struct domain *nd;
163 extern char ia64_ivt;
165 ia64_set_iva(&ia64_ivt);
167 nd = current->domain;
168 if (!is_idle_domain(nd)) {
169 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
170 VHPT_ENABLED);
171 load_region_regs(current);
172 vcpu_load_kernel_regs(current);
173 vcpu_set_next_timer(current);
174 if (vcpu_timer_expired(current))
175 vcpu_pend_timer(current);
176 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
177 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
178 __ia64_per_cpu_var(current_psr_ic_addr) =
179 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
180 } else {
181 /* When switching to the idle domain, we only need to disable the vhpt
182 * walker. All accesses that happen within the idle context will then
183 * be handled by TR mappings and the identity mapping.
184 */
185 pta = ia64_get_pta();
186 ia64_set_pta(pta & ~VHPT_ENABLED);
187 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
188 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
189 }
190 }
191 flush_vtlb_for_context_switch(current);
192 local_irq_restore(spsr);
193 context_saved(prev);
194 }
196 void continue_running(struct vcpu *same)
197 {
198 /* nothing to do */
199 }
201 static void default_idle(void)
202 {
203 local_irq_disable();
204 if ( !softirq_pending(smp_processor_id()) )
205 safe_halt();
206 local_irq_enable();
207 }
209 static void continue_cpu_idle_loop(void)
210 {
211 for ( ; ; )
212 {
213 #ifdef IA64
214 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
215 #else
216 irq_stat[cpu].idle_timestamp = jiffies;
217 #endif
218 while ( !softirq_pending(smp_processor_id()) )
219 default_idle();
220 raise_softirq(SCHEDULE_SOFTIRQ);
221 do_softirq();
222 }
223 }
225 void startup_cpu_idle_loop(void)
226 {
227 /* Just some sanity to ensure that the scheduler is set up okay. */
228 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
229 raise_softirq(SCHEDULE_SOFTIRQ);
231 continue_cpu_idle_loop();
232 }
234 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
235 * get_order_from_shift(XMAPPEDREGS_SHIFT))
236 */
237 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
238 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
239 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
240 #endif
242 void hlt_timer_fn(void *data)
243 {
244 struct vcpu *v = data;
245 vcpu_unblock(v);
246 }
248 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
249 {
250 struct vcpu *v;
251 struct thread_info *ti;
253 /* Still keep idle vcpu0 statically allocated at compile time, because
254 * some code inherited from Linux still requires it in the early phase.
255 */
256 if (is_idle_domain(d) && !vcpu_id)
257 v = idle_vcpu[0];
258 else {
259 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
260 return NULL;
261 memset(v, 0, sizeof(*v));
263 ti = alloc_thread_info(v);
264 /* Zero thread_info to reset some important fields, like
265 * preempt_count.
266 */
267 memset(ti, 0, sizeof(struct thread_info));
268 init_switch_stack(v);
269 }
271 if (!is_idle_domain(d)) {
272 if (!d->arch.is_vti) {
273 int order;
274 int i;
276 /* Create privregs page only if not VTi. */
277 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
278 v->arch.privregs = alloc_xenheap_pages(order);
279 BUG_ON(v->arch.privregs == NULL);
280 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
281 for (i = 0; i < (1 << order); i++)
282 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
283 i, d, XENSHARE_writable);
284 }
286 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
287 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
288 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
289 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
291 /* Is this correct?
292 It depends on the domain's rid usage.
294 A domain may share rids among its processors (eg when using a
295 global VHPT). In that case we should also share rids among vcpus
296 and the rid range should be the same.
298 However a domain may use per-cpu rid allocation. In that case
299 we don't want to share rids among vcpus, but we may do so if
300 two vcpus are on the same cpu... */
302 v->arch.starting_rid = d->arch.starting_rid;
303 v->arch.ending_rid = d->arch.ending_rid;
304 v->arch.breakimm = d->arch.breakimm;
305 v->arch.last_processor = INVALID_PROCESSOR;
306 }
307 if (!VMX_DOMAIN(v)){
308 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v, v->processor);
309 }
311 return v;
312 }
314 void relinquish_vcpu_resources(struct vcpu *v)
315 {
316 if (v->arch.privregs != NULL) {
317 free_xenheap_pages(v->arch.privregs,
318 get_order_from_shift(XMAPPEDREGS_SHIFT));
319 v->arch.privregs = NULL;
320 }
321 kill_timer(&v->arch.hlt_timer);
322 }
324 void free_vcpu_struct(struct vcpu *v)
325 {
326 if (VMX_DOMAIN(v))
327 vmx_relinquish_vcpu_resources(v);
328 else
329 relinquish_vcpu_resources(v);
331 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
332 }
334 static void init_switch_stack(struct vcpu *v)
335 {
336 struct pt_regs *regs = vcpu_regs (v);
337 struct switch_stack *sw = (struct switch_stack *) regs - 1;
338 extern void ia64_ret_from_clone;
340 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
341 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
342 sw->b0 = (unsigned long) &ia64_ret_from_clone;
343 sw->ar_fpsr = FPSR_DEFAULT;
344 v->arch._thread.ksp = (unsigned long) sw - 16;
345 // stay on the kernel stack because we may get interrupts!
346 // ia64_ret_from_clone switches to the user stack
347 v->arch._thread.on_ustack = 0;
348 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
349 }
351 int arch_domain_create(struct domain *d)
352 {
353 int i;
355 // the following will eventually need to be negotiated dynamically
356 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
357 d->arch.breakimm = 0x1000;
358 for (i = 0; i < NR_CPUS; i++) {
359 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
360 }
362 if (is_idle_domain(d))
363 return 0;
365 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
366 if (d->shared_info == NULL)
367 goto fail_nomem;
368 memset(d->shared_info, 0, XSI_SIZE);
369 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
370 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
371 d, XENSHARE_writable);
373 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
374 /* We may also need an emulation rid for region4, though it's unlikely
375 * that a guest will issue uncacheable accesses in metaphysical mode.
376 * But keeping such info here may be more sane.
377 */
378 if (!allocate_rid_range(d,0))
379 goto fail_nomem;
381 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
383 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
384 goto fail_nomem;
386 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
387 RANGESETF_prettyprint_hex);
389 printf ("arch_domain_create: domain=%p\n", d);
390 return 0;
392 fail_nomem:
393 if (d->arch.mm.pgd != NULL)
394 pgd_free(d->arch.mm.pgd);
395 if (d->shared_info != NULL)
396 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
397 return -ENOMEM;
398 }
400 void arch_domain_destroy(struct domain *d)
401 {
402 BUG_ON(d->arch.mm.pgd != NULL);
403 if (d->shared_info != NULL)
404 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
405 if (d->arch.shadow_bitmap != NULL)
406 xfree(d->arch.shadow_bitmap);
408 /* Clear vTLB for the next domain. */
409 domain_flush_tlb_vhpt(d);
411 deallocate_rid_range(d);
412 }
414 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
415 {
416 int i;
417 struct vcpu_extra_regs *er = &c->extra_regs;
419 c->user_regs = *vcpu_regs (v);
420 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
422 /* Fill extra regs. */
423 for (i = 0; i < 8; i++) {
424 er->itrs[i].pte = v->arch.itrs[i].pte.val;
425 er->itrs[i].itir = v->arch.itrs[i].itir;
426 er->itrs[i].vadr = v->arch.itrs[i].vadr;
427 er->itrs[i].rid = v->arch.itrs[i].rid;
428 }
429 for (i = 0; i < 8; i++) {
430 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
431 er->dtrs[i].itir = v->arch.dtrs[i].itir;
432 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
433 er->dtrs[i].rid = v->arch.dtrs[i].rid;
434 }
435 er->event_callback_ip = v->arch.event_callback_ip;
436 er->dcr = v->arch.dcr;
437 er->iva = v->arch.iva;
438 }
440 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
441 {
442 struct pt_regs *regs = vcpu_regs (v);
443 struct domain *d = v->domain;
445 *regs = c->user_regs;
447 if (!d->arch.is_vti) {
448 /* domain runs at PL2/3 */
449 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
450 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
451 }
453 if (c->flags & VGCF_EXTRA_REGS) {
454 int i;
455 struct vcpu_extra_regs *er = &c->extra_regs;
457 for (i = 0; i < 8; i++) {
458 vcpu_set_itr(v, i, er->itrs[i].pte,
459 er->itrs[i].itir,
460 er->itrs[i].vadr,
461 er->itrs[i].rid);
462 }
463 for (i = 0; i < 8; i++) {
464 vcpu_set_dtr(v, i,
465 er->dtrs[i].pte,
466 er->dtrs[i].itir,
467 er->dtrs[i].vadr,
468 er->dtrs[i].rid);
469 }
470 v->arch.event_callback_ip = er->event_callback_ip;
471 v->arch.dcr = er->dcr;
472 v->arch.iva = er->iva;
473 }
475 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
476 return 0;
477 if (d->arch.is_vti)
478 vmx_final_setup_guest(v);
480 /* This overrides some registers. */
481 vcpu_init_regs(v);
483 /* Don't redo final setup */
484 set_bit(_VCPUF_initialised, &v->vcpu_flags);
485 return 0;
486 }
488 static void relinquish_memory(struct domain *d, struct list_head *list)
489 {
490 struct list_head *ent;
491 struct page_info *page;
492 #ifndef __ia64__
493 unsigned long x, y;
494 #endif
496 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
497 spin_lock_recursive(&d->page_alloc_lock);
498 ent = list->next;
499 while ( ent != list )
500 {
501 page = list_entry(ent, struct page_info, list);
502 /* Grab a reference to the page so it won't disappear from under us. */
503 if ( unlikely(!get_page(page, d)) )
504 {
505 /* Couldn't get a reference -- someone is freeing this page. */
506 ent = ent->next;
507 continue;
508 }
510 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
511 put_page_and_type(page);
513 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
514 put_page(page);
516 #ifndef __ia64__
517 /*
518 * Forcibly invalidate base page tables at this point to break circular
519 * 'linear page table' references. This is okay because MMU structures
520 * are not shared across domains and this domain is now dead. Thus base
521 * tables are not in use so a non-zero count means circular reference.
522 */
523 y = page->u.inuse.type_info;
524 for ( ; ; )
525 {
526 x = y;
527 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
528 (PGT_base_page_table|PGT_validated)) )
529 break;
531 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
532 if ( likely(y == x) )
533 {
534 free_page_type(page, PGT_base_page_table);
535 break;
536 }
537 }
538 #endif
540 /* Follow the list chain and /then/ potentially free the page. */
541 ent = ent->next;
542 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
543 put_page(page);
544 }
546 spin_unlock_recursive(&d->page_alloc_lock);
547 }
549 void domain_relinquish_resources(struct domain *d)
550 {
551 /* Relinquish every page of memory. */
553 // release pages by traversing d->arch.mm.
554 relinquish_mm(d);
556 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
557 vmx_relinquish_guest_resources(d);
559 relinquish_memory(d, &d->xenpage_list);
560 relinquish_memory(d, &d->page_list);
562 if (d->arch.is_vti && d->arch.sal_data)
563 xfree(d->arch.sal_data);
564 }
566 void build_physmap_table(struct domain *d)
567 {
568 struct list_head *list_ent = d->page_list.next;
569 unsigned long mfn, i = 0;
571 while(list_ent != &d->page_list) {
572 mfn = page_to_mfn(list_entry(
573 list_ent, struct page_info, list));
574 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
576 i++;
577 list_ent = mfn_to_page(mfn)->list.next;
578 }
579 }
581 unsigned long
582 domain_set_shared_info_va (unsigned long va)
583 {
584 struct vcpu *v = current;
585 struct domain *d = v->domain;
586 struct vcpu *v1;
588 /* Check virtual address:
589 must belong to region 7,
590 must be 64KB aligned,
591 must not be within Xen virtual space. */
592 if ((va >> 61) != 7
593 || (va & 0xffffUL) != 0
594 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
595 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
597 /* Note: this doesn't work well if other cpus are already running.
598 However this is part of the spec :-) */
599 printf ("Domain set shared_info_va to 0x%016lx\n", va);
600 d->arch.shared_info_va = va;
602 for_each_vcpu (d, v1) {
603 VCPU(v1, interrupt_mask_addr) =
604 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
605 }
607 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
609 /* Remap the shared pages. */
610 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
612 return 0;
613 }
615 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
616 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
618 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
619 {
620 unsigned int op = sc->op;
621 int rc = 0;
622 int i;
623 //struct vcpu *v;
625 if (unlikely(d == current->domain)) {
626 DPRINTK("Don't try to do a shadow op on yourself!\n");
627 return -EINVAL;
628 }
630 domain_pause(d);
632 switch (op)
633 {
634 case XEN_DOMCTL_SHADOW_OP_OFF:
635 if (shadow_mode_enabled (d)) {
636 u64 *bm = d->arch.shadow_bitmap;
638 /* Flush vhpt and tlb to restore dirty bit usage. */
639 domain_flush_tlb_vhpt(d);
641 /* Free bitmap. */
642 d->arch.shadow_bitmap_size = 0;
643 d->arch.shadow_bitmap = NULL;
644 xfree(bm);
645 }
646 break;
648 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
649 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
650 rc = -EINVAL;
651 break;
653 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
654 if (shadow_mode_enabled(d)) {
655 rc = -EINVAL;
656 break;
657 }
659 atomic64_set(&d->arch.shadow_fault_count, 0);
660 atomic64_set(&d->arch.shadow_dirty_count, 0);
662 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
663 ~(BITS_PER_LONG-1);
664 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
665 d->arch.shadow_bitmap_size / BITS_PER_LONG);
666 if (d->arch.shadow_bitmap == NULL) {
667 d->arch.shadow_bitmap_size = 0;
668 rc = -ENOMEM;
669 }
670 else {
671 memset(d->arch.shadow_bitmap, 0,
672 d->arch.shadow_bitmap_size / 8);
674 /* Flush vhpt and tlb to enable dirty bit
675 virtualization. */
676 domain_flush_tlb_vhpt(d);
677 }
678 break;
680 case XEN_DOMCTL_SHADOW_OP_CLEAN:
681 {
682 int nbr_longs;
684 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
685 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
687 atomic64_set(&d->arch.shadow_fault_count, 0);
688 atomic64_set(&d->arch.shadow_dirty_count, 0);
690 if (guest_handle_is_null(sc->dirty_bitmap) ||
691 (d->arch.shadow_bitmap == NULL)) {
692 rc = -EINVAL;
693 break;
694 }
696 if (sc->pages > d->arch.shadow_bitmap_size)
697 sc->pages = d->arch.shadow_bitmap_size;
699 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
701 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
702 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
703 SHADOW_COPY_CHUNK : nbr_longs - i;
705 if (copy_to_guest_offset(sc->dirty_bitmap, i,
706 d->arch.shadow_bitmap + i,
707 size)) {
708 rc = -EFAULT;
709 break;
710 }
712 memset(d->arch.shadow_bitmap + i,
713 0, size * sizeof(unsigned long));
714 }
716 break;
717 }
719 case XEN_DOMCTL_SHADOW_OP_PEEK:
720 {
721 unsigned long size;
723 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
724 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
726 if (guest_handle_is_null(sc->dirty_bitmap) ||
727 (d->arch.shadow_bitmap == NULL)) {
728 rc = -EINVAL;
729 break;
730 }
732 if (sc->pages > d->arch.shadow_bitmap_size)
733 sc->pages = d->arch.shadow_bitmap_size;
735 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
736 if (copy_to_guest(sc->dirty_bitmap,
737 d->arch.shadow_bitmap, size)) {
738 rc = -EFAULT;
739 break;
740 }
741 break;
742 }
743 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
744 sc->mb = 0;
745 break;
746 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
747 if (sc->mb > 0) {
748 BUG();
749 rc = -ENOMEM;
750 }
751 break;
752 default:
753 rc = -EINVAL;
754 break;
755 }
757 domain_unpause(d);
759 return rc;
760 }
762 // remove following line if not privifying in memory
763 //#define HAVE_PRIVIFY_MEMORY
764 #ifndef HAVE_PRIVIFY_MEMORY
765 #define privify_memory(x,y) do {} while(0)
766 #endif
768 // see arch/x86/xxx/domain_build.c
769 int elf_sanity_check(Elf_Ehdr *ehdr)
770 {
771 if (!(IS_ELF(*ehdr)))
772 {
773 printk("DOM0 image is not a Xen-compatible Elf image.\n");
774 return 0;
775 }
776 return 1;
777 }
779 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
780 {
781 char *elfbase = (char *) image_start;
782 Elf_Ehdr ehdr;
783 Elf_Phdr phdr;
784 int h, filesz, memsz;
785 unsigned long elfaddr, dom_mpaddr, dom_imva;
786 struct page_info *p;
788 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
789 for ( h = 0; h < ehdr.e_phnum; h++ ) {
790 memcpy(&phdr,
791 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
792 sizeof(Elf_Phdr));
793 if ((phdr.p_type != PT_LOAD))
794 continue;
796 filesz = phdr.p_filesz;
797 memsz = phdr.p_memsz;
798 elfaddr = (unsigned long) elfbase + phdr.p_offset;
799 dom_mpaddr = phdr.p_paddr;
801 while (memsz > 0) {
802 p = assign_new_domain_page(d,dom_mpaddr);
803 BUG_ON (unlikely(p == NULL));
804 dom_imva = __va_ul(page_to_maddr(p));
805 if (filesz > 0) {
806 if (filesz >= PAGE_SIZE)
807 memcpy((void *) dom_imva,
808 (void *) elfaddr,
809 PAGE_SIZE);
810 else {
811 // copy partial page
812 memcpy((void *) dom_imva,
813 (void *) elfaddr, filesz);
814 // zero the rest of page
815 memset((void *) dom_imva+filesz, 0,
816 PAGE_SIZE-filesz);
817 }
818 //FIXME: This test for code seems to find a lot more than objdump -x does
819 if (phdr.p_flags & PF_X) {
820 privify_memory(dom_imva,PAGE_SIZE);
821 flush_icache_range(dom_imva,
822 dom_imva+PAGE_SIZE);
823 }
824 }
825 else if (memsz > 0) {
826 /* always zero out entire page */
827 memset((void *) dom_imva, 0, PAGE_SIZE);
828 }
829 memsz -= PAGE_SIZE;
830 filesz -= PAGE_SIZE;
831 elfaddr += PAGE_SIZE;
832 dom_mpaddr += PAGE_SIZE;
833 }
834 }
835 }
837 void alloc_dom0(void)
838 {
839 /* Check dom0 size. */
840 if (dom0_size < 4 * 1024 * 1024) {
841 panic("dom0_mem is too small, boot aborted"
842 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
843 }
845 /* Check dom0 align. */
846 if ((dom0_align - 1) & dom0_align) { /* not a power of two */
847 panic("dom0_align (%lx) must be power of two, boot aborted"
848 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
849 dom0_align);
850 }
851 if (dom0_align < PAGE_SIZE) {
852 panic("dom0_align must be >= %ld, boot aborted"
853 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
854 PAGE_SIZE);
855 }
856 if (dom0_size % dom0_align) {
857 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
858 printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
859 dom0_size,dom0_align);
860 }
862 if (running_on_sim) {
863 dom0_size = 128*1024*1024; //FIXME: Should be configurable
864 }
866 /* no need to allocate pages for now
867 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
868 */
869 }
872 /*
873 * Domain 0 has unrestricted direct access to all devices. However,
874 * the major point of this stub is to allow alloc_dom_mem to handle
875 * requests with order > 0. Dom0 requires that bit set in order to
876 * allocate memory for other domains.
877 */
878 static void physdev_init_dom0(struct domain *d)
879 {
880 if (iomem_permit_access(d, 0UL, ~0UL))
881 BUG();
882 if (irqs_permit_access(d, 0, NR_IRQS-1))
883 BUG();
884 if (ioports_permit_access(d, 0, 0xffff))
885 BUG();
886 }
888 int construct_dom0(struct domain *d,
889 unsigned long image_start, unsigned long image_len,
890 unsigned long initrd_start, unsigned long initrd_len,
891 char *cmdline)
892 {
893 int i, rc;
894 start_info_t *si;
895 dom0_vga_console_info_t *ci;
896 struct vcpu *v = d->vcpu[0];
897 unsigned long max_pages;
899 struct domain_setup_info dsi;
900 unsigned long p_start;
901 unsigned long pkern_start;
902 unsigned long pkern_entry;
903 unsigned long pkern_end;
904 unsigned long pinitrd_start = 0;
905 unsigned long pstart_info;
906 struct page_info *start_info_page;
907 unsigned long bp_mpa;
908 struct ia64_boot_param *bp;
910 #ifdef VALIDATE_VT
911 unsigned int vmx_dom0 = 0;
912 unsigned long mfn;
913 struct page_info *page = NULL;
914 #endif
916 //printf("construct_dom0: starting\n");
918 /* Sanity! */
919 BUG_ON(d != dom0);
920 BUG_ON(d->vcpu[0] == NULL);
921 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
923 memset(&dsi, 0, sizeof(struct domain_setup_info));
925 printk("*** LOADING DOMAIN 0 ***\n");
927 max_pages = dom0_size / PAGE_SIZE;
928 d->max_pages = max_pages;
929 d->tot_pages = 0;
930 dsi.image_addr = (unsigned long)image_start;
931 dsi.image_len = image_len;
932 rc = parseelfimage(&dsi);
933 if ( rc != 0 )
934 return rc;
936 #ifdef VALIDATE_VT
937 /* Temp workaround */
938 if (running_on_sim)
939 dsi.xen_section_string = (char *)1;
941 /* Check whether dom0 is vti domain */
942 if ((!vmx_enabled) && !dsi.xen_section_string) {
943 printk("Lack of hardware support for unmodified vmx dom0\n");
944 panic("");
945 }
947 if (vmx_enabled && !dsi.xen_section_string) {
948 printk("Dom0 is vmx domain!\n");
949 vmx_dom0 = 1;
950 }
951 #endif
953 p_start = dsi.v_start;
954 pkern_start = dsi.v_kernstart;
955 pkern_end = dsi.v_kernend;
956 pkern_entry = dsi.v_kernentry;
958 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
960 if ( (p_start & (PAGE_SIZE-1)) != 0 )
961 {
962 printk("Initial guest OS must load to a page boundary.\n");
963 return -EINVAL;
964 }
966 pstart_info = PAGE_ALIGN(pkern_end);
967 if(initrd_start && initrd_len){
968 unsigned long offset;
970 pinitrd_start= dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
971 if (pinitrd_start <= pstart_info)
972 panic("%s: not enough memory is assigned to dom0", __func__);
974 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
975 struct page_info *p;
976 p = assign_new_domain_page(d, pinitrd_start + offset);
977 if (p == NULL)
978 panic("%s: can't allocate page for initrd image", __func__);
979 if (initrd_len < offset + PAGE_SIZE)
980 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
981 initrd_len - offset);
982 else
983 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
984 }
985 }
987 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
988 " Kernel image: %lx->%lx\n"
989 " Entry address: %lx\n"
990 " Init. ramdisk: %lx len %lx\n"
991 " Start info.: %lx->%lx\n",
992 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
993 pstart_info, pstart_info + PAGE_SIZE);
995 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
996 {
997 printk("Initial guest OS requires too much space\n"
998 "(%luMB is greater than %luMB limit)\n",
999 (pkern_end-pkern_start)>>20,
1000 (max_pages <<PAGE_SHIFT)>>20);
1001 return -ENOMEM;
1002 }
1004 // if high 3 bits of pkern start are non-zero, error
1006 // if pkern end is after end of metaphysical memory, error
1007 // (we should be able to deal with this... later)
1009 /* Mask all upcalls... */
1010 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1011 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1013 if (dom0_max_vcpus == 0)
1014 dom0_max_vcpus = MAX_VIRT_CPUS;
1015 if (dom0_max_vcpus > num_online_cpus())
1016 dom0_max_vcpus = num_online_cpus();
1017 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1018 dom0_max_vcpus = MAX_VIRT_CPUS;
1020 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1021 for ( i = 1; i < dom0_max_vcpus; i++ )
1022 if (alloc_vcpu(d, i, i) == NULL)
1023 printf ("Cannot allocate dom0 vcpu %d\n", i);
1025 /* Copy the OS image. */
1026 loaddomainelfimage(d,image_start);
1028 /* Copy the initial ramdisk. */
1029 //if ( initrd_len != 0 )
1030 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1032 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1033 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1035 /* Set up start info area. */
1036 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1037 start_info_page = assign_new_domain_page(d, pstart_info);
1038 if (start_info_page == NULL)
1039 panic("can't allocate start info page");
1040 si = page_to_virt(start_info_page);
1041 memset(si, 0, PAGE_SIZE);
1042 sprintf(si->magic, "xen-%i.%i-ia64",
1043 xen_major_version(), xen_minor_version());
1044 si->nr_pages = max_pages;
1045 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1047 printk("Dom0: 0x%lx\n", (u64)dom0);
1049 #ifdef VALIDATE_VT
1050 /* VMX specific construction for Dom0, if hardware supports VMX
1051 * and Dom0 is an unmodified image.
1052 */
1053 if (vmx_dom0)
1054 vmx_final_setup_guest(v);
1055 #endif
1057 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1059 /* Build firmware.
1060 Note: the Linux kernel reserves the memory used by start_info, so
1061 there is no need to remove it from the MDT. */
1062 bp_mpa = pstart_info + sizeof(struct start_info);
1063 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1065 /* Fill boot param. */
1066 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1067 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1069 bp = (struct ia64_boot_param *)((unsigned char *)si +
1070 sizeof(start_info_t));
1071 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1073 /* We assume console has reached the last line! */
1074 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1075 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1076 bp->console_info.orig_x = 0;
1077 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1078 0 : bp->console_info.num_rows - 1;
1080 bp->initrd_start = dom0_size -
1081 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1082 bp->initrd_size = ia64_boot_param->initrd_size;
1084 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1085 sizeof(start_info_t) +
1086 sizeof(struct ia64_boot_param));
1088 if (fill_console_start_info(ci)) {
1089 si->console.dom0.info_off = sizeof(start_info_t) +
1090 sizeof(struct ia64_boot_param);
1091 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1092 }
1094 vcpu_init_regs (v);
1096 vcpu_regs(v)->r28 = bp_mpa;
1098 vcpu_regs (v)->cr_iip = pkern_entry;
1100 physdev_init_dom0(d);
1102 // FIXME: Hack for keyboard input
1103 //serial_input_init();
1105 return 0;
1106 }
1108 void machine_restart(char * __unused)
1109 {
1110 console_start_sync();
1111 if (running_on_sim)
1112 printf ("machine_restart called. spinning...\n");
1113 else
1114 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1115 while(1);
1116 }
1118 void machine_halt(void)
1119 {
1120 console_start_sync();
1121 if (running_on_sim)
1122 printf ("machine_halt called. spinning...\n");
1123 else
1124 (*efi.reset_system)(EFI_RESET_SHUTDOWN,0,0,NULL);
1125 while(1);
1126 }
1128 void sync_vcpu_execstate(struct vcpu *v)
1129 {
1130 // __ia64_save_fpu(v->arch._thread.fph);
1131 // if (VMX_DOMAIN(v))
1132 // vmx_save_state(v);
1133 // FIXME SMP: Anything else needed here for SMP?
1134 }
1136 static void parse_dom0_mem(char *s)
1137 {
1138 dom0_size = parse_size_and_unit(s);
1139 }
1140 custom_param("dom0_mem", parse_dom0_mem);
1143 static void parse_dom0_align(char *s)
1144 {
1145 dom0_align = parse_size_and_unit(s);
1146 }
1147 custom_param("dom0_align", parse_dom0_align);