direct-io.hg

view xen/arch/ia64/xen/domain.c @ 11347:f74c9368f6ff

[IA64] fix noreboot option

fix machine_hlt to support noreboot option.

Signed-off-by: Akio Takebe <takebe_akio@jp.fujitsu.com>
[modified to use existing cpu_halt() function in linux-xen files]
Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author awilliam@xenbuild.aw
date Mon Aug 28 14:09:31 2006 -0600 (2006-08-28)
parents a19dbbe4cff5
children e317ad162eba
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  Copyright (C) 2005 Intel Co
 *  Kun Tian (Kevin Tian) <kevin.tian@intel.com>
 *
 *  05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
 *
 *  Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                     VA Linux Systems Japan K.K.
 *                     dom0 vp model support
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/mm.h>
#include <xen/iocap.h>
#include <asm/asm-xsi-offsets.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/version.h>
#include <xen/elf.h>
#include <asm/pgalloc.h>
#include <asm/offsets.h>  /* for IA64_THREAD_INFO_SIZE */
#include <asm/vcpu.h>     /* for function declarations */
#include <public/xen.h>
#include <xen/domain.h>
#include <asm/vmx.h>
#include <asm/vmx_vcpu.h>
#include <asm/vmx_vpd.h>
#include <asm/vmx_phy_mode.h>
#include <asm/vhpt.h>
#include <asm/tlbflush.h>
#include <asm/regionreg.h>
#include <asm/dom_fw.h>
#include <asm/shadow.h>
unsigned long dom0_size = 512*1024*1024;
unsigned long dom0_align = 64*1024*1024;

/* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
static unsigned int dom0_max_vcpus = 1;
integer_param("dom0_max_vcpus", dom0_max_vcpus);

extern unsigned long running_on_sim;

extern char dom0_command_line[];

/* FIXME: where should these declarations live? */
extern void serial_input_init(void);
static void init_switch_stack(struct vcpu *v);
extern void vmx_do_launch(struct vcpu *);

/* This belongs in include/asm, but there doesn't seem to be a suitable place. */
extern struct vcpu *ia64_switch_to (struct vcpu *next_task);

/* Address of vpsr.i (in fact evtchn_upcall_mask) of the current vcpu.
   This is a Xen virtual address.  */
DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
DEFINE_PER_CPU(int *, current_psr_ic_addr);

#include <xen/sched-if.h>
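
/* Flush stale translations on a vcpu switch: either another vcpu of the
 * same domain ran last on this physical CPU, or this vcpu last ran on a
 * different physical CPU. */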
static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
{
    int cpu = smp_processor_id();
    int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
    int last_processor = vcpu->arch.last_processor;

    if (is_idle_domain(vcpu->domain))
        return;

    vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
    vcpu->arch.last_processor = cpu;

    if ((last_vcpu_id != vcpu->vcpu_id &&
         last_vcpu_id != INVALID_VCPU_ID) ||
        (last_vcpu_id == vcpu->vcpu_id &&
         last_processor != cpu &&
         last_processor != INVALID_PROCESSOR)) {

        // If the vTLB implementation is changed,
        // the following must be updated as well.
        if (VMX_DOMAIN(vcpu)) {
            // Currently the vTLB for a VT-i domain is per vcpu,
            // so no flushing is needed.
        } else {
            vhpt_flush();
        }
        local_flush_tlb_all();
    }
}
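
/* Finish a context switch on the incoming vcpu: mark the previous vcpu
 * as saved, then either launch the VT-i domain or reload the IVT, PTA,
 * region registers and per-CPU psr.i/psr.ic pointers for 'current'. */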
void schedule_tail(struct vcpu *prev)
{
    extern char ia64_ivt;
    context_saved(prev);

    if (VMX_DOMAIN(current)) {
        vmx_do_launch(current);
        migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
                      current->processor);
    } else {
        ia64_set_iva(&ia64_ivt);
        ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
                     VHPT_ENABLED);
        load_region_regs(current);
        vcpu_load_kernel_regs(current);
        __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
            shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
        __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
            (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
    }
    flush_vtlb_for_context_switch(current);
}
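
/* Switch from vcpu 'prev' to vcpu 'next': save and restore FPU and VMX
 * state, switch stacks via ia64_switch_to(), then reload the VHPT,
 * region registers, timers and per-CPU pointers for the new 'current'. */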
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    uint64_t spsr;
    uint64_t pta;

    local_irq_save(spsr);

    __ia64_save_fpu(prev->arch._thread.fph);
    __ia64_load_fpu(next->arch._thread.fph);
    if (VMX_DOMAIN(prev)) {
        vmx_save_state(prev);
        if (!VMX_DOMAIN(next)) {
            /* VMX domains can change the physical cr.dcr.
             * Restore default to prevent leakage. */
            ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
                        | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
                        | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
        }
    }
    if (VMX_DOMAIN(next))
        vmx_load_state(next);
    /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
    prev = ia64_switch_to(next);

    /* Note: ia64_switch_to does not return here at vcpu initialization. */

    //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);

    if (VMX_DOMAIN(current)) {
        vmx_load_all_rr(current);
        migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
                      current->processor);
    } else {
        struct domain *nd;
        extern char ia64_ivt;

        ia64_set_iva(&ia64_ivt);

        nd = current->domain;
        if (!is_idle_domain(nd)) {
            ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
                         VHPT_ENABLED);
            load_region_regs(current);
            vcpu_load_kernel_regs(current);
            vcpu_set_next_timer(current);
            if (vcpu_timer_expired(current))
                vcpu_pend_timer(current);
            __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
                vcpu_info[current->vcpu_id].evtchn_upcall_mask;
            __ia64_per_cpu_var(current_psr_ic_addr) =
                (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
        } else {
            /* When switching to the idle domain we only need to disable
             * the VHPT walker.  All accesses made in the idle context are
             * then handled by the TR mapping and the identity mapping. */
            pta = ia64_get_pta();
            ia64_set_pta(pta & ~VHPT_ENABLED);
            __ia64_per_cpu_var(current_psr_i_addr) = NULL;
            __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
        }
    }
    flush_vtlb_for_context_switch(current);
    local_irq_restore(spsr);
    context_saved(prev);
}

void continue_running(struct vcpu *same)
{
    /* nothing to do */
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    local_irq_enable();
}

static void continue_cpu_idle_loop(void)
{
    for ( ; ; )
    {
#ifdef IA64
//      __IRQ_STAT(cpu, idle_timestamp) = jiffies
#else
        irq_stat[cpu].idle_timestamp = jiffies;
#endif
        while ( !softirq_pending(smp_processor_id()) )
            default_idle();
        raise_softirq(SCHEDULE_SOFTIRQ);
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    /* Just some sanity to ensure that the scheduler is set up okay. */
    ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
    raise_softirq(SCHEDULE_SOFTIRQ);

    continue_cpu_idle_loop();
}

/* Compile-time check that get_order(sizeof(mapped_regs_t)) matches
 * get_order_from_shift(XMAPPEDREGS_SHIFT). */
#if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
      (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
# error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
#endif
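
/* Timer handler used to wake up a halted (blocked) vcpu. */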
void hlt_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_unblock(v);
}
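
/* Allocate and initialize a vcpu: its stack/thread info, switch stack,
 * privregs area (for non-VT-i domains), metaphysical RRs, RID range and
 * halt timer. */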
struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;
    struct thread_info *ti;

    /* Keep idle vcpu0 statically allocated at compile time, because some
     * code inherited from Linux still requires it during the early boot
     * phase. */
    if (is_idle_domain(d) && !vcpu_id)
        v = idle_vcpu[0];
    else {
        if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
            return NULL;
        memset(v, 0, sizeof(*v));

        ti = alloc_thread_info(v);
        /* Clear thread_info to clear some important fields, like
         * preempt_count. */
        memset(ti, 0, sizeof(struct thread_info));
        init_switch_stack(v);
    }

    if (!is_idle_domain(d)) {
        if (!d->arch.is_vti) {
            int order;
            int i;

            /* Create privregs page only if not VTi. */
            order = get_order_from_shift(XMAPPEDREGS_SHIFT);
            v->arch.privregs = alloc_xenheap_pages(order);
            BUG_ON(v->arch.privregs == NULL);
            memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
            for (i = 0; i < (1 << order); i++)
                share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
                                          i, d, XENSHARE_writable);
        }

        v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
        v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;

        /* Is this correct?  It depends on how the domain uses RIDs.

           A domain may share RIDs among its processors (e.g. when it has
           a global VHPT).  In that case we should also share RIDs among
           vcpus, and the RID ranges should be the same.

           However a domain may use per-cpu RID allocation.  In that case
           we don't want to share RIDs among vcpus, though we may do so if
           two vcpus are on the same cpu... */
        v->arch.starting_rid = d->arch.starting_rid;
        v->arch.ending_rid = d->arch.ending_rid;
        v->arch.breakimm = d->arch.breakimm;
        v->arch.last_processor = INVALID_PROCESSOR;
    }
    if (!VMX_DOMAIN(v)) {
        init_timer(&v->arch.hlt_timer, hlt_timer_fn, v, v->processor);
    }

    return v;
}

void relinquish_vcpu_resources(struct vcpu *v)
{
    if (v->arch.privregs != NULL) {
        free_xenheap_pages(v->arch.privregs,
                           get_order_from_shift(XMAPPEDREGS_SHIFT));
        v->arch.privregs = NULL;
    }
    kill_timer(&v->arch.hlt_timer);
}

void free_vcpu_struct(struct vcpu *v)
{
    if (VMX_DOMAIN(v))
        vmx_relinquish_vcpu_resources(v);
    else
        relinquish_vcpu_resources(v);

    free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
}

static void init_switch_stack(struct vcpu *v)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct switch_stack *sw = (struct switch_stack *) regs - 1;
    extern void ia64_ret_from_clone;

    memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
    sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
    sw->b0 = (unsigned long) &ia64_ret_from_clone;
    sw->ar_fpsr = FPSR_DEFAULT;
    v->arch._thread.ksp = (unsigned long) sw - 16;
    // stay on kernel stack because may get interrupts!
    // ia64_ret_from_clone switches to user stack
    v->arch._thread.on_ustack = 0;
    memset(v->arch._thread.fph, 0, sizeof(struct ia64_fpreg) * 96);
}
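
/* Architecture-specific part of domain creation: shared_info area,
 * break immediate, RID range, page-table root and I/O port rangeset. */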
int arch_domain_create(struct domain *d)
{
    int i;

    // The following will eventually need to be negotiated dynamically.
    d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
    d->arch.breakimm = 0x1000;
    for (i = 0; i < NR_CPUS; i++) {
        d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
    }

    if (is_idle_domain(d))
        return 0;

    d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
    if (d->shared_info == NULL)
        goto fail_nomem;
    memset(d->shared_info, 0, XSI_SIZE);
    for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
        share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
                                  d, XENSHARE_writable);

    d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
    /* We may also need an emulation RID for region 4, although a guest is
     * unlikely to issue uncacheable accesses in metaphysical mode.  Still,
     * keeping such info here seems saner. */
    if (!allocate_rid_range(d,0))
        goto fail_nomem;

    memset(&d->arch.mm, 0, sizeof(d->arch.mm));

    if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
        goto fail_nomem;

    d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
                                       RANGESETF_prettyprint_hex);

    printf ("arch_domain_create: domain=%p\n", d);
    return 0;

fail_nomem:
    if (d->arch.mm.pgd != NULL)
        pgd_free(d->arch.mm.pgd);
    if (d->shared_info != NULL)
        free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    BUG_ON(d->arch.mm.pgd != NULL);
    if (d->shared_info != NULL) {
        /* If this domain is a domVTi, the shared_info page may have been
         * replaced with a domheap page, in which case it is freed in
         * relinquish_mm(). */
        if (IS_XEN_HEAP_FRAME(virt_to_page(d->shared_info))) {
            free_xenheap_pages(d->shared_info,
                               get_order_from_shift(XSI_SHIFT));
        }
    }
    if (d->arch.shadow_bitmap != NULL)
        xfree(d->arch.shadow_bitmap);

    /* Clear vTLB for the next domain. */
    domain_flush_tlb_vhpt(d);

    deallocate_rid_range(d);
}

void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
{
    int i;
    struct vcpu_extra_regs *er = &c->extra_regs;

    c->user_regs = *vcpu_regs (v);
    c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;

    /* Fill extra regs. */
    for (i = 0; i < 8; i++) {
        er->itrs[i].pte = v->arch.itrs[i].pte.val;
        er->itrs[i].itir = v->arch.itrs[i].itir;
        er->itrs[i].vadr = v->arch.itrs[i].vadr;
        er->itrs[i].rid = v->arch.itrs[i].rid;
    }
    for (i = 0; i < 8; i++) {
        er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
        er->dtrs[i].itir = v->arch.dtrs[i].itir;
        er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
        er->dtrs[i].rid = v->arch.dtrs[i].rid;
    }
    er->event_callback_ip = v->arch.event_callback_ip;
    er->dcr = v->arch.dcr;
    er->iva = v->arch.iva;
}

int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct domain *d = v->domain;

    *regs = c->user_regs;

    if (!d->arch.is_vti) {
        /* domain runs at PL2/3 */
        regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
        regs->ar_rsc |= (2 << 2); /* force PL2/3 */
    }

    if (c->flags & VGCF_EXTRA_REGS) {
        int i;
        struct vcpu_extra_regs *er = &c->extra_regs;

        for (i = 0; i < 8; i++) {
            vcpu_set_itr(v, i, er->itrs[i].pte,
                         er->itrs[i].itir,
                         er->itrs[i].vadr,
                         er->itrs[i].rid);
        }
        for (i = 0; i < 8; i++) {
            vcpu_set_dtr(v, i,
                         er->dtrs[i].pte,
                         er->dtrs[i].itir,
                         er->dtrs[i].vadr,
                         er->dtrs[i].rid);
        }
        v->arch.event_callback_ip = er->event_callback_ip;
        v->arch.dcr = er->dcr;
        v->arch.iva = er->iva;
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;
    if (d->arch.is_vti)
        vmx_final_setup_guest(v);

    /* This overrides some registers. */
    vcpu_init_regs(v);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);
    return 0;
}
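
/* Walk the given page list and drop the references this domain holds on
 * each page so that the pages can eventually be freed. */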
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
#ifndef __ia64__
    unsigned long x, y;
#endif

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);
    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);
        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

#ifndef __ia64__
        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }
#endif

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    /* Relinquish every page of memory. */

    // Release pages by traversing d->arch.mm.
    relinquish_mm(d);

    if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
        vmx_relinquish_guest_resources(d);

    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);

    if (d->arch.is_vti && d->arch.sal_data)
        xfree(d->arch.sal_data);
}
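
/* Populate the physmap: map consecutive guest-physical frames to the
 * machine frames already on d->page_list. */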
void build_physmap_table(struct domain *d)
{
    struct list_head *list_ent = d->page_list.next;
    unsigned long mfn, i = 0;

    while (list_ent != &d->page_list) {
        mfn = page_to_mfn(list_entry(
            list_ent, struct page_info, list));
        assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);

        i++;
        list_ent = mfn_to_page(mfn)->list.next;
    }
}
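
/* Set the guest virtual address at which the shared_info area is mapped,
 * update each vcpu's interrupt mask address accordingly and remap
 * region 7. */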
unsigned long
domain_set_shared_info_va (unsigned long va)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    struct vcpu *v1;

    /* Check virtual address:
       must belong to region 7,
       must be 64Kb aligned,
       must not be within Xen virtual space.  */
    if ((va >> 61) != 7
        || (va & 0xffffUL) != 0
        || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
        panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);

    /* Note: this doesn't work well if other cpus are already running.
       However this is part of the spec :-)  */
    printf ("Domain set shared_info_va to 0x%016lx\n", va);
    d->arch.shared_info_va = va;

    for_each_vcpu (d, v1) {
        VCPU(v1, interrupt_mask_addr) =
            (unsigned char *)va + INT_ENABLE_OFFSET(v1);
    }

    __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);

    /* Remap the shared pages. */
    set_one_rr (7UL << 61, PSCB(v,rrs[7]));

    return 0;
}

/* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
#define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
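
/* Handle the shadow (log-dirty) control operations for a paused domain:
 * enable or disable log-dirty mode and clean or peek at the dirty
 * bitmap. */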
int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
{
    unsigned int op = sc->op;
    int rc = 0;
    int i;
    //struct vcpu *v;

    if (unlikely(d == current->domain)) {
        DPRINTK("Don't try to do a shadow op on yourself!\n");
        return -EINVAL;
    }

    domain_pause(d);

    switch (op)
    {
    case XEN_DOMCTL_SHADOW_OP_OFF:
        if (shadow_mode_enabled (d)) {
            u64 *bm = d->arch.shadow_bitmap;

            /* Flush vhpt and tlb to restore dirty bit usage. */
            domain_flush_tlb_vhpt(d);

            /* Free bitmap. */
            d->arch.shadow_bitmap_size = 0;
            d->arch.shadow_bitmap = NULL;
            xfree(bm);
        }
        break;

    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
    case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
        rc = -EINVAL;
        break;

    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
        if (shadow_mode_enabled(d)) {
            rc = -EINVAL;
            break;
        }

        atomic64_set(&d->arch.shadow_fault_count, 0);
        atomic64_set(&d->arch.shadow_dirty_count, 0);

        d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
                                     ~(BITS_PER_LONG-1);
        d->arch.shadow_bitmap = xmalloc_array(unsigned long,
                                     d->arch.shadow_bitmap_size / BITS_PER_LONG);
        if (d->arch.shadow_bitmap == NULL) {
            d->arch.shadow_bitmap_size = 0;
            rc = -ENOMEM;
        }
        else {
            memset(d->arch.shadow_bitmap, 0,
                   d->arch.shadow_bitmap_size / 8);

            /* Flush vhpt and tlb to enable dirty bit
               virtualization.  */
            domain_flush_tlb_vhpt(d);
        }
        break;

    case XEN_DOMCTL_SHADOW_OP_CLEAN:
    {
        int nbr_longs;

        sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
        sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);

        atomic64_set(&d->arch.shadow_fault_count, 0);
        atomic64_set(&d->arch.shadow_dirty_count, 0);

        if (guest_handle_is_null(sc->dirty_bitmap) ||
            (d->arch.shadow_bitmap == NULL)) {
            rc = -EINVAL;
            break;
        }

        if (sc->pages > d->arch.shadow_bitmap_size)
            sc->pages = d->arch.shadow_bitmap_size;

        nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;

        for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
            int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
                       SHADOW_COPY_CHUNK : nbr_longs - i;

            if (copy_to_guest_offset(sc->dirty_bitmap, i,
                                     d->arch.shadow_bitmap + i,
                                     size)) {
                rc = -EFAULT;
                break;
            }

            memset(d->arch.shadow_bitmap + i,
                   0, size * sizeof(unsigned long));
        }

        break;
    }

    case XEN_DOMCTL_SHADOW_OP_PEEK:
    {
        unsigned long size;

        sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
        sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);

        if (guest_handle_is_null(sc->dirty_bitmap) ||
            (d->arch.shadow_bitmap == NULL)) {
            rc = -EINVAL;
            break;
        }

        if (sc->pages > d->arch.shadow_bitmap_size)
            sc->pages = d->arch.shadow_bitmap_size;

        size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
        if (copy_to_guest(sc->dirty_bitmap,
                          d->arch.shadow_bitmap, size)) {
            rc = -EFAULT;
            break;
        }
        break;
    }
    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
        sc->mb = 0;
        break;
    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
        if (sc->mb > 0) {
            BUG();
            rc = -ENOMEM;
        }
        break;
    default:
        rc = -EINVAL;
        break;
    }

    domain_unpause(d);

    return rc;
}

// remove following line if not privifying in memory
//#define HAVE_PRIVIFY_MEMORY
#ifndef HAVE_PRIVIFY_MEMORY
#define privify_memory(x,y) do {} while(0)
#endif

// see arch/x86/xxx/domain_build.c
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if (!(IS_ELF(*ehdr)))
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }
    return 1;
}
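
/* Copy the dom0 ELF image, one PT_LOAD segment at a time, into newly
 * assigned domain pages, zeroing any BSS portion and flushing the
 * i-cache for executable pages. */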
static void loaddomainelfimage(struct domain *d, unsigned long image_start)
{
    char *elfbase = (char *) image_start;
    Elf_Ehdr ehdr;
    Elf_Phdr phdr;
    int h, filesz, memsz;
    unsigned long elfaddr, dom_mpaddr, dom_imva;
    struct page_info *p;

    memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
    for ( h = 0; h < ehdr.e_phnum; h++ ) {
        memcpy(&phdr,
               elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
               sizeof(Elf_Phdr));
        if ((phdr.p_type != PT_LOAD))
            continue;

        filesz = phdr.p_filesz;
        memsz = phdr.p_memsz;
        elfaddr = (unsigned long) elfbase + phdr.p_offset;
        dom_mpaddr = phdr.p_paddr;

        while (memsz > 0) {
            p = assign_new_domain_page(d, dom_mpaddr);
            BUG_ON (unlikely(p == NULL));
            dom_imva = __va_ul(page_to_maddr(p));
            if (filesz > 0) {
                if (filesz >= PAGE_SIZE)
                    memcpy((void *) dom_imva,
                           (void *) elfaddr,
                           PAGE_SIZE);
                else {
                    // copy partial page
                    memcpy((void *) dom_imva,
                           (void *) elfaddr, filesz);
                    // zero the rest of page
                    memset((void *) dom_imva+filesz, 0,
                           PAGE_SIZE-filesz);
                }
                //FIXME: This test for code seems to find a lot more than objdump -x does
                if (phdr.p_flags & PF_X) {
                    privify_memory(dom_imva, PAGE_SIZE);
                    flush_icache_range(dom_imva,
                                       dom_imva+PAGE_SIZE);
                }
            }
            else if (memsz > 0) {
                /* always zero out entire page */
                memset((void *) dom_imva, 0, PAGE_SIZE);
            }
            memsz -= PAGE_SIZE;
            filesz -= PAGE_SIZE;
            elfaddr += PAGE_SIZE;
            dom_mpaddr += PAGE_SIZE;
        }
    }
}

void alloc_dom0(void)
{
    /* Check dom0 size. */
    if (dom0_size < 4 * 1024 * 1024) {
        panic("dom0_mem is too small, boot aborted"
              " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
    }

    /* Check dom0 align. */
    if ((dom0_align - 1) & dom0_align) { /* not a power of two */
        panic("dom0_align (%lx) must be power of two, boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              dom0_align);
    }
    if (dom0_align < PAGE_SIZE) {
        panic("dom0_align must be >= %ld, boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              PAGE_SIZE);
    }
    if (dom0_size % dom0_align) {
        dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
        printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
               dom0_size, dom0_align);
    }

    if (running_on_sim) {
        dom0_size = 128*1024*1024; //FIXME: Should be configurable
    }

    /* no need to allocate pages for now
     * pages are allocated by map_new_domain_page() via loaddomainelfimage()
     */
}

/*
 * Domain 0 has direct access to all devices.  The main point of this
 * stub, however, is to allow alloc_dom_mem to handle order > 0 requests:
 * dom0 requires that capability in order to allocate memory for other
 * domains.
 */
static void physdev_init_dom0(struct domain *d)
{
    if (iomem_permit_access(d, 0UL, ~0UL))
        BUG();
    if (irqs_permit_access(d, 0, NR_IRQS-1))
        BUG();
    if (ioports_permit_access(d, 0, 0xffff))
        BUG();
}
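
/* Build domain 0: size its memory, load the kernel ELF image and the
 * initial ramdisk, create the start_info page, boot parameters and
 * console info, set up firmware and initialize vcpu0's registers. */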
int construct_dom0(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc;
    start_info_t *si;
    dom0_vga_console_info_t *ci;
    struct vcpu *v = d->vcpu[0];
    unsigned long max_pages;

    struct domain_setup_info dsi;
    unsigned long p_start;
    unsigned long pkern_start;
    unsigned long pkern_entry;
    unsigned long pkern_end;
    unsigned long pinitrd_start = 0;
    unsigned long pstart_info;
    struct page_info *start_info_page;
    unsigned long bp_mpa;
    struct ia64_boot_param *bp;

#ifdef VALIDATE_VT
    unsigned int vmx_dom0 = 0;
    unsigned long mfn;
    struct page_info *page = NULL;
#endif

    //printf("construct_dom0: starting\n");

    /* Sanity! */
    BUG_ON(d != dom0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    max_pages = dom0_size / PAGE_SIZE;
    d->max_pages = max_pages;
    d->tot_pages = 0;
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;
    rc = parseelfimage(&dsi);
    if ( rc != 0 )
        return rc;

#ifdef VALIDATE_VT
    /* Temp workaround */
    if (running_on_sim)
        dsi.xen_section_string = (char *)1;

    /* Check whether dom0 is vti domain */
    if ((!vmx_enabled) && !dsi.xen_section_string) {
        printk("Lack of hardware support for unmodified vmx dom0\n");
        panic("");
    }

    if (vmx_enabled && !dsi.xen_section_string) {
        printk("Dom0 is vmx domain!\n");
        vmx_dom0 = 1;
    }
#endif

    p_start = dsi.v_start;
    pkern_start = dsi.v_kernstart;
    pkern_end = dsi.v_kernend;
    pkern_entry = dsi.v_kernentry;

    //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);

    if ( (p_start & (PAGE_SIZE-1)) != 0 )
    {
        printk("Initial guest OS must load to a page boundary.\n");
        return -EINVAL;
    }

    pstart_info = PAGE_ALIGN(pkern_end);
    if (initrd_start && initrd_len) {
        unsigned long offset;

        pinitrd_start = dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
        if (pinitrd_start <= pstart_info)
            panic("%s: not enough memory assigned to dom0", __func__);

        for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
            struct page_info *p;
            p = assign_new_domain_page(d, pinitrd_start + offset);
            if (p == NULL)
                panic("%s: can't allocate page for initrd image", __func__);
            if (initrd_len < offset + PAGE_SIZE)
                memcpy(page_to_virt(p), (void*)(initrd_start + offset),
                       initrd_len - offset);
            else
                copy_page(page_to_virt(p), (void*)(initrd_start + offset));
        }
    }

    printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
           " Kernel image:  %lx->%lx\n"
           " Entry address: %lx\n"
           " Init. ramdisk: %lx len %lx\n"
           " Start info.:   %lx->%lx\n",
           pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
           pstart_info, pstart_info + PAGE_SIZE);

    if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (pkern_end-pkern_start)>>20,
               (max_pages <<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    // if high 3 bits of pkern start are non-zero, error

    // if pkern end is after end of metaphysical memory, error
    // (we should be able to deal with this... later)

    /* Mask all upcalls... */
    for ( i = 1; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    if (dom0_max_vcpus == 0)
        dom0_max_vcpus = MAX_VIRT_CPUS;
    if (dom0_max_vcpus > num_online_cpus())
        dom0_max_vcpus = num_online_cpus();
    if (dom0_max_vcpus > MAX_VIRT_CPUS)
        dom0_max_vcpus = MAX_VIRT_CPUS;

    printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
    for ( i = 1; i < dom0_max_vcpus; i++ )
        if (alloc_vcpu(d, i, i) == NULL)
            printf ("Cannot allocate dom0 vcpu %d\n", i);

    /* Copy the OS image. */
    loaddomainelfimage(d, image_start);

    /* Copy the initial ramdisk. */
    //if ( initrd_len != 0 )
    //    memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
                 sizeof(struct ia64_boot_param) > PAGE_SIZE);

    /* Set up start info area. */
    d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
    start_info_page = assign_new_domain_page(d, pstart_info);
    if (start_info_page == NULL)
        panic("can't allocate start info page");
    si = page_to_virt(start_info_page);
    memset(si, 0, PAGE_SIZE);
    sprintf(si->magic, "xen-%i.%i-ia64",
            xen_major_version(), xen_minor_version());
    si->nr_pages = max_pages;
    si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;

    printk("Dom0: 0x%lx\n", (u64)dom0);

#ifdef VALIDATE_VT
    /* VMX specific construction for Dom0, if hardware supports VMX
     * and Dom0 is unmodified image
     */
    if (vmx_dom0)
        vmx_final_setup_guest(v);
#endif

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    /* Build firmware.
       Note: the Linux kernel reserves the memory used by start_info, so
       there is no need to remove it from the MDT.  */
    bp_mpa = pstart_info + sizeof(struct start_info);
    dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);

    /* Fill boot param.  */
    strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
    si->cmd_line[sizeof(si->cmd_line)-1] = 0;

    bp = (struct ia64_boot_param *)((unsigned char *)si +
                                    sizeof(start_info_t));
    bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);

    /* We assume the console has reached the last line!  */
    bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
    bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
    bp->console_info.orig_x = 0;
    bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
                              0 : bp->console_info.num_rows - 1;

    bp->initrd_start = dom0_size -
                       (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
    bp->initrd_size = ia64_boot_param->initrd_size;

    ci = (dom0_vga_console_info_t *)((unsigned char *)si +
                                     sizeof(start_info_t) +
                                     sizeof(struct ia64_boot_param));

    if (fill_console_start_info(ci)) {
        si->console.dom0.info_off = sizeof(start_info_t) +
                                    sizeof(struct ia64_boot_param);
        si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
    }

    vcpu_init_regs (v);

    vcpu_regs(v)->r28 = bp_mpa;

    vcpu_regs (v)->cr_iip = pkern_entry;

    physdev_init_dom0(d);

    // FIXME: Hack for keyboard input
    //serial_input_init();

    return 0;
}

void machine_restart(char * __unused)
{
    console_start_sync();
    if (running_on_sim)
        printf ("machine_restart called. spinning...\n");
    else
        (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
    while(1);
}

extern void cpu_halt(void);

void machine_halt(void)
{
    console_start_sync();
    if (running_on_sim)
        printf ("machine_halt called. spinning...\n");
    else
        cpu_halt();
    while(1);
}

void sync_vcpu_execstate(struct vcpu *v)
{
//  __ia64_save_fpu(v->arch._thread.fph);
//  if (VMX_DOMAIN(v))
//      vmx_save_state(v);
    // FIXME SMP: Anything else needed here for SMP?
}

static void parse_dom0_mem(char *s)
{
    dom0_size = parse_size_and_unit(s);
}
custom_param("dom0_mem", parse_dom0_mem);

static void parse_dom0_align(char *s)
{
    dom0_align = parse_size_and_unit(s);
}
custom_param("dom0_align", parse_dom0_align);