
view xen/arch/ia64/xen/domain.c @ 11351:e317ad162eba

[IA64] init the hlt timer on a valid cpu and migrate

Based on a patch from Anthony Xu: v->processor is not yet initialized
in alloc_vcpu_struct() when the hlt_timer is initialized, so init the
timer on the first online cpu and migrate it in schedule_tail().
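
The two halves of the fix, as they appear verbatim in the listing below:

    /* alloc_vcpu_struct(): v->processor is not valid yet */
    init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
               first_cpu(cpu_online_map));

    /* schedule_tail(): the vcpu is now running on a real cpu */
    migrate_timer(&current->arch.hlt_timer, current->processor);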

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author awilliam@xenbuild.aw
date Tue Aug 29 08:09:28 2006 -0600 (2006-08-29)
parents f74c9368f6ff
children 586c5fe8cf3e
line source
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * Copyright (C) 2005 Intel Co
 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
 *
 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
 *
 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                    VA Linux Systems Japan K.K.
 *                    dom0 vp model support
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/mm.h>
#include <xen/iocap.h>
#include <asm/asm-xsi-offsets.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/version.h>
#include <xen/elf.h>
#include <asm/pgalloc.h>
#include <asm/offsets.h>  /* for IA64_THREAD_INFO_SIZE */
#include <asm/vcpu.h>     /* for function declarations */
#include <public/xen.h>
#include <xen/domain.h>
#include <asm/vmx.h>
#include <asm/vmx_vcpu.h>
#include <asm/vmx_vpd.h>
#include <asm/vmx_phy_mode.h>
#include <asm/vhpt.h>
#include <asm/tlbflush.h>
#include <asm/regionreg.h>
#include <asm/dom_fw.h>
#include <asm/shadow.h>

unsigned long dom0_size = 512*1024*1024;
unsigned long dom0_align = 64*1024*1024;

/* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
static unsigned int dom0_max_vcpus = 1;
integer_param("dom0_max_vcpus", dom0_max_vcpus);

extern unsigned long running_on_sim;

extern char dom0_command_line[];

/* FIXME: where should these declarations live? */
extern void serial_input_init(void);
static void init_switch_stack(struct vcpu *v);
extern void vmx_do_launch(struct vcpu *);

/* this belongs in include/asm, but there doesn't seem to be a suitable place */
extern struct vcpu *ia64_switch_to (struct vcpu *next_task);

/* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
   This is a Xen virtual address. */
DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
DEFINE_PER_CPU(int *, current_psr_ic_addr);

#include <xen/sched-if.h>
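
/*
 * flush_vtlb_for_context_switch() tracks, per physical cpu, which vcpu of
 * which domain last ran there.  The VHPT and local TLB are flushed when a
 * different vcpu of the same domain is scheduled onto this cpu, or when this
 * vcpu last ran on some other cpu, so that stale guest translations cannot
 * be reused.  VT-i domains keep a per-vcpu vTLB and skip the VHPT flush.
 */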
static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
{
    int cpu = smp_processor_id();
    int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
    int last_processor = vcpu->arch.last_processor;

    if (is_idle_domain(vcpu->domain))
        return;

    vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
    vcpu->arch.last_processor = cpu;

    if ((last_vcpu_id != vcpu->vcpu_id &&
         last_vcpu_id != INVALID_VCPU_ID) ||
        (last_vcpu_id == vcpu->vcpu_id &&
         last_processor != cpu &&
         last_processor != INVALID_PROCESSOR)) {

        // if the vTLB implementation is changed,
        // the following must be updated as well.
        if (VMX_DOMAIN(vcpu)) {
            // currently the vTLB for a VT-i domain is per vcpu,
            // so no flushing is needed.
        } else {
            vhpt_flush();
        }
        local_flush_tlb_all();
    }
}
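
/*
 * schedule_tail() finishes a context switch on the incoming vcpu: a VT-i
 * domain is handed to vmx_do_launch(), otherwise Xen's IVT and PTA are
 * installed, the region and kernel registers reloaded, and the per-cpu
 * psr.i/psr.ic shortcuts repointed at the new vcpu's shared areas.  In both
 * cases the per-vcpu timer is migrated to the cpu the vcpu now runs on.
 */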
void schedule_tail(struct vcpu *prev)
{
    extern char ia64_ivt;
    context_saved(prev);

    if (VMX_DOMAIN(current)) {
        vmx_do_launch(current);
        migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
                      current->processor);
    } else {
        ia64_set_iva(&ia64_ivt);
        ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
                     VHPT_ENABLED);
        load_region_regs(current);
        vcpu_load_kernel_regs(current);
        __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
            shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
        __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
            (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
        migrate_timer(&current->arch.hlt_timer, current->processor);
    }
    flush_vtlb_for_context_switch(current);
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    uint64_t spsr;
    uint64_t pta;

    local_irq_save(spsr);

    __ia64_save_fpu(prev->arch._thread.fph);
    __ia64_load_fpu(next->arch._thread.fph);
    if (VMX_DOMAIN(prev)) {
        vmx_save_state(prev);
        if (!VMX_DOMAIN(next)) {
            /* VMX domains can change the physical cr.dcr.
             * Restore default to prevent leakage. */
            ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
                        | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
                        | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
        }
    }
    if (VMX_DOMAIN(next))
        vmx_load_state(next);
    /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
    prev = ia64_switch_to(next);

    /* Note: ia64_switch_to does not return here at vcpu initialization. */

    //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);

    if (VMX_DOMAIN(current)) {
        vmx_load_all_rr(current);
        migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
                      current->processor);
    } else {
        struct domain *nd;
        extern char ia64_ivt;

        ia64_set_iva(&ia64_ivt);

        nd = current->domain;
        if (!is_idle_domain(nd)) {
            ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
                         VHPT_ENABLED);
            load_region_regs(current);
            vcpu_load_kernel_regs(current);
            vcpu_set_next_timer(current);
            if (vcpu_timer_expired(current))
                vcpu_pend_timer(current);
            __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
                vcpu_info[current->vcpu_id].evtchn_upcall_mask;
            __ia64_per_cpu_var(current_psr_ic_addr) =
                (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
        } else {
            /* When switching to the idle domain we only need to disable
             * the vhpt walker.  All accesses made within the idle context
             * are then handled by TR mappings and the identity mapping.
             */
            pta = ia64_get_pta();
            ia64_set_pta(pta & ~VHPT_ENABLED);
            __ia64_per_cpu_var(current_psr_i_addr) = NULL;
            __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
        }
    }
    flush_vtlb_for_context_switch(current);
    local_irq_restore(spsr);
    context_saved(prev);
}

void continue_running(struct vcpu *same)
{
    /* nothing to do */
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    local_irq_enable();
}

static void continue_cpu_idle_loop(void)
{
    for ( ; ; )
    {
#ifdef IA64
        // __IRQ_STAT(cpu, idle_timestamp) = jiffies
#else
        irq_stat[cpu].idle_timestamp = jiffies;
#endif
        while ( !softirq_pending(smp_processor_id()) )
            default_idle();
        raise_softirq(SCHEDULE_SOFTIRQ);
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    /* Just some sanity to ensure that the scheduler is set up okay. */
    ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
    raise_softirq(SCHEDULE_SOFTIRQ);

    continue_cpu_idle_loop();
}

/* compile time test for get_order(sizeof(mapped_regs_t)) !=
 * get_order_from_shift(XMAPPEDREGS_SHIFT)
 */
#if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
      (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
# error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
#endif
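
/*
 * Handler for the per-vcpu hlt_timer: when the timer fires it simply
 * unblocks the vcpu, waking a guest vcpu that blocked itself by halting.
 */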
void hlt_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_unblock(v);
}
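
/*
 * Note that the hlt_timer below is initialized on first_cpu(cpu_online_map)
 * rather than on v->processor: v->processor is not yet valid at this point
 * (see the changeset description above), so the timer is armed on a cpu
 * known to be valid and later migrated in schedule_tail().
 */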
struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;
    struct thread_info *ti;

    /* Keep idle vcpu0 statically allocated at compile time, because some
     * code inherited from Linux still requires it during the early phase.
     */
    if (is_idle_domain(d) && !vcpu_id)
        v = idle_vcpu[0];
    else {
        if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
            return NULL;
        memset(v, 0, sizeof(*v));

        ti = alloc_thread_info(v);
        /* Clear thread_info to clear some important fields, like
         * preempt_count.
         */
        memset(ti, 0, sizeof(struct thread_info));
        init_switch_stack(v);
    }

    if (!is_idle_domain(d)) {
        if (!d->arch.is_vti) {
            int order;
            int i;

            /* Create privregs page only if not VTi. */
            order = get_order_from_shift(XMAPPEDREGS_SHIFT);
            v->arch.privregs = alloc_xenheap_pages(order);
            BUG_ON(v->arch.privregs == NULL);
            memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
            for (i = 0; i < (1 << order); i++)
                share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
                                          i, d, XENSHARE_writable);
        }

        v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
        v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;

        /* Is this correct?  It depends on the domain's rid usage.

           A domain may share rids among its processors (eg when it has a
           global VHPT).  In that case we should also share rids among
           vcpus and the rid range should be the same.

           However a domain may have per-cpu rid allocation.  In that case
           we don't want to share rids among vcpus, but we may do it if
           two vcpus are on the same cpu... */

        v->arch.starting_rid = d->arch.starting_rid;
        v->arch.ending_rid = d->arch.ending_rid;
        v->arch.breakimm = d->arch.breakimm;
        v->arch.last_processor = INVALID_PROCESSOR;
    }
    if (!VMX_DOMAIN(v)) {
        init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
                   first_cpu(cpu_online_map));
    }

    return v;
}

void relinquish_vcpu_resources(struct vcpu *v)
{
    if (v->arch.privregs != NULL) {
        free_xenheap_pages(v->arch.privregs,
                           get_order_from_shift(XMAPPEDREGS_SHIFT));
        v->arch.privregs = NULL;
    }
    kill_timer(&v->arch.hlt_timer);
}

void free_vcpu_struct(struct vcpu *v)
{
    if (VMX_DOMAIN(v))
        vmx_relinquish_vcpu_resources(v);
    else
        relinquish_vcpu_resources(v);

    free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
}

static void init_switch_stack(struct vcpu *v)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct switch_stack *sw = (struct switch_stack *) regs - 1;
    extern void ia64_ret_from_clone;

    memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
    sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
    sw->b0 = (unsigned long) &ia64_ret_from_clone;
    sw->ar_fpsr = FPSR_DEFAULT;
    v->arch._thread.ksp = (unsigned long) sw - 16;
    // Stay on the kernel stack because we may get interrupts!
    // ia64_ret_from_clone switches to the user stack.
    v->arch._thread.on_ustack = 0;
    memset(v->arch._thread.fph, 0, sizeof(struct ia64_fpreg) * 96);
}

int arch_domain_create(struct domain *d)
{
    int i;

    // the following will eventually need to be negotiated dynamically
    d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
    d->arch.breakimm = 0x1000;
    for (i = 0; i < NR_CPUS; i++) {
        d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
    }

    if (is_idle_domain(d))
        return 0;

    d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
    if (d->shared_info == NULL)
        goto fail_nomem;
    memset(d->shared_info, 0, XSI_SIZE);
    for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
        share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
                                  d, XENSHARE_writable);

    d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
    /* We may also need an emulation rid for region4, though it is unlikely
     * that a guest will issue uncacheable accesses in metaphysical mode.
     * Still, keeping such info here is probably saner.
     */
    if (!allocate_rid_range(d,0))
        goto fail_nomem;

    memset(&d->arch.mm, 0, sizeof(d->arch.mm));

    if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
        goto fail_nomem;

    d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
                                       RANGESETF_prettyprint_hex);

    printf ("arch_domain_create: domain=%p\n", d);
    return 0;

fail_nomem:
    if (d->arch.mm.pgd != NULL)
        pgd_free(d->arch.mm.pgd);
    if (d->shared_info != NULL)
        free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    BUG_ON(d->arch.mm.pgd != NULL);
    if (d->shared_info != NULL) {
        /* If this domain is a domVTi, the shared_info page may have been
         * replaced with a domheap page.  In that case the shared_info
         * page is freed in relinquish_mm().
         */
        if (IS_XEN_HEAP_FRAME(virt_to_page(d->shared_info))) {
            free_xenheap_pages(d->shared_info,
                               get_order_from_shift(XSI_SHIFT));
        }
    }
    if (d->arch.shadow_bitmap != NULL)
        xfree(d->arch.shadow_bitmap);

    /* Clear vTLB for the next domain. */
    domain_flush_tlb_vhpt(d);

    deallocate_rid_range(d);
}

void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
{
    int i;
    struct vcpu_extra_regs *er = &c->extra_regs;

    c->user_regs = *vcpu_regs (v);
    c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;

    /* Fill extra regs. */
    for (i = 0; i < 8; i++) {
        er->itrs[i].pte = v->arch.itrs[i].pte.val;
        er->itrs[i].itir = v->arch.itrs[i].itir;
        er->itrs[i].vadr = v->arch.itrs[i].vadr;
        er->itrs[i].rid = v->arch.itrs[i].rid;
    }
    for (i = 0; i < 8; i++) {
        er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
        er->dtrs[i].itir = v->arch.dtrs[i].itir;
        er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
        er->dtrs[i].rid = v->arch.dtrs[i].rid;
    }
    er->event_callback_ip = v->arch.event_callback_ip;
    er->dcr = v->arch.dcr;
    er->iva = v->arch.iva;
}

int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct domain *d = v->domain;

    *regs = c->user_regs;

    if (!d->arch.is_vti) {
        /* domain runs at PL2/3 */
        regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
        regs->ar_rsc |= (2 << 2); /* force PL2/3 */
    }

    if (c->flags & VGCF_EXTRA_REGS) {
        int i;
        struct vcpu_extra_regs *er = &c->extra_regs;

        for (i = 0; i < 8; i++) {
            vcpu_set_itr(v, i, er->itrs[i].pte,
                         er->itrs[i].itir,
                         er->itrs[i].vadr,
                         er->itrs[i].rid);
        }
        for (i = 0; i < 8; i++) {
            vcpu_set_dtr(v, i,
                         er->dtrs[i].pte,
                         er->dtrs[i].itir,
                         er->dtrs[i].vadr,
                         er->dtrs[i].rid);
        }
        v->arch.event_callback_ip = er->event_callback_ip;
        v->arch.dcr = er->dcr;
        v->arch.iva = er->iva;
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;
    if (d->arch.is_vti)
        vmx_final_setup_guest(v);

    /* This overrides some registers. */
    vcpu_init_regs(v);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);
    return 0;
}
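
/*
 * relinquish_memory() walks a page list at domain-destruction time, dropping
 * the type and allocation references on each page; the page itself is freed
 * once its last reference goes away.  The forcible page-table invalidation
 * below is x86-only and is compiled out on ia64.
 */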
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
#ifndef __ia64__
    unsigned long x, y;
#endif

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);
    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);
        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

#ifndef __ia64__
        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }
#endif

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    /* Relinquish every page of memory. */

    // release pages by traversing d->arch.mm.
    relinquish_mm(d);

    if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
        vmx_relinquish_guest_resources(d);

    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);

    if (d->arch.is_vti && d->arch.sal_data)
        xfree(d->arch.sal_data);
}

void build_physmap_table(struct domain *d)
{
    struct list_head *list_ent = d->page_list.next;
    unsigned long mfn, i = 0;

    while (list_ent != &d->page_list) {
        mfn = page_to_mfn(list_entry(
            list_ent, struct page_info, list));
        assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);

        i++;
        list_ent = mfn_to_page(mfn)->list.next;
    }
}
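
/*
 * domain_set_shared_info_va() records the guest-chosen location of the
 * shared mapping: it validates the region-7, 64Kb-aligned address, repoints
 * each vcpu's interrupt_mask_addr and the per-cpu psr.ic shortcut at the new
 * location, and reloads rr7 so the remapped shared pages become visible.
 */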
unsigned long
domain_set_shared_info_va (unsigned long va)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    struct vcpu *v1;

    /* Check virtual address:
       must belong to region 7,
       must be 64Kb aligned,
       must not be within Xen virtual space. */
    if ((va >> 61) != 7
        || (va & 0xffffUL) != 0
        || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
        panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);

    /* Note: this doesn't work well if other cpus are already running.
       However this is part of the spec :-) */
    printf ("Domain set shared_info_va to 0x%016lx\n", va);
    d->arch.shared_info_va = va;

    for_each_vcpu (d, v1) {
        VCPU(v1, interrupt_mask_addr) =
            (unsigned char *)va + INT_ENABLE_OFFSET(v1);
    }

    __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);

    /* Remap the shared pages. */
    set_one_rr (7UL << 61, PSCB(v,rrs[7]));

    return 0;
}

/* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
#define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
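
/*
 * shadow_mode_control() implements the XEN_DOMCTL_SHADOW_OP_* operations for
 * log-dirty tracking on ia64: OFF frees the dirty bitmap and flushes the
 * vhpt/tlb, ENABLE_LOGDIRTY allocates a bitmap sized to d->max_pages, and
 * CLEAN/PEEK copy the bitmap out to the caller (CLEAN also clears it as it
 * goes, in SHADOW_COPY_CHUNK-sized pieces).  The domain is paused for the
 * duration of the operation.
 */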
int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
{
    unsigned int op = sc->op;
    int rc = 0;
    int i;
    //struct vcpu *v;

    if (unlikely(d == current->domain)) {
        DPRINTK("Don't try to do a shadow op on yourself!\n");
        return -EINVAL;
    }

    domain_pause(d);

    switch (op)
    {
    case XEN_DOMCTL_SHADOW_OP_OFF:
        if (shadow_mode_enabled (d)) {
            u64 *bm = d->arch.shadow_bitmap;

            /* Flush vhpt and tlb to restore dirty bit usage. */
            domain_flush_tlb_vhpt(d);

            /* Free bitmap. */
            d->arch.shadow_bitmap_size = 0;
            d->arch.shadow_bitmap = NULL;
            xfree(bm);
        }
        break;

    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
    case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
        rc = -EINVAL;
        break;

    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
        if (shadow_mode_enabled(d)) {
            rc = -EINVAL;
            break;
        }

        atomic64_set(&d->arch.shadow_fault_count, 0);
        atomic64_set(&d->arch.shadow_dirty_count, 0);

        d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
                                     ~(BITS_PER_LONG-1);
        d->arch.shadow_bitmap = xmalloc_array(unsigned long,
                            d->arch.shadow_bitmap_size / BITS_PER_LONG);
        if (d->arch.shadow_bitmap == NULL) {
            d->arch.shadow_bitmap_size = 0;
            rc = -ENOMEM;
        }
        else {
            memset(d->arch.shadow_bitmap, 0,
                   d->arch.shadow_bitmap_size / 8);

            /* Flush vhpt and tlb to enable dirty bit
               virtualization. */
            domain_flush_tlb_vhpt(d);
        }
        break;

    case XEN_DOMCTL_SHADOW_OP_CLEAN:
    {
        int nbr_longs;

        sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
        sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);

        atomic64_set(&d->arch.shadow_fault_count, 0);
        atomic64_set(&d->arch.shadow_dirty_count, 0);

        if (guest_handle_is_null(sc->dirty_bitmap) ||
            (d->arch.shadow_bitmap == NULL)) {
            rc = -EINVAL;
            break;
        }

        if (sc->pages > d->arch.shadow_bitmap_size)
            sc->pages = d->arch.shadow_bitmap_size;

        nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;

        for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
            int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
                       SHADOW_COPY_CHUNK : nbr_longs - i;

            if (copy_to_guest_offset(sc->dirty_bitmap, i,
                                     d->arch.shadow_bitmap + i,
                                     size)) {
                rc = -EFAULT;
                break;
            }

            memset(d->arch.shadow_bitmap + i,
                   0, size * sizeof(unsigned long));
        }

        break;
    }

    case XEN_DOMCTL_SHADOW_OP_PEEK:
    {
        unsigned long size;

        sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
        sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);

        if (guest_handle_is_null(sc->dirty_bitmap) ||
            (d->arch.shadow_bitmap == NULL)) {
            rc = -EINVAL;
            break;
        }

        if (sc->pages > d->arch.shadow_bitmap_size)
            sc->pages = d->arch.shadow_bitmap_size;

        size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
        if (copy_to_guest(sc->dirty_bitmap,
                          d->arch.shadow_bitmap, size)) {
            rc = -EFAULT;
            break;
        }
        break;
    }
    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
        sc->mb = 0;
        break;
    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
        if (sc->mb > 0) {
            BUG();
            rc = -ENOMEM;
        }
        break;
    default:
        rc = -EINVAL;
        break;
    }

    domain_unpause(d);

    return rc;
}

// remove following line if not privifying in memory
//#define HAVE_PRIVIFY_MEMORY
#ifndef HAVE_PRIVIFY_MEMORY
#define privify_memory(x,y) do {} while(0)
#endif

// see arch/x86/xxx/domain_build.c
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if (!(IS_ELF(*ehdr)))
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }
    return 1;
}
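
/*
 * loaddomainelfimage() copies the dom0 kernel into the domain page by page:
 * for each PT_LOAD segment it allocates a metaphysical page with
 * assign_new_domain_page(), copies (or zeroes) up to PAGE_SIZE of the
 * segment into it, and flushes the icache for executable pages.
 */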
static void loaddomainelfimage(struct domain *d, unsigned long image_start)
{
    char *elfbase = (char *) image_start;
    Elf_Ehdr ehdr;
    Elf_Phdr phdr;
    int h, filesz, memsz;
    unsigned long elfaddr, dom_mpaddr, dom_imva;
    struct page_info *p;

    memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
    for ( h = 0; h < ehdr.e_phnum; h++ ) {
        memcpy(&phdr,
               elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
               sizeof(Elf_Phdr));
        if ((phdr.p_type != PT_LOAD))
            continue;

        filesz = phdr.p_filesz;
        memsz = phdr.p_memsz;
        elfaddr = (unsigned long) elfbase + phdr.p_offset;
        dom_mpaddr = phdr.p_paddr;

        while (memsz > 0) {
            p = assign_new_domain_page(d,dom_mpaddr);
            BUG_ON (unlikely(p == NULL));
            dom_imva = __va_ul(page_to_maddr(p));
            if (filesz > 0) {
                if (filesz >= PAGE_SIZE)
                    memcpy((void *) dom_imva,
                           (void *) elfaddr,
                           PAGE_SIZE);
                else {
                    // copy partial page
                    memcpy((void *) dom_imva,
                           (void *) elfaddr, filesz);
                    // zero the rest of page
                    memset((void *) dom_imva+filesz, 0,
                           PAGE_SIZE-filesz);
                }
                //FIXME: This test for code seems to find a lot more than objdump -x does
                if (phdr.p_flags & PF_X) {
                    privify_memory(dom_imva,PAGE_SIZE);
                    flush_icache_range(dom_imva,
                                       dom_imva+PAGE_SIZE);
                }
            }
            else if (memsz > 0) {
                /* always zero out entire page */
                memset((void *) dom_imva, 0, PAGE_SIZE);
            }
            memsz -= PAGE_SIZE;
            filesz -= PAGE_SIZE;
            elfaddr += PAGE_SIZE;
            dom_mpaddr += PAGE_SIZE;
        }
    }
}

void alloc_dom0(void)
{
    /* Check dom0 size. */
    if (dom0_size < 4 * 1024 * 1024) {
        panic("dom0_mem is too small, boot aborted"
              " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
    }

    /* Check dom0 align. */
    if ((dom0_align - 1) & dom0_align) { /* not a power of two */
        panic("dom0_align (%lx) must be power of two, boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              dom0_align);
    }
    if (dom0_align < PAGE_SIZE) {
        panic("dom0_align must be >= %ld, boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              PAGE_SIZE);
    }
    if (dom0_size % dom0_align) {
        dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
        printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
               dom0_size,dom0_align);
    }

    if (running_on_sim) {
        dom0_size = 128*1024*1024; //FIXME: Should be configurable
    }

    /* no need to allocate pages for now
     * pages are allocated by map_new_domain_page() via loaddomainelfimage()
     */
}

/*
 * Domain 0 has direct access to all devices absolutely.  However, the major
 * point of this stub is to allow alloc_dom_mem to handle order > 0 requests.
 * Dom0 requires that bit set to allocate memory for other domains.
 */
static void physdev_init_dom0(struct domain *d)
{
    if (iomem_permit_access(d, 0UL, ~0UL))
        BUG();
    if (irqs_permit_access(d, 0, NR_IRQS-1))
        BUG();
    if (ioports_permit_access(d, 0, 0xffff))
        BUG();
}
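
/*
 * construct_dom0() builds the initial domain: it parses the ELF image,
 * places the initrd near the top of dom0 memory and the start_info page just
 * past the kernel, allocates the secondary vcpus, loads the kernel, fills in
 * start_info / ia64_boot_param / console info, sets up the domain firmware
 * via dom_fw_setup(), and finally points the boot vcpu at the kernel entry.
 */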
int construct_dom0(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc;
    start_info_t *si;
    dom0_vga_console_info_t *ci;
    struct vcpu *v = d->vcpu[0];
    unsigned long max_pages;

    struct domain_setup_info dsi;
    unsigned long p_start;
    unsigned long pkern_start;
    unsigned long pkern_entry;
    unsigned long pkern_end;
    unsigned long pinitrd_start = 0;
    unsigned long pstart_info;
    struct page_info *start_info_page;
    unsigned long bp_mpa;
    struct ia64_boot_param *bp;

#ifdef VALIDATE_VT
    unsigned int vmx_dom0 = 0;
    unsigned long mfn;
    struct page_info *page = NULL;
#endif

    //printf("construct_dom0: starting\n");

    /* Sanity! */
    BUG_ON(d != dom0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    max_pages = dom0_size / PAGE_SIZE;
    d->max_pages = max_pages;
    d->tot_pages = 0;
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;
    rc = parseelfimage(&dsi);
    if ( rc != 0 )
        return rc;

#ifdef VALIDATE_VT
    /* Temp workaround */
    if (running_on_sim)
        dsi.xen_section_string = (char *)1;

    /* Check whether dom0 is vti domain */
    if ((!vmx_enabled) && !dsi.xen_section_string) {
        printk("Lack of hardware support for unmodified vmx dom0\n");
        panic("");
    }

    if (vmx_enabled && !dsi.xen_section_string) {
        printk("Dom0 is vmx domain!\n");
        vmx_dom0 = 1;
    }
#endif

    p_start = dsi.v_start;
    pkern_start = dsi.v_kernstart;
    pkern_end = dsi.v_kernend;
    pkern_entry = dsi.v_kernentry;

    //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);

    if ( (p_start & (PAGE_SIZE-1)) != 0 )
    {
        printk("Initial guest OS must load to a page boundary.\n");
        return -EINVAL;
    }

    pstart_info = PAGE_ALIGN(pkern_end);
    if (initrd_start && initrd_len) {
        unsigned long offset;

        pinitrd_start = dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
        if (pinitrd_start <= pstart_info)
            panic("%s:enough memory is not assigned to dom0", __func__);

        for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
            struct page_info *p;
            p = assign_new_domain_page(d, pinitrd_start + offset);
            if (p == NULL)
                panic("%s: can't allocate page for initrd image", __func__);
            if (initrd_len < offset + PAGE_SIZE)
                memcpy(page_to_virt(p), (void*)(initrd_start + offset),
                       initrd_len - offset);
            else
                copy_page(page_to_virt(p), (void*)(initrd_start + offset));
        }
    }
997 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
998 " Kernel image: %lx->%lx\n"
999 " Entry address: %lx\n"
1000 " Init. ramdisk: %lx len %lx\n"
1001 " Start info.: %lx->%lx\n",
1002 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1003 pstart_info, pstart_info + PAGE_SIZE);
1005 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1007 printk("Initial guest OS requires too much space\n"
1008 "(%luMB is greater than %luMB limit)\n",
1009 (pkern_end-pkern_start)>>20,
1010 (max_pages <<PAGE_SHIFT)>>20);
1011 return -ENOMEM;
1014 // if high 3 bits of pkern start are non-zero, error
1016 // if pkern end is after end of metaphysical memory, error
1017 // (we should be able to deal with this... later)
1019 /* Mask all upcalls... */
1020 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1021 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1023 if (dom0_max_vcpus == 0)
1024 dom0_max_vcpus = MAX_VIRT_CPUS;
1025 if (dom0_max_vcpus > num_online_cpus())
1026 dom0_max_vcpus = num_online_cpus();
1027 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1028 dom0_max_vcpus = MAX_VIRT_CPUS;
1030 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1031 for ( i = 1; i < dom0_max_vcpus; i++ )
1032 if (alloc_vcpu(d, i, i) == NULL)
1033 printf ("Cannot allocate dom0 vcpu %d\n", i);
1035 /* Copy the OS image. */
1036 loaddomainelfimage(d,image_start);
1038 /* Copy the initial ramdisk. */
1039 //if ( initrd_len != 0 )
1040 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1042 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1043 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1045 /* Set up start info area. */
1046 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1047 start_info_page = assign_new_domain_page(d, pstart_info);
1048 if (start_info_page == NULL)
1049 panic("can't allocate start info page");
1050 si = page_to_virt(start_info_page);
1051 memset(si, 0, PAGE_SIZE);
1052 sprintf(si->magic, "xen-%i.%i-ia64",
1053 xen_major_version(), xen_minor_version());
1054 si->nr_pages = max_pages;
1055 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1057 printk("Dom0: 0x%lx\n", (u64)dom0);
1059 #ifdef VALIDATE_VT
1060 /* VMX specific construction for Dom0, if hardware supports VMX
1061 * and Dom0 is unmodified image
1062 */
1063 if (vmx_dom0)
1064 vmx_final_setup_guest(v);
1065 #endif
1067 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1069 /* Build firmware.
1070 Note: Linux kernel reserve memory used by start_info, so there is
1071 no need to remove it from MDT. */
1072 bp_mpa = pstart_info + sizeof(struct start_info);
1073 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1075 /* Fill boot param. */
1076 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1077 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1079 bp = (struct ia64_boot_param *)((unsigned char *)si +
1080 sizeof(start_info_t));
1081 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1083 /* We assume console has reached the last line! */
1084 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1085 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1086 bp->console_info.orig_x = 0;
1087 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1088 0 : bp->console_info.num_rows - 1;
1090 bp->initrd_start = dom0_size -
1091 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1092 bp->initrd_size = ia64_boot_param->initrd_size;
1094 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1095 sizeof(start_info_t) +
1096 sizeof(struct ia64_boot_param));
1098 if (fill_console_start_info(ci)) {
1099 si->console.dom0.info_off = sizeof(start_info_t) +
1100 sizeof(struct ia64_boot_param);
1101 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1104 vcpu_init_regs (v);
1106 vcpu_regs(v)->r28 = bp_mpa;
1108 vcpu_regs (v)->cr_iip = pkern_entry;
1110 physdev_init_dom0(d);
1112 // FIXME: Hack for keyboard input
1113 //serial_input_init();
1115 return 0;

void machine_restart(char * __unused)
{
    console_start_sync();
    if (running_on_sim)
        printf ("machine_restart called. spinning...\n");
    else
        (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
    while(1);
}

extern void cpu_halt(void);

void machine_halt(void)
{
    console_start_sync();
    if (running_on_sim)
        printf ("machine_halt called. spinning...\n");
    else
        cpu_halt();
    while(1);
}

void sync_vcpu_execstate(struct vcpu *v)
{
//    __ia64_save_fpu(v->arch._thread.fph);
//    if (VMX_DOMAIN(v))
//        vmx_save_state(v);
    // FIXME SMP: Anything else needed here for SMP?
}

static void parse_dom0_mem(char *s)
{
    dom0_size = parse_size_and_unit(s);
}
custom_param("dom0_mem", parse_dom0_mem);

static void parse_dom0_align(char *s)
{
    dom0_align = parse_size_and_unit(s);
}
custom_param("dom0_align", parse_dom0_align);