ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 10692:306d7857928c

[IA64] Save & restore.

xc_ia64_linux_save.c and xc_ia64_linux_restore.c added.
The vcpu context now carries more registers and state (e.g. the TR registers).
Per-CPU IRQs are deallocated when a CPU is switched off.
#if/#endif guards added in reboot.c for ia64.

Signed-off-by: Tristan Gingold <tristan.gingold@bull.net>
author awilliam@xenbuild.aw
date Tue Jul 11 12:51:18 2006 -0600 (2006-07-11)
parents d7a511069a32
children b2abc70be89e

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * Copyright (C) 2005 Intel Co
 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
 *
 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
 *
 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                    VA Linux Systems Japan K.K.
 *                    dom0 vp model support
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/mm.h>
#include <xen/iocap.h>
#include <asm/asm-xsi-offsets.h>
#include <asm/ptrace.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
#include <asm/setup.h>
//#include <asm/mpspec.h>
#include <xen/irq.h>
#include <xen/event.h>
//#include <xen/shadow.h>
#include <xen/console.h>
#include <xen/compile.h>

#include <xen/elf.h>
//#include <asm/page.h>
#include <asm/pgalloc.h>

#include <asm/offsets.h>  /* for IA64_THREAD_INFO_SIZE */

#include <asm/vcpu.h>     /* for function declarations */
#include <public/arch-ia64.h>
#include <xen/domain.h>
#include <asm/vmx.h>
#include <asm/vmx_vcpu.h>
#include <asm/vmx_vpd.h>
#include <asm/vmx_phy_mode.h>
#include <asm/pal.h>
#include <asm/vhpt.h>
#include <public/hvm/ioreq.h>
#include <public/arch-ia64.h>
#include <asm/tlbflush.h>
#include <asm/regionreg.h>
#include <asm/dom_fw.h>
#include <asm/privop_stat.h>

#ifndef CONFIG_XEN_IA64_DOM0_VP
#define CONFIG_DOMAIN0_CONTIGUOUS
#endif
unsigned long dom0_start = -1L;
unsigned long dom0_size = 512*1024*1024;
unsigned long dom0_align = 64*1024*1024;

/* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
static unsigned int dom0_max_vcpus = 1;
integer_param("dom0_max_vcpus", dom0_max_vcpus);

extern unsigned long running_on_sim;

extern char dom0_command_line[];

/* FIXME: where should these declarations live? */
extern void serial_input_init(void);
static void init_switch_stack(struct vcpu *v);
extern void vmx_do_launch(struct vcpu *);

/* This belongs in include/asm, but there doesn't seem to be a suitable place. */
extern struct vcpu *ia64_switch_to (struct vcpu *next_task);

/* Address of vpsr.i (in fact evtchn_upcall_mask) of the current vcpu.
   This is a Xen virtual address.  */
DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
DEFINE_PER_CPU(int *, current_psr_ic_addr);

#include <xen/sched-if.h>
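
/* Flush stale translations when a vCPU is (re)scheduled on a physical CPU.
 * If a different vCPU last ran on this CPU, or this vCPU last ran on another
 * processor, the per-CPU VHPT and TLB may still hold entries for another
 * address space: the VHPT is flushed (skipped for VT-i domains, whose vTLB is
 * per-vCPU) and the local TLB is flushed. */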
static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
{
    int cpu = smp_processor_id();
    int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
    int last_processor = vcpu->arch.last_processor;

    if (is_idle_domain(vcpu->domain))
        return;

    vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
    vcpu->arch.last_processor = cpu;

    if ((last_vcpu_id != vcpu->vcpu_id &&
         last_vcpu_id != INVALID_VCPU_ID) ||
        (last_vcpu_id == vcpu->vcpu_id &&
         last_processor != cpu &&
         last_processor != INVALID_PROCESSOR)) {

        // if the vTLB implementation is changed,
        // the following must be updated as well.
        if (VMX_DOMAIN(vcpu)) {
            // currently the vTLB for a VT-i domain is per vcpu,
            // so no flushing is needed.
        } else {
            vhpt_flush();
        }
        local_flush_tlb_all();
    }
}
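
/* Completion of a context switch for a vCPU that runs for the first time
 * (ia64_switch_to does not return into context_switch in that case): mark the
 * previous vCPU as saved, then either launch the VT-i domain or install the
 * Xen IVT, enable the VHPT walker, load the region and kernel registers, and
 * cache the per-CPU pointers to the guest's psr.i/psr.ic locations. */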
void schedule_tail(struct vcpu *prev)
{
    extern char ia64_ivt;
    context_saved(prev);

    if (VMX_DOMAIN(current)) {
        vmx_do_launch(current);
    } else {
        ia64_set_iva(&ia64_ivt);
        ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
                     VHPT_ENABLED);
        load_region_regs(current);
        vcpu_load_kernel_regs(current);
        __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
            shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
        __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
            (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
    }
    flush_vtlb_for_context_switch(current);
}
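
/* Switch the physical CPU from 'prev' to 'next': save/restore the FPU high
 * partition and (for VT-i domains) the VMX state, switch stacks via
 * ia64_switch_to, then reload the MMU context (region registers, VHPT,
 * timer, PSR pointers) for the incoming vCPU, or simply disable the VHPT
 * walker when switching to the idle domain. */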
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    uint64_t spsr;
    uint64_t pta;

    local_irq_save(spsr);
    context_switch_count++;

    __ia64_save_fpu(prev->arch._thread.fph);
    __ia64_load_fpu(next->arch._thread.fph);
    if (VMX_DOMAIN(prev))
        vmx_save_state(prev);
    if (VMX_DOMAIN(next))
        vmx_load_state(next);
    /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
    prev = ia64_switch_to(next);

    /* Note: ia64_switch_to does not return here at vcpu initialization. */

    //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);

    // leave this debug for now: it acts as a heartbeat when more than
    // one domain is active
    {
        static long cnt[16] = { 50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50};
        static int i = 100;
        int id = ((struct vcpu *)current)->domain->domain_id & 0xf;
        if (!cnt[id]--) { cnt[id] = 500000; printk("%x",id); }
        if (!i--) { i = 1000000; printk("+"); }
    }

    if (VMX_DOMAIN(current)) {
        vmx_load_all_rr(current);
    } else {
        struct domain *nd;
        extern char ia64_ivt;

        ia64_set_iva(&ia64_ivt);

        nd = current->domain;
        if (!is_idle_domain(nd)) {
            ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
                         VHPT_ENABLED);
            load_region_regs(current);
            vcpu_load_kernel_regs(current);
            vcpu_set_next_timer(current);
            if (vcpu_timer_expired(current))
                vcpu_pend_timer(current);
            __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
                vcpu_info[current->vcpu_id].evtchn_upcall_mask;
            __ia64_per_cpu_var(current_psr_ic_addr) =
                (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
        } else {
            /* When switching to the idle domain, we only need to disable
             * the VHPT walker.  All accesses made in the idle context are
             * then handled by the TR mappings and the identity mapping.
             */
            pta = ia64_get_pta();
            ia64_set_pta(pta & ~VHPT_ENABLED);
            __ia64_per_cpu_var(current_psr_i_addr) = NULL;
            __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
        }
    }
    flush_vtlb_for_context_switch(current);
    local_irq_restore(spsr);
    context_saved(prev);
}

void continue_running(struct vcpu *same)
{
    /* nothing to do */
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    local_irq_enable();
}

static void continue_cpu_idle_loop(void)
{
    for ( ; ; )
    {
#ifdef IA64
//      __IRQ_STAT(cpu, idle_timestamp) = jiffies
#else
        irq_stat[cpu].idle_timestamp = jiffies;
#endif
        while ( !softirq_pending(smp_processor_id()) )
            default_idle();
        raise_softirq(SCHEDULE_SOFTIRQ);
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    /* Just some sanity to ensure that the scheduler is set up okay. */
    ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
    raise_softirq(SCHEDULE_SOFTIRQ);

    continue_cpu_idle_loop();
}
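
/* Timer callback used while a (non-VMX) vCPU is halted: deliver a pending
 * timer tick if it is due and unblock the vCPU. */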
void hlt_timer_fn(void *data)
{
    struct vcpu *v = data;
    if (vcpu_timer_expired(v))
        vcpu_pend_timer(v);
    vcpu_unblock(v);
}
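
/* Allocate and minimally initialise a vCPU structure (which shares its
 * allocation with the vCPU's kernel stack), its thread_info and its initial
 * switch stack.  For non-idle, non-VT-i vCPUs a privregs page shared with the
 * guest is also allocated, and the RID range and break immediate are
 * inherited from the domain. */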
struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;
    struct thread_info *ti;

    /* Keep idle vcpu0 statically allocated at compile time, because some
     * code inherited from Linux still requires it during the early boot
     * phase.
     */
    if (is_idle_domain(d) && !vcpu_id)
        v = idle_vcpu[0];
    else {
        if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
            return NULL;
        memset(v, 0, sizeof(*v));

        ti = alloc_thread_info(v);
        /* Clear thread_info to clear some important fields, like
         * preempt_count
         */
        memset(ti, 0, sizeof(struct thread_info));
        init_switch_stack(v);
    }

    if (!is_idle_domain(d)) {
        if (!d->arch.is_vti) {
            /* Create privregs page only if not VTi. */
            v->arch.privregs =
                alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
            BUG_ON(v->arch.privregs == NULL);
            memset(v->arch.privregs, 0, PAGE_SIZE);
            share_xen_page_with_guest(virt_to_page(v->arch.privregs),
                                      d, XENSHARE_writable);
        }

        v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
        v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;

        /* Is this correct?  It depends on how the domain uses RIDs.

           A domain may share RIDs among its processors (e.g. when it has a
           global VHPT).  In that case the vcpus should also share RIDs and
           the RID range should be the same.

           However, a domain may instead allocate RIDs per cpu.  In that case
           we don't want to share RIDs among vcpus, although we could when
           two vcpus run on the same cpu... */

        v->arch.starting_rid = d->arch.starting_rid;
        v->arch.ending_rid = d->arch.ending_rid;
        v->arch.breakimm = d->arch.breakimm;
        v->arch.last_processor = INVALID_PROCESSOR;
    }
    if (!VMX_DOMAIN(v))
        init_timer(&v->arch.hlt_timer, hlt_timer_fn, v, v->processor);

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    if (VMX_DOMAIN(v))
        vmx_relinquish_vcpu_resources(v);
    else {
        if (v->arch.privregs != NULL)
            free_xenheap_pages(v->arch.privregs,
                               get_order_from_shift(XMAPPEDREGS_SHIFT));
        kill_timer(&v->arch.hlt_timer);
    }

    free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
}
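
/* Prepare the initial switch_stack/pt_regs frame on a new vCPU's stack so
 * that the first ia64_switch_to() to this vCPU "returns" into
 * ia64_ret_from_clone. */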
static void init_switch_stack(struct vcpu *v)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct switch_stack *sw = (struct switch_stack *) regs - 1;
    extern void ia64_ret_from_clone;

    memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
    sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
    sw->b0 = (unsigned long) &ia64_ret_from_clone;
    sw->ar_fpsr = FPSR_DEFAULT;
    v->arch._thread.ksp = (unsigned long) sw - 16;
    // stay on the kernel stack because we may get interrupts!
    // ia64_ret_from_clone switches to the user stack
    v->arch._thread.on_ustack = 0;
    memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
}
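
/* Architecture-specific domain creation: set defaults (shared_info address,
 * break immediate), allocate and share the shared_info area with the guest,
 * reserve a RID range, and initialise d->arch.mm with a freshly allocated
 * pgd. */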
int arch_domain_create(struct domain *d)
{
    int i;

    // the following will eventually need to be negotiated dynamically
    d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
    d->arch.breakimm = 0x1000;
    for (i = 0; i < NR_CPUS; i++) {
        d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
    }

    if (is_idle_domain(d))
        return 0;

    d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
    if (d->shared_info == NULL)
        goto fail_nomem;
    memset(d->shared_info, 0, XSI_SIZE);
    for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
        share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
                                  d, XENSHARE_writable);

    d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
    /* We may also need an emulation RID for region 4, though it is unlikely
     * that a guest will issue uncacheable accesses in metaphysical mode.
     * Still, keeping that information here may be saner.
     */
    if (!allocate_rid_range(d,0))
        goto fail_nomem;

    memset(&d->arch.mm, 0, sizeof(d->arch.mm));

    if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
        goto fail_nomem;

    printf ("arch_domain_create: domain=%p\n", d);
    return 0;

fail_nomem:
    if (d->arch.mm.pgd != NULL)
        pgd_free(d->arch.mm.pgd);
    if (d->shared_info != NULL)
        free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    BUG_ON(d->arch.mm.pgd != NULL);
    if (d->shared_info != NULL)
        free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));

    domain_flush_destroy (d);

    deallocate_rid_range(d);
}
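
/* Export a vCPU's register state to a vcpu_guest_context for the toolstack
 * (e.g. for save/restore): the user registers, the privregs frame number and
 * the extra registers (ITRs/DTRs, event callback IP, DCR, IVA). */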
void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
{
    int i;
    struct vcpu_extra_regs *er = &c->extra_regs;

    c->user_regs = *vcpu_regs (v);
    c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;

    /* Fill extra regs. */
    for (i = 0; i < 8; i++) {
        er->itrs[i].pte = v->arch.itrs[i].pte.val;
        er->itrs[i].itir = v->arch.itrs[i].itir;
        er->itrs[i].vadr = v->arch.itrs[i].vadr;
        er->itrs[i].rid = v->arch.itrs[i].rid;
    }
    for (i = 0; i < 8; i++) {
        er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
        er->dtrs[i].itir = v->arch.dtrs[i].itir;
        er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
        er->dtrs[i].rid = v->arch.dtrs[i].rid;
    }
    er->event_callback_ip = v->arch.event_callback_ip;
    er->dcr = v->arch.dcr;
    er->iva = v->arch.iva;
}
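
/* Load a vCPU's state from a vcpu_guest_context (the inverse of
 * arch_getdomaininfo_ctxt): restore the user registers, force PL2/3 for
 * paravirtualized domains, optionally restore the extra registers, and run
 * the final VT-i/register setup the first time the vCPU is initialised. */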
int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct domain *d = v->domain;

    *regs = c->user_regs;

    if (!d->arch.is_vti) {
        /* domain runs at PL2/3 */
        regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
        regs->ar_rsc |= (2 << 2); /* force PL2/3 */
    }

    if (c->flags & VGCF_EXTRA_REGS) {
        int i;
        struct vcpu_extra_regs *er = &c->extra_regs;

        for (i = 0; i < 8; i++) {
            vcpu_set_itr(v, i, er->itrs[i].pte,
                         er->itrs[i].itir,
                         er->itrs[i].vadr,
                         er->itrs[i].rid);
        }
        for (i = 0; i < 8; i++) {
            vcpu_set_dtr(v, i,
                         er->dtrs[i].pte,
                         er->dtrs[i].itir,
                         er->dtrs[i].vadr,
                         er->dtrs[i].rid);
        }
        v->arch.event_callback_ip = er->event_callback_ip;
        v->arch.dcr = er->dcr;
        v->arch.iva = er->iva;
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;
    if (d->arch.is_vti)
        vmx_final_setup_guest(v);

    /* This overrides some registers. */
    vcpu_init_regs(v);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);
    return 0;
}
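
/* Drop the references held on every page on the given list (pinned type
 * reference and allocation reference) so the pages can be returned to the
 * heap while the domain is being destroyed. */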
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
#ifndef __ia64__
    unsigned long x, y;
#endif

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);
    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);
        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

#ifndef __ia64__
        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }
#endif

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
#ifdef CONFIG_XEN_IA64_DOM0_VP
        BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
#endif
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    /* Relinquish every page of memory. */

    // release pages by traversing d->arch.mm.
    relinquish_mm(d);

    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);

    if (d->arch.is_vti && d->arch.sal_data)
        xfree(d->arch.sal_data);
}
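
/* Build the domain's physmap by assigning the i-th page on its page list to
 * guest pseudo-physical frame i. */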
void build_physmap_table(struct domain *d)
{
    struct list_head *list_ent = d->page_list.next;
    unsigned long mfn, i = 0;

    while(list_ent != &d->page_list) {
        mfn = page_to_mfn(list_entry(
            list_ent, struct page_info, list));
        assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);

        i++;
        list_ent = mfn_to_page(mfn)->list.next;
    }
}
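
/* Hypercall helper: let the guest choose the virtual address at which its
 * shared_info area is mapped.  The address must be in region 7, 64Kb aligned
 * and outside the Xen virtual range; the per-vCPU interrupt mask pointers and
 * the region 7 mapping are then updated accordingly. */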
unsigned long
domain_set_shared_info_va (unsigned long va)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    struct vcpu *v1;

    /* Check virtual address:
       must belong to region 7,
       must be 64Kb aligned,
       must not be within Xen virtual space.  */
    if ((va >> 61) != 7
        || (va & 0xffffUL) != 0
        || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
        panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);

    /* Note: this doesn't work well if other cpus are already running.
       However this is part of the spec :-)  */
    printf ("Domain set shared_info_va to 0x%016lx\n", va);
    d->arch.shared_info_va = va;

    for_each_vcpu (d, v1) {
        VCPU(v1, interrupt_mask_addr) =
            (unsigned char *)va + INT_ENABLE_OFFSET(v1);
    }

    __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);

    /* Remap the shared pages. */
    set_one_rr (7UL << 61, PSCB(v,rrs[7]));

    return 0;
}

// remove the following line if not privifying in memory
//#define HAVE_PRIVIFY_MEMORY
#ifndef HAVE_PRIVIFY_MEMORY
#define privify_memory(x,y) do {} while(0)
#endif

// see arch/x86/xxx/domain_build.c
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if (!(IS_ELF(*ehdr)))
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }
    return 1;
}
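
/* Copy the PT_LOAD segments of the domain's ELF image into the domain's
 * memory, zero any BSS tail, and flush the icache for executable segments.
 * For a contiguous dom0 the copy goes straight into dom0's physical range;
 * otherwise pages are allocated and filled one at a time. */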
static void loaddomainelfimage(struct domain *d, unsigned long image_start)
{
    char *elfbase = (char *) image_start;
    Elf_Ehdr ehdr;
    Elf_Phdr phdr;
    int h, filesz, memsz;
    unsigned long elfaddr, dom_mpaddr, dom_imva;
    struct page_info *p;

    memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
    for ( h = 0; h < ehdr.e_phnum; h++ ) {
        memcpy(&phdr,
               elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
               sizeof(Elf_Phdr));
        if ((phdr.p_type != PT_LOAD))
            continue;

        filesz = phdr.p_filesz;
        memsz = phdr.p_memsz;
        elfaddr = (unsigned long) elfbase + phdr.p_offset;
        dom_mpaddr = phdr.p_paddr;

        //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
        if (d == dom0) {
            if (dom_mpaddr+memsz>dom0_size)
                panic("Dom0 doesn't fit in memory space!\n");
            dom_imva = __va_ul(dom_mpaddr + dom0_start);
            memcpy((void *)dom_imva, (void *)elfaddr, filesz);
            if (memsz > filesz)
                memset((void *)dom_imva+filesz, 0,
                       memsz-filesz);
            //FIXME: This test for code seems to find a lot more than objdump -x does
            if (phdr.p_flags & PF_X) {
                privify_memory(dom_imva,filesz);
                flush_icache_range (dom_imva, dom_imva+filesz);
            }
        }
        else
#endif
        while (memsz > 0) {
            p = assign_new_domain_page(d,dom_mpaddr);
            BUG_ON (unlikely(p == NULL));
            dom_imva = __va_ul(page_to_maddr(p));
            if (filesz > 0) {
                if (filesz >= PAGE_SIZE)
                    memcpy((void *) dom_imva,
                           (void *) elfaddr,
                           PAGE_SIZE);
                else {
                    // copy partial page
                    memcpy((void *) dom_imva,
                           (void *) elfaddr, filesz);
                    // zero the rest of page
                    memset((void *) dom_imva+filesz, 0,
                           PAGE_SIZE-filesz);
                }
                //FIXME: This test for code seems to find a lot more than objdump -x does
                if (phdr.p_flags & PF_X) {
                    privify_memory(dom_imva,PAGE_SIZE);
                    flush_icache_range(dom_imva,
                                       dom_imva+PAGE_SIZE);
                }
            }
            else if (memsz > 0) {
                /* always zero out entire page */
                memset((void *) dom_imva, 0, PAGE_SIZE);
            }
            memsz -= PAGE_SIZE;
            filesz -= PAGE_SIZE;
            elfaddr += PAGE_SIZE;
            dom_mpaddr += PAGE_SIZE;
        }
    }
}
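
/* Validate the dom0_mem/dom0_align parameters and, when dom0 must be
 * physically contiguous, reserve its memory from the boot allocator. */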
void alloc_dom0(void)
{
    /* Check dom0 size. */
    if (dom0_size < 4 * 1024 * 1024) {
        panic("dom0_mem is too small, boot aborted"
              " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
    }

    /* Check dom0 align. */
    if ((dom0_align - 1) & dom0_align) { /* not a power of two */
        panic("dom0_align (%lx) must be power of two, boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              dom0_align);
    }
    if (dom0_align < PAGE_SIZE) {
        panic("dom0_align must be >= %ld, boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              PAGE_SIZE);
    }
    if (dom0_size % dom0_align) {
        dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
        printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
               dom0_size,dom0_align);
    }

    if (running_on_sim) {
        dom0_size = 128*1024*1024; //FIXME: Should be configurable
    }
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));

    /* FIXME: The first chunk (say 256M) should always be assigned to
     * Dom0, since Dom0's physical addresses equal machine addresses for
     * DMA purposes.  Some old Linux versions, such as 2.4, assume physical
     * memory exists in the second 64M of the space.
     */
    dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
    dom0_start <<= PAGE_SHIFT;
    if (!dom0_start) {
        panic("alloc_dom0: can't allocate contiguous memory size=%lu\n",
              dom0_size);
    }
    printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
#else
    // no need to allocate pages for now
    // pages are allocated by map_new_domain_page() via loaddomainelfimage()
    dom0_start = 0;
#endif
}

/*
 * Domain 0 has direct, unrestricted access to all devices.  The main point
 * of this stub, however, is to allow alloc_dom_mem to handle requests with
 * order > 0; Dom0 needs that capability to allocate memory for other
 * domains.
 */
static void physdev_init_dom0(struct domain *d)
{
    if (iomem_permit_access(d, 0UL, ~0UL))
        BUG();
    if (irqs_permit_access(d, 0, NR_IRQS-1))
        BUG();
}
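
/* Build domain 0: load its kernel image and initial ramdisk, create the
 * start_info page and boot parameter block, bring up the additional vCPUs,
 * set up the firmware tables and grant dom0 access to all I/O memory and
 * IRQs. */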
int construct_dom0(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc;
    unsigned long alloc_start, alloc_end;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    unsigned long max_pages;

    struct domain_setup_info dsi;
    unsigned long p_start;
    unsigned long pkern_start;
    unsigned long pkern_entry;
    unsigned long pkern_end;
    unsigned long pinitrd_start = 0;
    unsigned long pstart_info;
    struct page_info *start_info_page;
    unsigned long bp_mpa;
    struct ia64_boot_param *bp;

#ifdef VALIDATE_VT
    unsigned int vmx_dom0 = 0;
    unsigned long mfn;
    struct page_info *page = NULL;
#endif

    //printf("construct_dom0: starting\n");

    /* Sanity! */
    BUG_ON(d != dom0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    alloc_start = dom0_start;
    alloc_end = dom0_start + dom0_size;
    max_pages = dom0_size / PAGE_SIZE;
    d->max_pages = max_pages;
#ifndef CONFIG_XEN_IA64_DOM0_VP
    d->tot_pages = d->max_pages;
#else
    d->tot_pages = 0;
#endif
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;
    rc = parseelfimage(&dsi);
    if ( rc != 0 )
        return rc;

#ifdef VALIDATE_VT
    /* Temp workaround */
    if (running_on_sim)
        dsi.xen_section_string = (char *)1;

    /* Check whether dom0 is a vti domain */
    if ((!vmx_enabled) && !dsi.xen_section_string) {
        printk("Lack of hardware support for unmodified vmx dom0\n");
        panic("");
    }

    if (vmx_enabled && !dsi.xen_section_string) {
        printk("Dom0 is vmx domain!\n");
        vmx_dom0 = 1;
    }
#endif

    p_start = dsi.v_start;
    pkern_start = dsi.v_kernstart;
    pkern_end = dsi.v_kernend;
    pkern_entry = dsi.v_kernentry;

    //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);

    if ( (p_start & (PAGE_SIZE-1)) != 0 )
    {
        printk("Initial guest OS must load to a page boundary.\n");
        return -EINVAL;
    }

    pstart_info = PAGE_ALIGN(pkern_end);
    if(initrd_start && initrd_len){
        unsigned long offset;

        pinitrd_start= (dom0_start + dom0_size) -
                       (PAGE_ALIGN(initrd_len) + 4*1024*1024);
        if (pinitrd_start <= pstart_info)
            panic("%s:enough memory is not assigned to dom0", __func__);

        for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
            struct page_info *p;
            p = assign_new_domain_page(d, pinitrd_start + offset);
            if (p == NULL)
                panic("%s: can't allocate page for initrd image", __func__);
            if (initrd_len < offset + PAGE_SIZE)
                memcpy(page_to_virt(p), (void*)(initrd_start + offset),
                       initrd_len - offset);
            else
                copy_page(page_to_virt(p), (void*)(initrd_start + offset));
        }
    }

    printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
           " Kernel image:  %lx->%lx\n"
           " Entry address: %lx\n"
           " Init. ramdisk: %lx len %lx\n"
           " Start info.:   %lx->%lx\n",
           pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
           pstart_info, pstart_info + PAGE_SIZE);

    if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (pkern_end-pkern_start)>>20,
               (max_pages <<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    // if the high 3 bits of pkern start are non-zero, error

    // if pkern end is after the end of metaphysical memory, error
    // (we should be able to deal with this... later)

    /* Mask all upcalls... */
    for ( i = 1; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    if (dom0_max_vcpus == 0)
        dom0_max_vcpus = MAX_VIRT_CPUS;
    if (dom0_max_vcpus > num_online_cpus())
        dom0_max_vcpus = num_online_cpus();
    if (dom0_max_vcpus > MAX_VIRT_CPUS)
        dom0_max_vcpus = MAX_VIRT_CPUS;

    printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
    for ( i = 1; i < dom0_max_vcpus; i++ )
        if (alloc_vcpu(d, i, i) == NULL)
            printf ("Cannot allocate dom0 vcpu %d\n", i);

#if defined(VALIDATE_VT) && !defined(CONFIG_XEN_IA64_DOM0_VP)
    /* Construct a frame-allocation list for the initial domain, since these
     * pages are allocated by the boot allocator and their pfns are not set
     * up properly.
     */
    for ( mfn = (alloc_start>>PAGE_SHIFT);
          mfn < (alloc_end>>PAGE_SHIFT);
          mfn++ )
    {
        page = mfn_to_page(mfn);
        page_set_owner(page, d);
        page->u.inuse.type_info = 0;
        page->count_info = PGC_allocated | 1;
        list_add_tail(&page->list, &d->page_list);

        /* Construct 1:1 mapping */
        set_gpfn_from_mfn(mfn, mfn);
    }
#endif

    /* Copy the OS image. */
    loaddomainelfimage(d,image_start);

    /* Copy the initial ramdisk. */
    //if ( initrd_len != 0 )
    //    memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
    start_info_page = assign_new_domain_page(d, pstart_info);
    if (start_info_page == NULL)
        panic("can't allocate start info page");
    si = page_to_virt(start_info_page);
    memset(si, 0, PAGE_SIZE);
    sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
    si->nr_pages = max_pages;
    si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;

    printk("Dom0: 0x%lx\n", (u64)dom0);

#ifdef VALIDATE_VT
    /* VMX specific construction for Dom0, if hardware supports VMX
     * and Dom0 is an unmodified image
     */
    if (vmx_dom0)
        vmx_final_setup_guest(v);
#endif

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    /* Build firmware.
       Note: the Linux kernel reserves the memory used by start_info, so
       there is no need to remove it from the MDT.  */
    bp_mpa = pstart_info + sizeof(struct start_info);
    dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);

    /* Fill boot param. */
    strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
    si->cmd_line[sizeof(si->cmd_line)-1] = 0;

    bp = (struct ia64_boot_param *)(si + 1);
    bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);

    /* We assume the console has reached the last line! */
    bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
    bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
    bp->console_info.orig_x = 0;
    bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
                              0 : bp->console_info.num_rows - 1;

    bp->initrd_start = (dom0_start+dom0_size) -
                       (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
    bp->initrd_size = ia64_boot_param->initrd_size;

    vcpu_init_regs (v);

    vcpu_regs(v)->r28 = bp_mpa;

#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    pkern_entry += dom0_start;
#endif
    vcpu_regs (v)->cr_iip = pkern_entry;

    physdev_init_dom0(d);

    // FIXME: Hack for keyboard input
    //serial_input_init();

    return 0;
}

void machine_restart(char * __unused)
{
    console_start_sync();
    if (running_on_sim)
        printf ("machine_restart called. spinning...\n");
    else
        (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
    while(1);
}

void machine_halt(void)
{
    console_start_sync();
    if (running_on_sim)
        printf ("machine_halt called. spinning...\n");
    else
        (*efi.reset_system)(EFI_RESET_SHUTDOWN,0,0,NULL);
    while(1);
}

void sync_vcpu_execstate(struct vcpu *v)
{
//  __ia64_save_fpu(v->arch._thread.fph);
//  if (VMX_DOMAIN(v))
//      vmx_save_state(v);
    // FIXME SMP: Anything else needed here for SMP?
}

static void parse_dom0_mem(char *s)
{
    dom0_size = parse_size_and_unit(s);
}
custom_param("dom0_mem", parse_dom0_mem);

static void parse_dom0_align(char *s)
{
    dom0_align = parse_size_and_unit(s);
}
custom_param("dom0_align", parse_dom0_align);