ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 8723:61e7afb7344b

The memset in init_switch_stack was overwriting the processor stack.
We need to avoid manipulating the switch-stack area of the currently
running CPU. Original patch by Kevin Tian.

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Feb 01 00:56:19 2006 +0100 (2006-02-01)
parents a9ead230cc60
children 0c94043f5c5b
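
Editor's note (not part of the changeset): on ia64 the pt_regs/switch_stack
pair lives at the top of each vcpu's kernel-stack allocation, and
init_switch_stack() zeroes that whole region. Calling it for the statically
allocated idle vcpu0 -- whose stack the boot CPU is currently running on --
therefore wipes live stack frames. A minimal sketch of the shape of the fix,
mirroring alloc_vcpu_struct() below:

    /* Sketch only: initialize a switch stack solely for freshly
     * allocated vcpus, never for the stack we are running on. */
    if (is_idle_domain(d) && !vcpu_id)
        v = idle_vcpu[0];          /* live stack: leave it untouched */
    else {
        v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER);
        ...
        init_switch_stack(v);      /* safe: nothing runs on this stack yet */
    }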
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  Copyright (C) 2005 Intel Co
 *  Kun Tian (Kevin Tian) <kevin.tian@intel.com>
 *
 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/mm.h>
#include <xen/iocap.h>
#include <asm/ptrace.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
//#include <asm/mpspec.h>
#include <xen/irq.h>
#include <xen/event.h>
//#include <xen/shadow.h>
#include <xen/console.h>
#include <xen/compile.h>

#include <xen/elf.h>
//#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>            /* for MAX_DMA_ADDRESS */

#include <asm/asm-offsets.h>    /* for IA64_THREAD_INFO_SIZE */

#include <asm/vcpu.h>           /* for function declarations */
#include <public/arch-ia64.h>
#include <asm/vmx.h>
#include <asm/vmx_vcpu.h>
#include <asm/vmx_vpd.h>
#include <asm/pal.h>
#include <public/hvm/ioreq.h>

#define CONFIG_DOMAIN0_CONTIGUOUS
unsigned long dom0_start = -1L;
unsigned long dom0_size = 512*1024*1024;
unsigned long dom0_align = 64*1024*1024;

// initialized by arch/ia64/setup.c:find_initrd()
unsigned long initrd_start = 0, initrd_end = 0;
extern unsigned long running_on_sim;

#define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))

//extern int loadelfimage(char *);
extern int readelfimage_base_and_size(char *, unsigned long,
                unsigned long *, unsigned long *, unsigned long *);

unsigned long map_domain_page0(struct domain *);
extern unsigned long dom_fw_setup(struct domain *, char *, int);
static void init_switch_stack(struct vcpu *v);

/* this belongs in include/asm, but there doesn't seem to be a suitable place */
void arch_domain_destroy(struct domain *d)
{
    printf("arch_domain_destroy: not implemented\n");
    //free_page((unsigned long)d->mm.perdomain_pt);
    free_xenheap_page(d->shared_info);
}

static void default_idle(void)
{
    int cpu = smp_processor_id();
    local_irq_disable();
    if ( !softirq_pending(cpu) )
        safe_halt();
    local_irq_enable();
}
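
/* Idle protocol (descriptive note added by the editor): halt until a
 * softirq is pending, then enter softirq context (add_preempt_count with
 * SOFTIRQ_OFFSET), kick the scheduler softirq and process softirqs before
 * dropping back into the halt loop. */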
static void continue_cpu_idle_loop(void)
{
    int cpu = smp_processor_id();
    for ( ; ; )
    {
#ifdef IA64
//      __IRQ_STAT(cpu, idle_timestamp) = jiffies
#else
        irq_stat[cpu].idle_timestamp = jiffies;
#endif
        while ( !softirq_pending(cpu) )
            default_idle();
        add_preempt_count(SOFTIRQ_OFFSET);
        raise_softirq(SCHEDULE_SOFTIRQ);
        do_softirq();
        sub_preempt_count(SOFTIRQ_OFFSET);
    }
}

void startup_cpu_idle_loop(void)
{
    int cpu = smp_processor_id ();
    /* Just some sanity to ensure that the scheduler is set up okay. */
    ASSERT(current->domain == IDLE_DOMAIN_ID);
    printf ("idle%dA\n", cpu);
    raise_softirq(SCHEDULE_SOFTIRQ);
#if 0   /* All this work is done within continue_cpu_idle_loop */
    printf ("idle%dB\n", cpu);
    asm volatile ("mov ar.k2=r0");
    do_softirq();
    printf ("idle%dC\n", cpu);

    /*
     * Declares CPU setup done to the boot processor.
     * Therefore memory barrier to ensure state is visible.
     */
    smp_mb();
#endif
#if 0
//do we have to ensure the idle task has a shared page so that, for example,
//region registers can be loaded from it. Apparently not...
    idle0_task.shared_info = (void *)alloc_xenheap_page();
    memset(idle0_task.shared_info, 0, PAGE_SIZE);
    /* pin mapping */
    // FIXME: Does this belong here? Or do only at domain switch time?
    {
        /* WARNING: following must be inlined to avoid nested fault */
        unsigned long psr = ia64_clear_ic();
        ia64_itr(0x2, IA64_TR_SHARED_INFO, SHAREDINFO_ADDR,
                 pte_val(pfn_pte(ia64_tpa(idle0_task.shared_info) >> PAGE_SHIFT, PAGE_KERNEL)),
                 PAGE_SHIFT);
        ia64_set_psr(psr);
        ia64_srlz_i();
    }
#endif

    continue_cpu_idle_loop();
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;
    struct thread_info *ti;

    /* Keep idle vcpu0 statically allocated at compile time, since some
     * early-phase code inherited from Linux still requires it.
     */
    if (is_idle_domain(d) && !vcpu_id)
        v = idle_vcpu[0];
    else {
        if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
            return NULL;
        memset(v, 0, sizeof(*v));

        ti = alloc_thread_info(v);
        /* Clear thread_info to clear some important fields, like
         * preempt_count
         */
        memset(ti, 0, sizeof(struct thread_info));
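        /* Editor's note: init_switch_stack() is only reached for the
         * freshly allocated vcpu above; running it on idle vcpu0 would
         * memset the stack the boot CPU is currently executing on (the
         * bug this changeset fixes). */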
        init_switch_stack(v);
    }

    if (!is_idle_domain(d)) {
        v->arch.privregs =
            alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
        BUG_ON(v->arch.privregs == NULL);
        memset(v->arch.privregs, 0, PAGE_SIZE);
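
        /* Editor's note: all event channels start masked (every mask bit
         * set); the guest unmasks the ones it actually binds. */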
        if (!vcpu_id)
            memset(&d->shared_info->evtchn_mask[0], 0xff,
                   sizeof(d->shared_info->evtchn_mask));

        v->vcpu_info = &(d->shared_info->vcpu_info[0]);
        v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
        v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
        v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
        v->arch.starting_rid = d->arch.starting_rid;
        v->arch.ending_rid = d->arch.ending_rid;
        v->arch.breakimm = d->arch.breakimm;
    }

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
}

static void init_switch_stack(struct vcpu *v)
{
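    /* Editor's note: this zeroes the switch_stack/pt_regs area at the top
     * of v's kernel stack, so it must never be called for the vcpu whose
     * stack is currently in use (see the changeset description). */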
    struct pt_regs *regs = vcpu_regs (v);
    struct switch_stack *sw = (struct switch_stack *) regs - 1;
    extern void ia64_ret_from_clone;

    memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
    sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
    sw->b0 = (unsigned long) &ia64_ret_from_clone;
    sw->ar_fpsr = FPSR_DEFAULT;
    v->arch._thread.ksp = (unsigned long) sw - 16;
    // stay on kernel stack because may get interrupts!
    // ia64_ret_from_clone (which b0 gets in new_thread) switches
    // to user stack
    v->arch._thread.on_ustack = 0;
    memset(v->arch._thread.fph, 0, sizeof(struct ia64_fpreg) * 96);
}

int arch_domain_create(struct domain *d)
{
    // the following will eventually need to be negotiated dynamically
    d->xen_vastart = XEN_START_ADDR;
    d->xen_vaend = XEN_END_ADDR;
    d->shared_info_va = SHAREDINFO_ADDR;

    if (is_idle_domain(d))
        return 0;

    if ((d->shared_info = (void *)alloc_xenheap_page()) == NULL)
        goto fail_nomem;
    memset(d->shared_info, 0, PAGE_SIZE);

    d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
    /* We may also need an emulation RID for region4, though a guest is
     * unlikely to issue an uncacheable access in metaphysical mode.
     * Keeping such info here still seems the saner choice.
     */
    if (((d->arch.metaphysical_rr0 = allocate_metaphysical_rr()) == -1UL)
        || ((d->arch.metaphysical_rr4 = allocate_metaphysical_rr()) == -1UL))
        BUG();
#define DOMAIN_RID_BITS_DEFAULT 18
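    /* Editor's note: reserve a region-ID range of 2^DOMAIN_RID_BITS_DEFAULT
     * RIDs for this domain's virtual address spaces. */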
    if (!allocate_rid_range(d,DOMAIN_RID_BITS_DEFAULT)) // FIXME
        BUG();
    d->arch.breakimm = 0x1000;
    d->arch.sys_pgnr = 0;

    if ((d->arch.mm = xmalloc(struct mm_struct)) == NULL)
        goto fail_nomem;
    memset(d->arch.mm, 0, sizeof(*d->arch.mm));

    if ((d->arch.mm->pgd = pgd_alloc(d->arch.mm)) == NULL)
        goto fail_nomem;

    printf ("arch_domain_create: domain=%p\n", d);
    return 0;

fail_nomem:
    /* Free in the reverse order of allocation; the pgd must be released
     * before its mm_struct (the original freed the mm first and then
     * dereferenced it). */
    if (d->arch.mm != NULL && d->arch.mm->pgd != NULL)
        pgd_free(d->arch.mm->pgd);
    xfree(d->arch.mm);
    free_xenheap_page(d->shared_info);
    return -ENOMEM;
}

void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);

    printf("arch_getdomaininfo_ctxt\n");
    c->regs = *regs;
    c->vcpu.evtchn_vector = v->vcpu_info->arch.evtchn_vector;

    c->shared = v->domain->shared_info->arch;
}

int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct domain *d = v->domain;
    int i, rc, ret;
    unsigned long progress = 0;

    printf("arch_set_info_guest\n");
    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;
    if (c->flags & VGCF_VMX_GUEST) {
        if (!vmx_enabled) {
            printk("No VMX hardware feature for vmx domain.\n");
            return -EINVAL;
        }

        if (v == d->vcpu[0])
            vmx_setup_platform(d, c);

        vmx_final_setup_guest(v);
    }

    *regs = c->regs;
    d->arch.sys_pgnr = c->sys_pgnr;
    d->arch.initrd_start = c->initrd.start;
    d->arch.initrd_len = c->initrd.size;
    d->arch.cmdline = c->cmdline;
    new_thread(v, regs->cr_iip, 0, 0);

    sync_split_caches();
    v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector;
    if ( c->vcpu.privregs && copy_from_user(v->arch.privregs,
              c->vcpu.privregs, sizeof(mapped_regs_t))) {
        printk("Bad ctxt address in arch_set_info_guest: 0x%lx\n", c->vcpu.privregs);
        return -EFAULT;
    }

    v->arch.domain_itm_last = -1L;
    d->shared_info->arch = c->shared;

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);
    return 0;
}

void domain_relinquish_resources(struct domain *d)
{
    /* FIXME */
    printf("domain_relinquish_resources: not implemented\n");
}

// heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
// and linux/arch/ia64/kernel/process.c:kernel_thread()
void new_thread(struct vcpu *v,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct domain *d = v->domain;
    struct pt_regs *regs;
    struct ia64_boot_param *bp;
    extern char saved_command_line[];

#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    if (d == dom0) start_pc += dom0_start;
#endif

    regs = vcpu_regs (v);
    if (VMX_DOMAIN(v)) {
        /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
        regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
    } else {
        /* Parenthesized so that ~(BITS_TO_CLEAR|RI|IS) masks the whole
         * word; '&' binds tighter than '|', so the unparenthesized
         * original only masked IA64_PSR_BN. */
        regs->cr_ipsr = (ia64_getreg(_IA64_REG_PSR)
                         | IA64_PSR_BITS_TO_SET | IA64_PSR_BN)
                        & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS);
        regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
    }
    regs->cr_iip = start_pc;
    regs->cr_ifs = 1UL << 63; /* or clear? */
    regs->ar_fpsr = FPSR_DEFAULT;

    if (VMX_DOMAIN(v)) {
        vmx_init_all_rr(v);
        if (d == dom0)
            // VCPU(v,vgr[12]) = dom_fw_setup(d,saved_command_line,256L);
            regs->r28 = dom_fw_setup(d,saved_command_line,256L);
        /* Virtual processor context setup */
        VCPU(v, vpsr) = IA64_PSR_BN;
        VCPU(v, dcr) = 0;
    } else {
        init_all_rr(v);
        if (d == dom0)
            regs->r28 = dom_fw_setup(d,saved_command_line,256L);
        else {
            regs->ar_rsc |= (2 << 2); /* force PL2/3 */
            if (*d->arch.cmdline == '\0') {
#define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
                regs->r28 = dom_fw_setup(d,DEFAULT_CMDLINE,256L);
                printf("domU command line defaulted to"
                       DEFAULT_CMDLINE "\n");
            }
            else regs->r28 = dom_fw_setup(d,d->arch.cmdline,256L);
        }
        VCPU(v, banknum) = 1;
        VCPU(v, metaphysical_mode) = 1;
        d->shared_info->arch.flags = (d == dom0) ? (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
    }
}

static struct page * map_new_domain0_page(unsigned long mpaddr)
{
    if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
        printk("map_new_domain0_page: bad domain0 mpaddr %p!\n",mpaddr);
        printk("map_new_domain0_page: start=%p,end=%p!\n",dom0_start,dom0_start+dom0_size);
        while(1);
    }
    return pfn_to_page((mpaddr >> PAGE_SHIFT));
}

/* allocate new page for domain and map it to the specified metaphysical addr */
struct page * map_new_domain_page(struct domain *d, unsigned long mpaddr)
{
    struct mm_struct *mm = d->arch.mm;
    struct page *p = (struct page *)0;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    extern unsigned long vhpt_paddr, vhpt_pend;

    if (!mm->pgd) {
        printk("map_new_domain_page: domain pgd must exist!\n");
        return(p);
    }
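    /* Editor's note: standard four-level walk; each missing intermediate
     * level (pud, pmd, pte page) is allocated and linked on the way down. */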
    pgd = pgd_offset(mm,mpaddr);
    if (pgd_none(*pgd))
        pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));

    pud = pud_offset(pgd, mpaddr);
    if (pud_none(*pud))
        pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));

    pmd = pmd_offset(pud, mpaddr);
    if (pmd_none(*pmd))
        pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
//      pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));

    pte = pte_offset_map(pmd, mpaddr);
    if (pte_none(*pte)) {
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
        if (d == dom0) p = map_new_domain0_page(mpaddr);
        else
#endif
        {
            p = alloc_domheap_page(d);
            // zero out pages for security reasons
            if (p) memset(__va(page_to_phys(p)),0,PAGE_SIZE);
        }
        if (unlikely(!p)) {
            printf("map_new_domain_page: Can't alloc!!!! Aaaargh!\n");
            return(p);
        }
        if (unlikely(page_to_phys(p) > vhpt_paddr && page_to_phys(p) < vhpt_pend)) {
            printf("map_new_domain_page: reassigned vhpt page %p!!\n",page_to_phys(p));
        }
        set_pte(pte, pfn_pte(page_to_phys(p) >> PAGE_SHIFT,
                             __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
    }
    else printk("map_new_domain_page: mpaddr %lx already mapped!\n",mpaddr);
    return p;
}

/* map a physical address to the specified metaphysical addr */
void map_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr)
{
    struct mm_struct *mm = d->arch.mm;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (!mm->pgd) {
        printk("map_domain_page: domain pgd must exist!\n");
        return;
    }
    pgd = pgd_offset(mm,mpaddr);
    if (pgd_none(*pgd))
        pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));

    pud = pud_offset(pgd, mpaddr);
    if (pud_none(*pud))
        pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));

    pmd = pmd_offset(pud, mpaddr);
    if (pmd_none(*pmd))
        pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
//      pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));

    pte = pte_offset_map(pmd, mpaddr);
    if (pte_none(*pte)) {
        set_pte(pte, pfn_pte(physaddr >> PAGE_SHIFT,
                             __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
    }
    else printk("map_domain_page: mpaddr %lx already mapped!\n",mpaddr);
}

#if 0
/* map a physical address with specified I/O flag */
void map_domain_io_page(struct domain *d, unsigned long mpaddr, unsigned long flags)
{
    struct mm_struct *mm = d->arch.mm;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    pte_t io_pte;

    if (!mm->pgd) {
        printk("map_domain_io_page: domain pgd must exist!\n");
        return;
    }
    ASSERT(flags & GPFN_IO_MASK);

    pgd = pgd_offset(mm,mpaddr);
    if (pgd_none(*pgd))
        pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));

    pud = pud_offset(pgd, mpaddr);
    if (pud_none(*pud))
        pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));

    pmd = pmd_offset(pud, mpaddr);
    if (pmd_none(*pmd))
        pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
//      pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));

    pte = pte_offset_map(pmd, mpaddr);
    if (pte_none(*pte)) {
        pte_val(io_pte) = flags;
        set_pte(pte, io_pte);
    }
    else printk("map_domain_io_page: mpaddr %lx already mapped!\n",mpaddr);
}
#endif

void mpafoo(unsigned long mpaddr)
{
    extern unsigned long privop_trace;
    if (mpaddr == 0x3800)
        privop_trace = 1;
}

unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
{
    struct mm_struct *mm = d->arch.mm;
    pgd_t *pgd = pgd_offset(mm, mpaddr);
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    if (d == dom0) {
        if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
            //printk("lookup_domain_mpa: bad dom0 mpaddr %p!\n",mpaddr);
            //printk("lookup_domain_mpa: start=%p,end=%p!\n",dom0_start,dom0_start+dom0_size);
            mpafoo(mpaddr);
        }
        pte_t pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
                               __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
        pte = &pteval;
        return *(unsigned long *)pte;
    }
#endif
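    /* Editor's note: walk the metaphysical->machine tree; on a miss for a
     * legal address, allocate a new page via map_new_domain_page() and
     * retry the walk. */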
tryagain:
    if (pgd_present(*pgd)) {
        pud = pud_offset(pgd,mpaddr);
        if (pud_present(*pud)) {
            pmd = pmd_offset(pud,mpaddr);
            if (pmd_present(*pmd)) {
                pte = pte_offset_map(pmd,mpaddr);
                if (pte_present(*pte)) {
                    //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
                    return *(unsigned long *)pte;
                } else if (VMX_DOMAIN(d->vcpu[0]))
                    return GPFN_INV_MASK;
            }
        }
    }
    /* if lookup fails and mpaddr is "legal", "create" the page */
    if ((mpaddr >> PAGE_SHIFT) < d->max_pages) {
        if (map_new_domain_page(d,mpaddr)) goto tryagain;
    }
    printk("lookup_domain_mpa: bad mpa %p (> %p)\n",
           mpaddr, d->max_pages<<PAGE_SHIFT);
    mpafoo(mpaddr);
    return 0;
}

// FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
#if 1
unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
{
    unsigned long pte = lookup_domain_mpa(d,mpaddr);
    unsigned long imva;

    pte &= _PAGE_PPN_MASK;
    imva = __va(pte);
    imva |= mpaddr & ~PAGE_MASK;
    return(imva);
}
#else
unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
{
    unsigned long imva = __gpa_to_mpa(d, mpaddr);

    return __va(imva);
}
#endif

// remove following line if not privifying in memory
//#define HAVE_PRIVIFY_MEMORY
#ifndef HAVE_PRIVIFY_MEMORY
#define privify_memory(x,y) do {} while(0)
#endif

// see arch/x86/xxx/domain_build.c
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    return (IS_ELF(*ehdr));
}

static void copy_memory(void *dst, void *src, int size)
{
    int remain;

    if (IS_XEN_ADDRESS(dom0,src)) {
        memcpy(dst,src,size);
    }
    else {
        printf("About to call __copy_from_user(%p,%p,%d)\n",
               dst, src, size);
        /* __copy_from_user returns the number of bytes NOT copied;
         * retry until the whole range has been brought over. (The
         * original decremented size by 'remain', i.e. set it to the
         * count already copied rather than the count outstanding.) */
        while ((remain = __copy_from_user(dst,src,size)) != 0) {
            printf("incomplete user copy, %d remain of %d\n",
                   remain, size);
            dst += size - remain;
            src += size - remain;
            size = remain;
        }
    }
}

void loaddomainelfimage(struct domain *d, unsigned long image_start)
{
    char *elfbase = image_start;
    //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
    Elf_Ehdr ehdr;
    Elf_Phdr phdr;
    int h, filesz, memsz, paddr;
    unsigned long elfaddr, dom_mpaddr, dom_imva;
    struct page *p;
    unsigned long pteval;

    copy_memory(&ehdr, image_start, sizeof(Elf_Ehdr));
    for ( h = 0; h < ehdr.e_phnum; h++ ) {
        copy_memory(&phdr, elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
                    sizeof(Elf_Phdr));
        //if ( !is_loadable_phdr(phdr) )
        if ((phdr.p_type != PT_LOAD)) {
            continue;
        }
        filesz = phdr.p_filesz; memsz = phdr.p_memsz;
        elfaddr = elfbase + phdr.p_offset;
        dom_mpaddr = phdr.p_paddr;
        //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
        if (d == dom0) {
            if (dom_mpaddr+memsz>dom0_size || dom_mpaddr+filesz>dom0_size) {
                printf("Domain0 doesn't fit in allocated space!\n");
                while(1);
            }
            dom_imva = __va(dom_mpaddr + dom0_start);
            copy_memory(dom_imva, elfaddr, filesz);
            if (memsz > filesz) memset(dom_imva+filesz, 0, memsz-filesz);
            //FIXME: This test for code seems to find a lot more than objdump -x does
            if (phdr.p_flags & PF_X) privify_memory(dom_imva,filesz);
        }
        else
#endif
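        /* Editor's note (domU path): copy the image one metaphysical page
         * at a time; pages are mapped on demand, partially filled pages
         * are zero-padded, and BSS-only pages are simply zeroed. */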
        while (memsz > 0) {
#ifdef DOMU_AUTO_RESTART
            pteval = lookup_domain_mpa(d,dom_mpaddr);
            if (pteval) dom_imva = __va(pteval & _PFN_MASK);
            else { printf("loaddomainelfimage: BAD!\n"); while(1); }
#else
            p = map_new_domain_page(d,dom_mpaddr);
            if (unlikely(!p)) BUG();
            dom_imva = __va(page_to_phys(p));
#endif
            if (filesz > 0) {
                if (filesz >= PAGE_SIZE)
                    copy_memory(dom_imva,elfaddr,PAGE_SIZE);
                else { // copy partial page, zero the rest of page
                    copy_memory(dom_imva,elfaddr,filesz);
                    memset(dom_imva+filesz,0,PAGE_SIZE-filesz);
                }
                //FIXME: This test for code seems to find a lot more than objdump -x does
                if (phdr.p_flags & PF_X)
                    privify_memory(dom_imva,PAGE_SIZE);
            }
            else if (memsz > 0) // always zero out entire page
                memset(dom_imva,0,PAGE_SIZE);
            memsz -= PAGE_SIZE; filesz -= PAGE_SIZE;
            elfaddr += PAGE_SIZE; dom_mpaddr += PAGE_SIZE;
        }
    }
}

int
parsedomainelfimage(char *elfbase, unsigned long elfsize, unsigned long *entry)
{
    Elf_Ehdr ehdr;

    copy_memory(&ehdr, elfbase, sizeof(Elf_Ehdr));

    if ( !elf_sanity_check(&ehdr) ) {
        printk("ELF sanity check failed.\n");
        return -EINVAL;
    }

    if ( (ehdr.e_phoff + (ehdr.e_phnum * ehdr.e_phentsize)) > elfsize )
    {
        printk("ELF program headers extend beyond end of image.\n");
        return -EINVAL;
    }

    if ( (ehdr.e_shoff + (ehdr.e_shnum * ehdr.e_shentsize)) > elfsize )
    {
        printk("ELF section headers extend beyond end of image.\n");
        return -EINVAL;
    }

#if 0
    /* Find the section-header strings table. */
    if ( ehdr.e_shstrndx == SHN_UNDEF )
    {
        printk("ELF image has no section-header strings table (shstrtab).\n");
        return -EINVAL;
    }
#endif

    *entry = ehdr.e_entry;
    printf("parsedomainelfimage: entry point = %p\n",*entry);

    return 0;
}

void alloc_dom0(void)
{
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    if (platform_is_hp_ski()) {
        dom0_size = 128*1024*1024; //FIXME: Should be configurable
    }
    printf("alloc_dom0: starting (initializing %lu MB...)\n",
           dom0_size/(1024*1024));

    /* FIXME: The first chunk (say 256M) should always be assigned to
     * Dom0, since Dom0's physical == machine address for DMA purposes.
     * Some old Linux versions, like 2.4, assume physical memory exists
     * in the 2nd 64M space.
     */
    dom0_start = alloc_boot_pages(
        dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
    dom0_start <<= PAGE_SHIFT;
    if (!dom0_start) {
        printf("alloc_dom0: can't allocate contiguous memory size=%lx\n",
               dom0_size);
        while(1);
    }
    printf("alloc_dom0: dom0_start=%p\n",dom0_start);
#else
    dom0_start = 0;
#endif
}

/*
 * Domain 0 has direct access to all devices absolutely. However the
 * major point of this stub is to allow alloc_dom_mem to be handled for
 * order > 0 requests: Dom0 requires that bit set in order to allocate
 * memory for other domains.
 */
void physdev_init_dom0(struct domain *d)
{
    if (iomem_permit_access(d, 0UL, ~0UL))
        BUG();
    if (irqs_permit_access(d, 0, NR_PIRQS-1))
        BUG();
}

unsigned int vmx_dom0 = 0;
int construct_dom0(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pt_pages;
    unsigned long count;
    unsigned long alloc_start, alloc_end;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];

    struct domain_setup_info dsi;
    unsigned long p_start;
    unsigned long pkern_start;
    unsigned long pkern_entry;
    unsigned long pkern_end;
    unsigned long pinitrd_start = 0;
    unsigned long ret, progress = 0;

    //printf("construct_dom0: starting\n");

#ifndef CLONE_DOMAIN0
    /* Sanity! */
    BUG_ON(d != dom0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
#endif

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    alloc_start = dom0_start;
    alloc_end = dom0_start + dom0_size;
    d->tot_pages = d->max_pages = dom0_size/PAGE_SIZE;
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;
    rc = parseelfimage(&dsi);
    if ( rc != 0 )
        return rc;

#ifdef VALIDATE_VT
    /* Temp workaround */
    if (running_on_sim)
        dsi.xen_section_string = (char *)1;

    /* Check whether dom0 is vti domain */
    if ((!vmx_enabled) && !dsi.xen_section_string) {
        printk("Lack of hardware support for unmodified vmx dom0\n");
        panic("");
    }

    if (vmx_enabled && !dsi.xen_section_string) {
        printk("Dom0 is vmx domain!\n");
        vmx_dom0 = 1;
    }
#endif

    p_start = dsi.v_start;
    pkern_start = dsi.v_kernstart;
    pkern_end = dsi.v_kernend;
    pkern_entry = dsi.v_kernentry;

    //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);

    if ( (p_start & (PAGE_SIZE-1)) != 0 )
    {
        printk("Initial guest OS must load to a page boundary.\n");
        return -EINVAL;
    }
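
    /* Editor's note: the initrd is placed near the top of dom0's
     * contiguous allocation, leaving a 4MB gap above the page-aligned
     * initrd image. */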
    if (initrd_start && initrd_len) {
        pinitrd_start = (dom0_start + dom0_size) -
            (PAGE_ALIGN(initrd_len) + 4*1024*1024);

        memcpy(__va(pinitrd_start), (void *)initrd_start, initrd_len);
    }
852 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
853 " Kernel image: %lx->%lx\n"
854 " Entry address: %lx\n"
855 " Init. ramdisk: %lx len %lx\n",
856 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len);
858 if ( (pkern_end - pkern_start) > (d->max_pages * PAGE_SIZE) )
859 {
860 printk("Initial guest OS requires too much space\n"
861 "(%luMB is greater than %luMB limit)\n",
862 (pkern_end-pkern_start)>>20, (d->max_pages<<PAGE_SHIFT)>>20);
863 return -ENOMEM;
864 }
866 // if high 3 bits of pkern start are non-zero, error
868 // if pkern end is after end of metaphysical memory, error
869 // (we should be able to deal with this... later)
872 //
874 #if 0
875 strcpy(d->name,"Domain0");
876 #endif
878 /* Mask all upcalls... */
879 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
880 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
882 #ifdef VALIDATE_VT
883 /* Construct a frame-allocation list for the initial domain, since these
884 * pages are allocated by boot allocator and pfns are not set properly
885 */
886 for ( mfn = (alloc_start>>PAGE_SHIFT);
887 mfn < (alloc_end>>PAGE_SHIFT);
888 mfn++ )
889 {
890 page = &frame_table[mfn];
891 page_set_owner(page, d);
892 page->u.inuse.type_info = 0;
893 page->count_info = PGC_allocated | 1;
894 list_add_tail(&page->list, &d->page_list);
896 /* Construct 1:1 mapping */
897 machine_to_phys_mapping[mfn] = mfn;
898 }
900 #endif
902 /* Copy the OS image. */
903 loaddomainelfimage(d,image_start);
905 /* Copy the initial ramdisk. */
906 //if ( initrd_len != 0 )
907 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
910 /* Set up start info area. */
911 si = (start_info_t *)alloc_xenheap_page();
912 memset(si, 0, PAGE_SIZE);
913 d->shared_info->arch.start_info_pfn = __pa(si) >> PAGE_SHIFT;
914 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
915 si->nr_pages = d->tot_pages;
917 #if 0
918 si->shared_info = virt_to_phys(d->shared_info);
919 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
920 //si->pt_base = vpt_start;
921 //si->nr_pt_frames = nr_pt_pages;
922 //si->mfn_list = vphysmap_start;
924 if ( initrd_len != 0 )
925 {
926 //si->mod_start = vinitrd_start;
927 si->mod_len = initrd_len;
928 printk("Initrd len 0x%lx, start at 0x%08lx\n",
929 si->mod_len, si->mod_start);
930 }
932 dst = si->cmd_line;
933 if ( cmdline != NULL )
934 {
935 for ( i = 0; i < 255; i++ )
936 {
937 if ( cmdline[i] == '\0' )
938 break;
939 *dst++ = cmdline[i];
940 }
941 }
942 *dst = '\0';
944 zap_low_mappings(); /* Do the same for the idle page tables. */
945 #endif
947 /* Give up the VGA console if DOM0 is configured to grab it. */
948 if (cmdline != NULL)
949 console_endboot(strstr(cmdline, "tty0") != NULL);
951 /* VMX specific construction for Dom0, if hardware supports VMX
952 * and Dom0 is unmodified image
953 */
954 printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
955 if (vmx_dom0)
956 vmx_final_setup_guest(v);
958 set_bit(_VCPUF_initialised, &v->vcpu_flags);
960 new_thread(v, pkern_entry, 0, 0);
961 physdev_init_dom0(d);
962 sync_split_caches();
964 // FIXME: Hack for keyboard input
965 #ifdef CLONE_DOMAIN0
966 if (d == dom0)
967 #endif
968 serial_input_init();
969 if (d == dom0) {
970 VCPU(v, delivery_mask[0]) = -1L;
971 VCPU(v, delivery_mask[1]) = -1L;
972 VCPU(v, delivery_mask[2]) = -1L;
973 VCPU(v, delivery_mask[3]) = -1L;
974 }
975 else __set_bit(0x30, VCPU(v, delivery_mask));
977 return 0;
978 }

// FIXME: When dom0 can construct domains, this goes away (or is rewritten)
int construct_domU(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc;
    struct vcpu *v = d->vcpu[0];
    unsigned long pkern_entry;

#ifndef DOMU_AUTO_RESTART
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
#endif

    printk("*** LOADING DOMAIN %d ***\n",d->domain_id);

    d->max_pages = dom0_size/PAGE_SIZE; // FIXME: use dom0 size
    // FIXME: use domain0 command line
    rc = parsedomainelfimage(image_start, image_len, &pkern_entry);
    printk("parsedomainelfimage returns %d\n",rc);
    if ( rc != 0 ) return rc;

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    /* Copy the OS image. */
    printk("calling loaddomainelfimage(%p,%p)\n",d,image_start);
    loaddomainelfimage(d,image_start);
    printk("loaddomainelfimage returns\n");

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    printk("calling new_thread, entry=%p\n",pkern_entry);
#ifdef DOMU_AUTO_RESTART
    v->domain->arch.image_start = image_start;
    v->domain->arch.image_len = image_len;
    v->domain->arch.entry = pkern_entry;
#endif
    new_thread(v, pkern_entry, 0, 0);
    printk("new_thread returns\n");
    sync_split_caches();
    __set_bit(0x30, VCPU(v, delivery_mask));

    return 0;
}

#ifdef DOMU_AUTO_RESTART
void reconstruct_domU(struct vcpu *v)
{
    /* re-copy the OS image to reset data values to original */
    printk("reconstruct_domU: restarting domain %d...\n",
           v->domain->domain_id);
    loaddomainelfimage(v->domain, v->domain->arch.image_start);
    new_thread(v, v->domain->arch.entry, 0, 0);
    sync_split_caches();
}
#endif

void machine_restart(char * __unused)
{
    if (platform_is_hp_ski()) dummy();
    printf("machine_restart called: spinning....\n");
    while(1);
}

void machine_halt(void)
{
    if (platform_is_hp_ski()) dummy();
    printf("machine_halt called: spinning....\n");
    while(1);
}

void dummy_called(char *function)
{
    if (platform_is_hp_ski()) asm("break 0;;");
    printf("dummy called in %s: spinning....\n", function);
    while(1);
}

#if 0
void switch_to(struct vcpu *prev, struct vcpu *next)
{
    struct vcpu *last;

    __switch_to(prev,next,last);
    //set_current(next);
}
#endif

void domain_pend_keyboard_interrupt(int irq)
{
    vcpu_pend_interrupt(dom0->vcpu[0],irq);
}

void sync_vcpu_execstate(struct vcpu *v)
{
    ia64_save_fpu(v->arch._thread.fph);
    if (VMX_DOMAIN(v))
        vmx_save_state(v);
    else {
        if (IA64_HAS_EXTRA_STATE(v))
            ia64_save_extra(v);
    }
    // FIXME SMP: Anything else needed here for SMP?
}

// FIXME: It would be nice to print out a nice error message for bad
//  values of these boot-time parameters, but it seems we are too early
//  in the boot and attempts to print freeze the system?
#define abort(x...) do {} while(0)
#define warn(x...) do {} while(0)

static void parse_dom0_mem(char *s)
{
    unsigned long bytes = parse_size_and_unit(s);

    /* Validate and align the requested size before committing it to
     * dom0_size (the original tested dom0_size before it was updated). */
    if (bytes < 4 * 1024 * 1024) {
        abort("parse_dom0_mem: too small, boot aborted"
              " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
    }
    if (bytes % dom0_align) {
        dom0_size = ((bytes / dom0_align) + 1) * dom0_align;
        warn("parse_dom0_mem: dom0_size rounded up from"
             " %lx to %lx bytes, due to dom0_align=%lx\n",
             bytes, dom0_size, dom0_align);
    }
    else dom0_size = bytes;
}
custom_param("dom0_mem", parse_dom0_mem);

static void parse_dom0_align(char *s)
{
    unsigned long bytes = parse_size_and_unit(s);

    if (bytes & (bytes - 1)) {  /* not a power of two (the original used
                                 * '^', which is nonzero even for powers
                                 * of two) */
        abort("parse_dom0_align: dom0_align must be power of two, "
              "boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
    }
    else if (bytes < PAGE_SIZE) {
        abort("parse_dom0_align: dom0_align must be >= %ld, "
              "boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              PAGE_SIZE);
    }
    else dom0_align = bytes;
    if (dom0_size % dom0_align) {
        dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
        warn("parse_dom0_align: dom0_size rounded up from"
             " %ld to %ld bytes, due to dom0_align=%lx\n",
             bytes, dom0_size, dom0_align);
    }
}
custom_param("dom0_align", parse_dom0_align);