ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 8440:903fb46f240e

Small bug fixes
author djm@kirby.fc.hp.com
date Tue Jan 03 08:59:00 2006 -0600 (2006-01-03)
parents f89906acd9f6
children cd914808acf1
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <xen/mm.h>
22 #include <asm/ptrace.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/hw_irq.h>
28 //#include <asm/mpspec.h>
29 #include <xen/irq.h>
30 #include <xen/event.h>
31 //#include <xen/shadow.h>
32 #include <xen/console.h>
33 #include <xen/compile.h>
35 #include <xen/elf.h>
36 //#include <asm/page.h>
37 #include <asm/pgalloc.h>
38 #include <asm/dma.h> /* for MAX_DMA_ADDRESS */
40 #include <asm/asm-offsets.h> /* for IA64_THREAD_INFO_SIZE */
42 #include <asm/vcpu.h> /* for function declarations */
43 #include <public/arch-ia64.h>
44 #include <asm/vmx.h>
45 #include <asm/vmx_vcpu.h>
46 #include <asm/vmx_vpd.h>
47 #include <asm/pal.h>
48 #include <public/io/ioreq.h>
50 #define CONFIG_DOMAIN0_CONTIGUOUS
51 unsigned long dom0_start = -1L;
52 unsigned long dom0_size = 512*1024*1024;
53 unsigned long dom0_align = 64*1024*1024;
55 // initialized by arch/ia64/setup.c:find_initrd()
56 unsigned long initrd_start = 0, initrd_end = 0;
57 extern unsigned long running_on_sim;
59 #define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))
61 //extern int loadelfimage(char *);
62 extern int readelfimage_base_and_size(char *, unsigned long,
63 unsigned long *, unsigned long *, unsigned long *);
65 unsigned long map_domain_page0(struct domain *);
66 extern unsigned long dom_fw_setup(struct domain *, char *, int);
68 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
69 void free_perdomain_pt(struct domain *d)
70 {
71 printf("free_perdomain_pt: not implemented\n");
72 //free_page((unsigned long)d->mm.perdomain_pt);
73 }
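/* Idle-vcpu helpers: default_idle() halts the CPU until a softirq is pending;
 * continue_cpu_idle_loop() then runs softirqs as they arrive. */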
75 static void default_idle(void)
76 {
77 int cpu = smp_processor_id();
78 local_irq_disable();
79 if ( !softirq_pending(cpu))
80 safe_halt();
81 local_irq_enable();
82 }
84 static void continue_cpu_idle_loop(void)
85 {
86 int cpu = smp_processor_id();
87 for ( ; ; )
88 {
89 printf ("idle%dD\n", cpu);
90 #ifdef IA64
91 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
92 #else
93 irq_stat[cpu].idle_timestamp = jiffies;
94 #endif
95 while ( !softirq_pending(cpu) )
96 default_idle();
97 add_preempt_count(SOFTIRQ_OFFSET);
98 raise_softirq(SCHEDULE_SOFTIRQ);
99 do_softirq();
100 sub_preempt_count(SOFTIRQ_OFFSET);
101 }
102 }
104 void startup_cpu_idle_loop(void)
105 {
106 int cpu = smp_processor_id ();
107 /* Just some sanity to ensure that the scheduler is set up okay. */
108 ASSERT(current->domain == IDLE_DOMAIN_ID);
109 printf ("idle%dA\n", cpu);
110 raise_softirq(SCHEDULE_SOFTIRQ);
111 #if 0 /* All this work is done within continue_cpu_idle_loop */
112 printf ("idle%dB\n", cpu);
113 asm volatile ("mov ar.k2=r0");
114 do_softirq();
115 printf ("idle%dC\n", cpu);
117 /*
118 * Declares CPU setup done to the boot processor.
119 * Therefore memory barrier to ensure state is visible.
120 */
121 smp_mb();
122 #endif
123 #if 0
124 //do we have to ensure the idle task has a shared page so that, for example,
125 //region registers can be loaded from it. Apparently not...
126 idle0_task.shared_info = (void *)alloc_xenheap_page();
127 memset(idle0_task.shared_info, 0, PAGE_SIZE);
128 /* pin mapping */
129 // FIXME: Does this belong here? Or do only at domain switch time?
130 {
131 /* WARNING: following must be inlined to avoid nested fault */
132 unsigned long psr = ia64_clear_ic();
133 ia64_itr(0x2, IA64_TR_SHARED_INFO, SHAREDINFO_ADDR,
134 pte_val(pfn_pte(ia64_tpa(idle0_task.shared_info) >> PAGE_SHIFT, PAGE_KERNEL)),
135 PAGE_SHIFT);
136 ia64_set_psr(psr);
137 ia64_srlz_i();
138 }
139 #endif
141 continue_cpu_idle_loop();
142 }
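/* Allocate a vcpu: the structure shares a xenheap allocation with its kernel
 * stack, arch state is copied from idle0_vcpu, and a privregs area is allocated. */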
144 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
145 {
146 struct vcpu *v;
148 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
149 return NULL;
151 memset(v, 0, sizeof(*v));
152 memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
153 v->arch.privregs =
154 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
155 printf("arch_vcpu_info=%p\n", v->arch.privregs);
156 memset(v->arch.privregs, 0, PAGE_SIZE);
158 return v;
159 }
161 void free_vcpu_struct(struct vcpu *v)
162 {
163 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
164 }
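/* Build the initial pt_regs/switch_stack frames so the first context switch
 * into this vcpu unwinds through ia64_ret_from_clone. */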
166 static void init_switch_stack(struct vcpu *v)
167 {
168 struct pt_regs *regs = vcpu_regs (v);
169 struct switch_stack *sw = (struct switch_stack *) regs - 1;
170 extern void ia64_ret_from_clone;
172 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
173 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
174 sw->b0 = (unsigned long) &ia64_ret_from_clone;
175 sw->ar_fpsr = FPSR_DEFAULT;
176 v->arch._thread.ksp = (unsigned long) sw - 16;
177 // stay on kernel stack because may get interrupts!
178 // ia64_ret_from_clone (which b0 gets in new_thread) switches
179 // to user stack
180 v->arch._thread.on_ustack = 0;
181 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
182 }
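/* Arch-specific domain creation: allocate the shared_info page, metaphysical
 * region IDs and a RID range, plus (for non-idle domains) the mm_struct and
 * pgd that hold the metaphysical-to-machine mappings. */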
184 void arch_do_createdomain(struct vcpu *v)
185 {
186 struct domain *d = v->domain;
187 struct thread_info *ti = alloc_thread_info(v);
189 /* Clear thread_info to clear some important fields, like preempt_count */
190 memset(ti, 0, sizeof(struct thread_info));
191 init_switch_stack(v);
193 d->shared_info = (void *)alloc_xenheap_page();
194 if (!d->shared_info) {
195 printk("ERROR/HALTING: CAN'T ALLOC PAGE\n");
196 while (1);
197 }
198 memset(d->shared_info, 0, PAGE_SIZE);
199 if (v == d->vcpu[0])
200 memset(&d->shared_info->evtchn_mask[0], 0xff,
201 sizeof(d->shared_info->evtchn_mask));
202 #if 0
203 d->vcpu[0].arch.privregs =
204 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
205 printf("arch_vcpu_info=%p\n", d->vcpu[0].arch.privregs);
206 memset(d->vcpu.arch.privregs, 0, PAGE_SIZE);
207 #endif
208 v->vcpu_info = &(d->shared_info->vcpu_info[0]);
210 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
212 /* We may also need an emulation RID for region 4, though it's unlikely
213 * that a guest will issue uncacheable accesses in metaphysical mode. But
214 * keeping such info here may be saner.
215 */
216 if (((d->arch.metaphysical_rr0 = allocate_metaphysical_rr()) == -1UL)
217 || ((d->arch.metaphysical_rr4 = allocate_metaphysical_rr()) == -1UL))
218 BUG();
219 // VCPU(v, metaphysical_mode) = 1;
220 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
221 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
222 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
223 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
224 #define DOMAIN_RID_BITS_DEFAULT 18
225 if (!allocate_rid_range(d,DOMAIN_RID_BITS_DEFAULT)) // FIXME
226 BUG();
227 v->arch.starting_rid = d->arch.starting_rid;
228 v->arch.ending_rid = d->arch.ending_rid;
229 // the following will eventually need to be negotiated dynamically
230 d->xen_vastart = XEN_START_ADDR;
231 d->xen_vaend = XEN_END_ADDR;
232 d->shared_info_va = SHAREDINFO_ADDR;
233 d->arch.breakimm = 0x1000;
234 v->arch.breakimm = d->arch.breakimm;
236 d->arch.sys_pgnr = 0;
237 if (d->domain_id != IDLE_DOMAIN_ID) {
238 d->arch.mm = xmalloc(struct mm_struct);
239 if (unlikely(!d->arch.mm)) {
240 printk("Can't allocate mm_struct for domain %d\n",d->domain_id);
241 return -ENOMEM;
242 }
243 memset(d->arch.mm, 0, sizeof(*d->arch.mm));
244 d->arch.mm->pgd = pgd_alloc(d->arch.mm);
245 if (unlikely(!d->arch.mm->pgd)) {
246 printk("Can't allocate pgd for domain %d\n",d->domain_id);
247 return -ENOMEM;
248 }
249 } else
250 d->arch.mm = NULL;
251 printf ("arch_do_create_domain: domain=%p\n", d);
252 }
254 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
255 {
256 struct pt_regs *regs = vcpu_regs (v);
258 printf("arch_getdomaininfo_ctxt\n");
259 c->regs = *regs;
260 c->vcpu.evtchn_vector = v->vcpu_info->arch.evtchn_vector;
262 c->shared = v->domain->shared_info->arch;
263 }
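/* Apply the vcpu_guest_context supplied by the domain builder: registers,
 * initrd and command-line info, the privregs area, and (for VMX guests)
 * platform setup. */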
265 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
266 {
267 struct pt_regs *regs = vcpu_regs (v);
268 struct domain *d = v->domain;
269 int i, rc, ret;
270 unsigned long progress = 0;
272 printf("arch_set_info_guest\n");
273 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
274 return 0;
275 if (c->flags & VGCF_VMX_GUEST) {
276 if (!vmx_enabled) {
277 printk("No VMX hardware feature for vmx domain.\n");
278 return -EINVAL;
279 }
281 if (v == d->vcpu[0])
282 vmx_setup_platform(d, c);
284 vmx_final_setup_guest(v);
285 }
287 *regs = c->regs;
288 d->arch.sys_pgnr = c->sys_pgnr;
289 d->arch.initrd_start = c->initrd.start;
290 d->arch.initrd_len = c->initrd.size;
291 d->arch.cmdline = c->cmdline;
292 new_thread(v, regs->cr_iip, 0, 0);
294 sync_split_caches();
295 v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector;
296 if ( c->vcpu.privregs && copy_from_user(v->arch.privregs,
297 c->vcpu.privregs, sizeof(mapped_regs_t))) {
298 printk("Bad ctxt address in arch_set_info_guest: 0x%lx\n", c->vcpu.privregs);
299 return -EFAULT;
300 }
302 v->arch.domain_itm_last = -1L;
303 d->shared_info->arch = c->shared;
305 /* Don't redo final setup */
306 set_bit(_VCPUF_initialised, &v->vcpu_flags);
307 return 0;
308 }
310 void domain_relinquish_resources(struct domain *d)
311 {
312 /* FIXME */
313 printf("domain_relinquish_resources: not implemented\n");
314 }
316 // heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
317 // and linux/arch/ia64/kernel/process.c:kernel_thread()
318 void new_thread(struct vcpu *v,
319 unsigned long start_pc,
320 unsigned long start_stack,
321 unsigned long start_info)
322 {
323 struct domain *d = v->domain;
324 struct pt_regs *regs;
325 struct ia64_boot_param *bp;
326 extern char saved_command_line[];
329 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
330 if (d == dom0) start_pc += dom0_start;
331 #endif
333 regs = vcpu_regs (v);
334 if (VMX_DOMAIN(v)) {
335 /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
336 regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
337 } else {
338 regs->cr_ipsr = ia64_getreg(_IA64_REG_PSR)
339 | IA64_PSR_BITS_TO_SET | IA64_PSR_BN
340 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS);
341 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
342 }
343 regs->cr_iip = start_pc;
344 regs->cr_ifs = 1UL << 63; /* or clear? */
345 regs->ar_fpsr = FPSR_DEFAULT;
347 if (VMX_DOMAIN(v)) {
348 vmx_init_all_rr(v);
349 if (d == dom0)
350 // VCPU(v,vgr[12]) = dom_fw_setup(d,saved_command_line,256L);
351 regs->r28 = dom_fw_setup(d,saved_command_line,256L);
352 /* Virtual processor context setup */
353 VCPU(v, vpsr) = IA64_PSR_BN;
354 VCPU(v, dcr) = 0;
355 } else {
356 init_all_rr(v);
357 if (d == dom0)
358 regs->r28 = dom_fw_setup(d,saved_command_line,256L);
359 else {
360 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
361 if (*d->arch.cmdline == '\0') {
362 #define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
363 regs->r28 = dom_fw_setup(d,DEFAULT_CMDLINE,256L);
364 printf("domU command line defaulted to"
365 DEFAULT_CMDLINE "\n");
366 }
367 else regs->r28 = dom_fw_setup(d,d->arch.cmdline,256L);
368 }
369 VCPU(v, banknum) = 1;
370 VCPU(v, metaphysical_mode) = 1;
371 d->shared_info->arch.flags = (d == dom0) ? (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
372 }
373 }
375 static struct page * map_new_domain0_page(unsigned long mpaddr)
376 {
377 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
378 printk("map_new_domain0_page: bad domain0 mpaddr %p!\n",mpaddr);
379 printk("map_new_domain0_page: start=%p,end=%p!\n",dom0_start,dom0_start+dom0_size);
380 while(1);
381 }
382 return pfn_to_page((mpaddr >> PAGE_SHIFT));
383 }
385 /* allocate new page for domain and map it to the specified metaphysical addr */
386 struct page * map_new_domain_page(struct domain *d, unsigned long mpaddr)
387 {
388 struct mm_struct *mm = d->arch.mm;
389 struct page *p = (struct page *)0;
390 pgd_t *pgd;
391 pud_t *pud;
392 pmd_t *pmd;
393 pte_t *pte;
394 extern unsigned long vhpt_paddr, vhpt_pend;
396 if (!mm->pgd) {
397 printk("map_new_domain_page: domain pgd must exist!\n");
398 return(p);
399 }
400 pgd = pgd_offset(mm,mpaddr);
401 if (pgd_none(*pgd))
402 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
404 pud = pud_offset(pgd, mpaddr);
405 if (pud_none(*pud))
406 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
408 pmd = pmd_offset(pud, mpaddr);
409 if (pmd_none(*pmd))
410 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
411 // pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));
413 pte = pte_offset_map(pmd, mpaddr);
414 if (pte_none(*pte)) {
415 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
416 if (d == dom0) p = map_new_domain0_page(mpaddr);
417 else
418 #endif
419 {
420 p = alloc_domheap_page(d);
421 // zero out pages for security reasons
422 if (p) memset(__va(page_to_phys(p)),0,PAGE_SIZE);
423 }
424 if (unlikely(!p)) {
425 printf("map_new_domain_page: Can't alloc!!!! Aaaargh!\n");
426 return(p);
427 }
428 if (unlikely(page_to_phys(p) > vhpt_paddr && page_to_phys(p) < vhpt_pend)) {
429 printf("map_new_domain_page: reassigned vhpt page %p!!\n",page_to_phys(p));
430 }
431 set_pte(pte, pfn_pte(page_to_phys(p) >> PAGE_SHIFT,
432 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
433 }
434 else printk("map_new_domain_page: mpaddr %lx already mapped!\n",mpaddr);
435 return p;
436 }
438 /* map a physical address to the specified metaphysical addr */
439 void map_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr)
440 {
441 struct mm_struct *mm = d->arch.mm;
442 pgd_t *pgd;
443 pud_t *pud;
444 pmd_t *pmd;
445 pte_t *pte;
447 if (!mm->pgd) {
448 printk("map_domain_page: domain pgd must exist!\n");
449 return;
450 }
451 pgd = pgd_offset(mm,mpaddr);
452 if (pgd_none(*pgd))
453 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
455 pud = pud_offset(pgd, mpaddr);
456 if (pud_none(*pud))
457 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
459 pmd = pmd_offset(pud, mpaddr);
460 if (pmd_none(*pmd))
461 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
462 // pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));
464 pte = pte_offset_map(pmd, mpaddr);
465 if (pte_none(*pte)) {
466 set_pte(pte, pfn_pte(physaddr >> PAGE_SHIFT,
467 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
468 }
469 else printk("map_domain_page: mpaddr %lx already mapped!\n",mpaddr);
470 }
471 #if 0
472 /* map a physical address with specified I/O flag */
473 void map_domain_io_page(struct domain *d, unsigned long mpaddr, unsigned long flags)
474 {
475 struct mm_struct *mm = d->arch.mm;
476 pgd_t *pgd;
477 pud_t *pud;
478 pmd_t *pmd;
479 pte_t *pte;
480 pte_t io_pte;
482 if (!mm->pgd) {
483 printk("map_domain_page: domain pgd must exist!\n");
484 return;
485 }
486 ASSERT(flags & GPFN_IO_MASK);
488 pgd = pgd_offset(mm,mpaddr);
489 if (pgd_none(*pgd))
490 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
492 pud = pud_offset(pgd, mpaddr);
493 if (pud_none(*pud))
494 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
496 pmd = pmd_offset(pud, mpaddr);
497 if (pmd_none(*pmd))
498 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
499 // pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));
501 pte = pte_offset_map(pmd, mpaddr);
502 if (pte_none(*pte)) {
503 pte_val(io_pte) = flags;
504 set_pte(pte, io_pte);
505 }
506 else printk("map_domain_page: mpaddr %lx already mapped!\n",mpaddr);
507 }
508 #endif
509 void mpafoo(unsigned long mpaddr)
510 {
511 extern unsigned long privop_trace;
512 if (mpaddr == 0x3800)
513 privop_trace = 1;
514 }
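/* Translate a domain metaphysical address to a pte value by walking the
 * domain's page tables, allocating and mapping a new page on demand if the
 * address is below max_pages. */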
516 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
517 {
518 struct mm_struct *mm = d->arch.mm;
519 pgd_t *pgd = pgd_offset(mm, mpaddr);
520 pud_t *pud;
521 pmd_t *pmd;
522 pte_t *pte;
524 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
525 if (d == dom0) {
526 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
527 //printk("lookup_domain_mpa: bad dom0 mpaddr %p!\n",mpaddr);
528 //printk("lookup_domain_mpa: start=%p,end=%p!\n",dom0_start,dom0_start+dom0_size);
529 mpafoo(mpaddr);
530 }
531 pte_t pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
532 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
533 pte = &pteval;
534 return *(unsigned long *)pte;
535 }
536 #endif
537 tryagain:
538 if (pgd_present(*pgd)) {
539 pud = pud_offset(pgd,mpaddr);
540 if (pud_present(*pud)) {
541 pmd = pmd_offset(pud,mpaddr);
542 if (pmd_present(*pmd)) {
543 pte = pte_offset_map(pmd,mpaddr);
544 if (pte_present(*pte)) {
545 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
546 return *(unsigned long *)pte;
547 } else if (VMX_DOMAIN(d->vcpu[0]))
548 return GPFN_INV_MASK;
549 }
550 }
551 }
552 /* if lookup fails and mpaddr is "legal", "create" the page */
553 if ((mpaddr >> PAGE_SHIFT) < d->max_pages) {
554 if (map_new_domain_page(d,mpaddr)) goto tryagain;
555 }
556 printk("lookup_domain_mpa: bad mpa %p (> %p\n",
557 mpaddr,d->max_pages<<PAGE_SHIFT);
558 mpafoo(mpaddr);
559 return 0;
560 }
562 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
563 #if 1
564 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
565 {
566 unsigned long pte = lookup_domain_mpa(d,mpaddr);
567 unsigned long imva;
569 pte &= _PAGE_PPN_MASK;
570 imva = __va(pte);
571 imva |= mpaddr & ~PAGE_MASK;
572 return(imva);
573 }
574 #else
575 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
576 {
577 unsigned long imva = __gpa_to_mpa(d, mpaddr);
579 return __va(imva);
580 }
581 #endif
583 // remove following line if not privifying in memory
584 //#define HAVE_PRIVIFY_MEMORY
585 #ifndef HAVE_PRIVIFY_MEMORY
586 #define privify_memory(x,y) do {} while(0)
587 #endif
589 // see arch/x86/xxx/domain_build.c
590 int elf_sanity_check(Elf_Ehdr *ehdr)
591 {
592 return (IS_ELF(*ehdr));
593 }
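/* Copy size bytes to dst: a plain memcpy when the source is a Xen address,
 * otherwise __copy_from_user, retried until the copy completes. */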
595 static void copy_memory(void *dst, void *src, int size)
596 {
597 int remain;
599 if (IS_XEN_ADDRESS(dom0,src)) {
600 memcpy(dst,src,size);
601 }
602 else {
603 printf("About to call __copy_from_user(%p,%p,%d)\n",
604 dst,src,size);
605 while (remain = __copy_from_user(dst,src,size)) {
606 printf("incomplete user copy, %d remain of %d\n",
607 remain,size);
608 dst += size - remain; src += size - remain;
609 size -= remain;
610 }
611 }
612 }
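/* Load each PT_LOAD segment of the guest ELF image: dom0 is copied straight
 * into its contiguous region, other domains a page at a time via
 * map_new_domain_page(). */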
614 void loaddomainelfimage(struct domain *d, unsigned long image_start)
615 {
616 char *elfbase = image_start;
617 //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
618 Elf_Ehdr ehdr;
619 Elf_Phdr phdr;
620 int h, filesz, memsz, paddr;
621 unsigned long elfaddr, dom_mpaddr, dom_imva;
622 struct page *p;
623 unsigned long pteval;
625 copy_memory(&ehdr,image_start,sizeof(Elf_Ehdr));
626 for ( h = 0; h < ehdr.e_phnum; h++ ) {
627 copy_memory(&phdr,elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
628 sizeof(Elf_Phdr));
629 //if ( !is_loadable_phdr(phdr) )
630 if ((phdr.p_type != PT_LOAD)) {
631 continue;
632 }
633 filesz = phdr.p_filesz; memsz = phdr.p_memsz;
634 elfaddr = elfbase + phdr.p_offset;
635 dom_mpaddr = phdr.p_paddr;
636 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
637 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
638 if (d == dom0) {
639 if (dom_mpaddr+memsz>dom0_size || dom_mpaddr+filesz>dom0_size) {
640 printf("Domain0 doesn't fit in allocated space!\n");
641 while(1);
642 }
643 dom_imva = __va(dom_mpaddr + dom0_start);
644 copy_memory(dom_imva,elfaddr,filesz);
645 if (memsz > filesz) memset(dom_imva+filesz,0,memsz-filesz);
646 //FIXME: This test for code seems to find a lot more than objdump -x does
647 if (phdr.p_flags & PF_X) privify_memory(dom_imva,filesz);
648 }
649 else
650 #endif
651 while (memsz > 0) {
652 #ifdef DOMU_AUTO_RESTART
653 pteval = lookup_domain_mpa(d,dom_mpaddr);
654 if (pteval) dom_imva = __va(pteval & _PFN_MASK);
655 else { printf("loaddomainelfimage: BAD!\n"); while(1); }
656 #else
657 p = map_new_domain_page(d,dom_mpaddr);
658 if (unlikely(!p)) BUG();
659 dom_imva = __va(page_to_phys(p));
660 #endif
661 if (filesz > 0) {
662 if (filesz >= PAGE_SIZE)
663 copy_memory(dom_imva,elfaddr,PAGE_SIZE);
664 else { // copy partial page, zero the rest of page
665 copy_memory(dom_imva,elfaddr,filesz);
666 memset(dom_imva+filesz,0,PAGE_SIZE-filesz);
667 }
668 //FIXME: This test for code seems to find a lot more than objdump -x does
669 if (phdr.p_flags & PF_X)
670 privify_memory(dom_imva,PAGE_SIZE);
671 }
672 else if (memsz > 0) // always zero out entire page
673 memset(dom_imva,0,PAGE_SIZE);
674 memsz -= PAGE_SIZE; filesz -= PAGE_SIZE;
675 elfaddr += PAGE_SIZE; dom_mpaddr += PAGE_SIZE;
676 }
677 }
678 }
680 int
681 parsedomainelfimage(char *elfbase, unsigned long elfsize, unsigned long *entry)
682 {
683 Elf_Ehdr ehdr;
685 copy_memory(&ehdr,elfbase,sizeof(Elf_Ehdr));
687 if ( !elf_sanity_check(&ehdr) ) {
688 printk("ELF sanity check failed.\n");
689 return -EINVAL;
690 }
692 if ( (ehdr.e_phoff + (ehdr.e_phnum * ehdr.e_phentsize)) > elfsize )
693 {
694 printk("ELF program headers extend beyond end of image.\n");
695 return -EINVAL;
696 }
698 if ( (ehdr.e_shoff + (ehdr.e_shnum * ehdr.e_shentsize)) > elfsize )
699 {
700 printk("ELF section headers extend beyond end of image.\n");
701 return -EINVAL;
702 }
704 #if 0
705 /* Find the section-header strings table. */
706 if ( ehdr.e_shstrndx == SHN_UNDEF )
707 {
708 printk("ELF image has no section-header strings table (shstrtab).\n");
709 return -EINVAL;
710 }
711 #endif
713 *entry = ehdr.e_entry;
714 printf("parsedomainelfimage: entry point = %p\n",*entry);
716 return 0;
717 }
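/* Reserve a contiguous block of machine memory for domain 0 from the boot
 * allocator. */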
720 void alloc_dom0(void)
721 {
722 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
723 if (platform_is_hp_ski()) {
724 dom0_size = 128*1024*1024; //FIXME: Should be configurable
725 }
726 printf("alloc_dom0: starting (initializing %d MB...)\n",dom0_size/(1024*1024));
728 /* FIXME: The first chunk (say 256M) should always be assigned to
729 * Dom0, since Dom0's physical address == machine address for DMA purposes.
730 * Some older Linux versions, like 2.4, assume physical memory exists
731 * in the 2nd 64M of the address space.
732 */
733 dom0_start = alloc_boot_pages(
734 dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
735 dom0_start <<= PAGE_SHIFT;
736 if (!dom0_start) {
737 printf("construct_dom0: can't allocate contiguous memory size=%p\n",
738 dom0_size);
739 while(1);
740 }
741 printf("alloc_dom0: dom0_start=%p\n",dom0_start);
742 #else
743 dom0_start = 0;
744 #endif
746 }
749 /*
750 * Domain 0 has direct access to all devices. However, the main
751 * point of this stub is to allow alloc_dom_mem to handle requests
752 * with order > 0. Dom0 requires that bit set in order to
753 * allocate memory for other domains.
754 */
755 void physdev_init_dom0(struct domain *d)
756 {
757 set_bit(_DOMF_physdev_access, &d->domain_flags);
758 }
760 unsigned int vmx_dom0 = 0;
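/* Build domain 0: parse and load its ELF image, copy the initial ramdisk,
 * set up the start_info page, and start the initial thread. */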
761 int construct_dom0(struct domain *d,
762 unsigned long image_start, unsigned long image_len,
763 unsigned long initrd_start, unsigned long initrd_len,
764 char *cmdline)
765 {
766 char *dst;
767 int i, rc;
768 unsigned long pfn, mfn;
769 unsigned long nr_pt_pages;
770 unsigned long count;
771 unsigned long alloc_start, alloc_end;
772 struct pfn_info *page = NULL;
773 start_info_t *si;
774 struct vcpu *v = d->vcpu[0];
776 struct domain_setup_info dsi;
777 unsigned long p_start;
778 unsigned long pkern_start;
779 unsigned long pkern_entry;
780 unsigned long pkern_end;
781 unsigned long pinitrd_start = 0;
782 unsigned long ret, progress = 0;
784 //printf("construct_dom0: starting\n");
786 #ifndef CLONE_DOMAIN0
787 /* Sanity! */
788 BUG_ON(d != dom0);
789 BUG_ON(d->vcpu[0] == NULL);
790 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
791 #endif
793 memset(&dsi, 0, sizeof(struct domain_setup_info));
795 printk("*** LOADING DOMAIN 0 ***\n");
797 alloc_start = dom0_start;
798 alloc_end = dom0_start + dom0_size;
799 d->tot_pages = d->max_pages = dom0_size/PAGE_SIZE;
800 dsi.image_addr = (unsigned long)image_start;
801 dsi.image_len = image_len;
802 rc = parseelfimage(&dsi);
803 if ( rc != 0 )
804 return rc;
806 #ifdef VALIDATE_VT
807 /* Temp workaround */
808 if (running_on_sim)
809 dsi.xen_section_string = (char *)1;
811 /* Check whether dom0 is vti domain */
812 if ((!vmx_enabled) && !dsi.xen_section_string) {
813 printk("Lack of hardware support for unmodified vmx dom0\n");
814 panic("");
815 }
817 if (vmx_enabled && !dsi.xen_section_string) {
818 printk("Dom0 is vmx domain!\n");
819 vmx_dom0 = 1;
820 }
821 #endif
823 p_start = dsi.v_start;
824 pkern_start = dsi.v_kernstart;
825 pkern_end = dsi.v_kernend;
826 pkern_entry = dsi.v_kernentry;
828 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
830 if ( (p_start & (PAGE_SIZE-1)) != 0 )
831 {
832 printk("Initial guest OS must load to a page boundary.\n");
833 return -EINVAL;
834 }
836 if(initrd_start&&initrd_len){
837 pinitrd_start=(dom0_start+dom0_size) -
838 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
840 memcpy(__va(pinitrd_start),initrd_start,initrd_len);
841 }
843 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
844 " Kernel image: %lx->%lx\n"
845 " Entry address: %lx\n"
846 " Init. ramdisk: %lx len %lx\n",
847 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len);
849 if ( (pkern_end - pkern_start) > (d->max_pages * PAGE_SIZE) )
850 {
851 printk("Initial guest OS requires too much space\n"
852 "(%luMB is greater than %luMB limit)\n",
853 (pkern_end-pkern_start)>>20, (d->max_pages<<PAGE_SHIFT)>>20);
854 return -ENOMEM;
855 }
857 // if high 3 bits of pkern start are non-zero, error
859 // if pkern end is after end of metaphysical memory, error
860 // (we should be able to deal with this... later)
863 //
865 #if 0
866 strcpy(d->name,"Domain0");
867 #endif
869 /* Mask all upcalls... */
870 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
871 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
873 #ifdef VALIDATE_VT
874 /* Construct a frame-allocation list for the initial domain, since these
875 * pages are allocated by the boot allocator and their pfns are not set up
876 * properly. */
877 for ( mfn = (alloc_start>>PAGE_SHIFT);
878 mfn < (alloc_end>>PAGE_SHIFT);
879 mfn++ )
880 {
881 page = &frame_table[mfn];
882 page_set_owner(page, d);
883 page->u.inuse.type_info = 0;
884 page->count_info = PGC_allocated | 1;
885 list_add_tail(&page->list, &d->page_list);
887 /* Construct 1:1 mapping */
888 machine_to_phys_mapping[mfn] = mfn;
889 }
891 #endif
893 /* Copy the OS image. */
894 loaddomainelfimage(d,image_start);
896 /* Copy the initial ramdisk. */
897 //if ( initrd_len != 0 )
898 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
901 /* Set up start info area. */
902 si = (start_info_t *)alloc_xenheap_page();
903 memset(si, 0, PAGE_SIZE);
904 d->shared_info->arch.start_info_pfn = __pa(si) >> PAGE_SHIFT;
905 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
906 si->nr_pages = d->tot_pages;
908 #if 0
909 si->shared_info = virt_to_phys(d->shared_info);
910 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
911 //si->pt_base = vpt_start;
912 //si->nr_pt_frames = nr_pt_pages;
913 //si->mfn_list = vphysmap_start;
915 if ( initrd_len != 0 )
916 {
917 //si->mod_start = vinitrd_start;
918 si->mod_len = initrd_len;
919 printk("Initrd len 0x%lx, start at 0x%08lx\n",
920 si->mod_len, si->mod_start);
921 }
923 dst = si->cmd_line;
924 if ( cmdline != NULL )
925 {
926 for ( i = 0; i < 255; i++ )
927 {
928 if ( cmdline[i] == '\0' )
929 break;
930 *dst++ = cmdline[i];
931 }
932 }
933 *dst = '\0';
935 zap_low_mappings(); /* Do the same for the idle page tables. */
936 #endif
938 /* Give up the VGA console if DOM0 is configured to grab it. */
939 if (cmdline != NULL)
940 console_endboot(strstr(cmdline, "tty0") != NULL);
942 /* VMX-specific construction for Dom0, if hardware supports VMX
943 * and Dom0 is an unmodified image
944 */
945 printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
946 if (vmx_dom0)
947 vmx_final_setup_guest(v);
949 set_bit(_VCPUF_initialised, &v->vcpu_flags);
951 new_thread(v, pkern_entry, 0, 0);
952 physdev_init_dom0(d);
953 sync_split_caches();
955 // FIXME: Hack for keyboard input
956 #ifdef CLONE_DOMAIN0
957 if (d == dom0)
958 #endif
959 serial_input_init();
960 if (d == dom0) {
961 VCPU(v, delivery_mask[0]) = -1L;
962 VCPU(v, delivery_mask[1]) = -1L;
963 VCPU(v, delivery_mask[2]) = -1L;
964 VCPU(v, delivery_mask[3]) = -1L;
965 }
966 else __set_bit(0x30, VCPU(v, delivery_mask));
968 return 0;
969 }
971 // FIXME: When dom0 can construct domains, this goes away (or is rewritten)
972 int construct_domU(struct domain *d,
973 unsigned long image_start, unsigned long image_len,
974 unsigned long initrd_start, unsigned long initrd_len,
975 char *cmdline)
976 {
977 int i, rc;
978 struct vcpu *v = d->vcpu[0];
979 unsigned long pkern_entry;
981 #ifndef DOMU_AUTO_RESTART
982 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
983 #endif
985 printk("*** LOADING DOMAIN %d ***\n",d->domain_id);
987 d->max_pages = dom0_size/PAGE_SIZE; // FIXME: use dom0 size
988 // FIXME: use domain0 command line
989 rc = parsedomainelfimage(image_start, image_len, &pkern_entry);
990 printk("parsedomainelfimage returns %d\n",rc);
991 if ( rc != 0 ) return rc;
993 /* Mask all upcalls... */
994 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
995 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
997 /* Copy the OS image. */
998 printk("calling loaddomainelfimage(%p,%p)\n",d,image_start);
999 loaddomainelfimage(d,image_start);
1000 printk("loaddomainelfimage returns\n");
1002 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1004 printk("calling new_thread, entry=%p\n",pkern_entry);
1005 #ifdef DOMU_AUTO_RESTART
1006 v->domain->arch.image_start = image_start;
1007 v->domain->arch.image_len = image_len;
1008 v->domain->arch.entry = pkern_entry;
1009 #endif
1010 new_thread(v, pkern_entry, 0, 0);
1011 printk("new_thread returns\n");
1012 sync_split_caches();
1013 __set_bit(0x30, VCPU(v, delivery_mask));
1015 return 0;
1016 }
1018 #ifdef DOMU_AUTO_RESTART
1019 void reconstruct_domU(struct vcpu *v)
1020 {
1021 /* re-copy the OS image to reset data values to original */
1022 printk("reconstruct_domU: restarting domain %d...\n",
1023 v->domain->domain_id);
1024 loaddomainelfimage(v->domain,v->domain->arch.image_start);
1025 new_thread(v, v->domain->arch.entry, 0, 0);
1026 sync_split_caches();
1027 }
1028 #endif
1030 void machine_restart(char * __unused)
1031 {
1032 if (platform_is_hp_ski()) dummy();
1033 printf("machine_restart called: spinning....\n");
1034 while(1);
1035 }
1037 void machine_halt(void)
1038 {
1039 if (platform_is_hp_ski()) dummy();
1040 printf("machine_halt called: spinning....\n");
1041 while(1);
1042 }
1044 void dummy_called(char *function)
1045 {
1046 if (platform_is_hp_ski()) asm("break 0;;");
1047 printf("dummy called in %s: spinning....\n", function);
1048 while(1);
1049 }
1052 #if 0
1053 void switch_to(struct vcpu *prev, struct vcpu *next)
1054 {
1055 struct vcpu *last;
1057 __switch_to(prev,next,last);
1058 //set_current(next);
1059 }
1060 #endif
1062 void domain_pend_keyboard_interrupt(int irq)
1063 {
1064 vcpu_pend_interrupt(dom0->vcpu[0],irq);
1065 }
1067 void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
1068 {
1069 if ( v->processor == newcpu )
1070 return;
1072 set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
1073 v->processor = newcpu;
1074 }
1076 void sync_vcpu_execstate(struct vcpu *v)
1077 {
1078 ia64_save_fpu(v->arch._thread.fph);
1079 if (VMX_DOMAIN(v))
1080 vmx_save_state(v);
1081 else {
1082 if (IA64_HAS_EXTRA_STATE(v))
1083 ia64_save_extra(v);
1084 }
1085 // FIXME SMP: Anything else needed here for SMP?
1086 }
1088 // FIXME: It would be nice to print out a nice error message for bad
1089 // values of these boot-time parameters, but it seems we are too early
1090 // in the boot and attempts to print freeze the system?
1091 #define abort(x...) do {} while(0)
1092 #define warn(x...) do {} while(0)
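/* Boot-time handlers for the "dom0_mem=" and "dom0_align=" parameters;
 * dom0_size is kept a multiple of dom0_align. */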
1094 static void parse_dom0_mem(char *s)
1095 {
1096 unsigned long bytes = parse_size_and_unit(s);
1098 if (dom0_size < 4 * 1024 * 1024) {
1099 abort("parse_dom0_mem: too small, boot aborted"
1100 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1101 }
1102 if (dom0_size % dom0_align) {
1103 dom0_size = ((dom0_size / dom0_align) + 1) * dom0_align;
1104 warn("parse_dom0_mem: dom0_size rounded up from"
1105 " %lx to %lx bytes, due to dom0_align=%lx\n",
1106 bytes,dom0_size,dom0_align);
1107 }
1108 else dom0_size = bytes;
1109 }
1110 custom_param("dom0_mem", parse_dom0_mem);
1113 static void parse_dom0_align(char *s)
1114 {
1115 unsigned long bytes = parse_size_and_unit(s);
1117 if ((bytes - 1) ^ bytes) { /* not a power of two */
1118 abort("parse_dom0_align: dom0_align must be power of two, "
1119 "boot aborted"
1120 " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
1121 }
1122 else if (bytes < PAGE_SIZE) {
1123 abort("parse_dom0_align: dom0_align must be >= %ld, "
1124 "boot aborted"
1125 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
1126 PAGE_SIZE);
1127 }
1128 else dom0_align = bytes;
1129 if (dom0_size % dom0_align) {
1130 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
1131 warn("parse_dom0_align: dom0_size rounded up from"
1132 " %ld to %ld bytes, due to dom0_align=%lx\n",
1133 bytes,dom0_size,dom0_align);
1134 }
1135 }
1136 custom_param("dom0_align", parse_dom0_align);