ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 8381:724f412d81ab

Support domU initrd and cmdline (previous cset also)
Signed-off-by: Akio Takebe <takebe_akio@jp.fujitsu.com>
author djm@kirby.fc.hp.com
date Wed Dec 21 09:31:05 2005 -0600 (2005-12-21)
parents 85261a82e02c
children 40648452d45f 4369fd869f51
line source
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * Copyright (C) 2005 Intel Co
 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
 *
 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/mm.h>
#include <asm/ptrace.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
//#include <asm/mpspec.h>
#include <xen/irq.h>
#include <xen/event.h>
//#include <xen/shadow.h>
#include <xen/console.h>
#include <xen/compile.h>

#include <xen/elf.h>
//#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>  /* for MAX_DMA_ADDRESS */

#include <asm/asm-offsets.h>  /* for IA64_THREAD_INFO_SIZE */

#include <asm/vcpu.h>  /* for function declarations */
#include <public/arch-ia64.h>
#include <asm/vmx.h>
#include <asm/vmx_vcpu.h>
#include <asm/vmx_vpd.h>
#include <asm/pal.h>
#include <public/io/ioreq.h>

#define CONFIG_DOMAIN0_CONTIGUOUS
unsigned long dom0_start = -1L;
unsigned long dom0_size = 512*1024*1024;
unsigned long dom0_align = 64*1024*1024;
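
/*
 * Note: dom0 is carved out of a single contiguous, dom0_align-aligned chunk
 * of machine memory (see alloc_dom0() below), apparently so that dom0's
 * metaphysical addresses equal machine addresses for DMA purposes.
 */
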
// initialized by arch/ia64/setup.c:find_initrd()
unsigned long initrd_start = 0, initrd_end = 0;

extern unsigned long running_on_sim;

#define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))

//extern int loadelfimage(char *);
extern int readelfimage_base_and_size(char *, unsigned long,
                unsigned long *, unsigned long *, unsigned long *);

unsigned long map_domain_page0(struct domain *);
extern unsigned long dom_fw_setup(struct domain *, char *, int);

/* this belongs in include/asm, but there doesn't seem to be a suitable place */
void free_perdomain_pt(struct domain *d)
{
    printf("free_perdomain_pt: not implemented\n");
    //free_page((unsigned long)d->mm.perdomain_pt);
}

static void default_idle(void)
{
    int cpu = smp_processor_id();
    local_irq_disable();
    if ( !softirq_pending(cpu) )
        safe_halt();
    local_irq_enable();
}

static void continue_cpu_idle_loop(void)
{
    int cpu = smp_processor_id();
    for ( ; ; )
    {
        printf ("idle%dD\n", cpu);
#ifdef IA64
//      __IRQ_STAT(cpu, idle_timestamp) = jiffies
#else
        irq_stat[cpu].idle_timestamp = jiffies;
#endif
        while ( !softirq_pending(cpu) )
            default_idle();
        add_preempt_count(SOFTIRQ_OFFSET);
        raise_softirq(SCHEDULE_SOFTIRQ);
        do_softirq();
        sub_preempt_count(SOFTIRQ_OFFSET);
    }
}

void startup_cpu_idle_loop(void)
{
    int cpu = smp_processor_id ();
    /* Just some sanity to ensure that the scheduler is set up okay. */
    ASSERT(current->domain == IDLE_DOMAIN_ID);
    printf ("idle%dA\n", cpu);
    raise_softirq(SCHEDULE_SOFTIRQ);
#if 0 /* All this work is done within continue_cpu_idle_loop */
    printf ("idle%dB\n", cpu);
    asm volatile ("mov ar.k2=r0");
    do_softirq();
    printf ("idle%dC\n", cpu);

    /*
     * Declares CPU setup done to the boot processor.
     * Therefore memory barrier to ensure state is visible.
     */
    smp_mb();
#endif
#if 0
//do we have to ensure the idle task has a shared page so that, for example,
//region registers can be loaded from it. Apparently not...
    idle0_task.shared_info = (void *)alloc_xenheap_page();
    memset(idle0_task.shared_info, 0, PAGE_SIZE);
    /* pin mapping */
    // FIXME: Does this belong here? Or do only at domain switch time?
    {
        /* WARNING: following must be inlined to avoid nested fault */
        unsigned long psr = ia64_clear_ic();
        ia64_itr(0x2, IA64_TR_SHARED_INFO, SHAREDINFO_ADDR,
                 pte_val(pfn_pte(ia64_tpa(idle0_task.shared_info) >> PAGE_SHIFT, PAGE_KERNEL)),
                 PAGE_SHIFT);
        ia64_set_psr(psr);
        ia64_srlz_i();
    }
#endif

    continue_cpu_idle_loop();
}

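/*
 * The vcpu struct and its kernel stack appear to share one xenheap
 * allocation of order KERNEL_STACK_SIZE_ORDER: the struct sits at the base
 * and the stack lives in the rest of the region, which is why
 * free_vcpu_struct() below frees with the same order.
 */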
struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
        return NULL;

    memset(v, 0, sizeof(*v));
    memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
    v->arch.privregs =
        alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
    if (v->arch.privregs == NULL) {  /* don't memset a failed allocation */
        free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
        return NULL;
    }
    printf("arch_vcpu_info=%p\n", v->arch.privregs);
    memset(v->arch.privregs, 0, PAGE_SIZE);

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
}

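/*
 * Lay out the top of the kernel stack in the usual ia64 fashion: a
 * struct pt_regs at the very top, with a struct switch_stack just below it;
 * ksp points just under the switch_stack, and the register backing store
 * (ar.bspstore) grows upward from v + IA64_RBS_OFFSET.
 */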
static void init_switch_stack(struct vcpu *v)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct switch_stack *sw = (struct switch_stack *) regs - 1;
    extern void ia64_ret_from_clone;

    memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
    sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
    sw->b0 = (unsigned long) &ia64_ret_from_clone;
    sw->ar_fpsr = FPSR_DEFAULT;
    v->arch._thread.ksp = (unsigned long) sw - 16;
    // stay on kernel stack because may get interrupts!
    // ia64_ret_from_clone (which b0 gets in new_thread) switches
    // to user stack
    v->arch._thread.on_ustack = 0;
    memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
}

void arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;
    struct thread_info *ti = alloc_thread_info(v);

    /* Clear thread_info to clear some important fields, like preempt_count */
    memset(ti, 0, sizeof(struct thread_info));
    init_switch_stack(v);

    d->shared_info = (void *)alloc_xenheap_page();
    if (!d->shared_info) {
        printk("ERROR/HALTING: CAN'T ALLOC PAGE\n");
        while (1);
    }
    memset(d->shared_info, 0, PAGE_SIZE);
    if (v == d->vcpu[0])
        memset(&d->shared_info->evtchn_mask[0], 0xff,
               sizeof(d->shared_info->evtchn_mask));
#if 0
    d->vcpu[0].arch.privregs =
        alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
    printf("arch_vcpu_info=%p\n", d->vcpu[0].arch.privregs);
    memset(d->vcpu.arch.privregs, 0, PAGE_SIZE);
#endif
    v->vcpu_info = &(d->shared_info->vcpu_info[0]);

    d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME

    /* We may also need an emulation rid for region4, though it's unlikely
     * to see a guest issue uncacheable accesses in metaphysical mode. But
     * keeping such info here may be more sane.
     */
    if (((d->arch.metaphysical_rr0 = allocate_metaphysical_rr()) == -1UL)
     || ((d->arch.metaphysical_rr4 = allocate_metaphysical_rr()) == -1UL))
        BUG();
//  VCPU(v, metaphysical_mode) = 1;
    v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
    v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
    v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
    v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
#define DOMAIN_RID_BITS_DEFAULT 18
    if (!allocate_rid_range(d,DOMAIN_RID_BITS_DEFAULT)) // FIXME
        BUG();
    v->arch.starting_rid = d->arch.starting_rid;
    v->arch.ending_rid = d->arch.ending_rid;
    // the following will eventually need to be negotiated dynamically
    d->xen_vastart = XEN_START_ADDR;
    d->xen_vaend = XEN_END_ADDR;
    d->shared_info_va = SHAREDINFO_ADDR;
    d->arch.breakimm = 0x1000;
    v->arch.breakimm = d->arch.breakimm;

    d->arch.sys_pgnr = 0;
    if (d->domain_id != IDLE_DOMAIN_ID) {
        d->arch.mm = xmalloc(struct mm_struct);
        if (unlikely(!d->arch.mm)) {
            printk("Can't allocate mm_struct for domain %d\n",d->domain_id);
            return;  /* can't return -ENOMEM from a void function */
        }
        memset(d->arch.mm, 0, sizeof(*d->arch.mm));
        d->arch.mm->pgd = pgd_alloc(d->arch.mm);
        if (unlikely(!d->arch.mm->pgd)) {
            printk("Can't allocate pgd for domain %d\n",d->domain_id);
            return;  /* ditto */
        }
    } else
        d->arch.mm = NULL;
    printf ("arch_do_createdomain: domain=%p\n", d);
}

void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);

    printf("arch_getdomaininfo_ctxt\n");
    c->regs = *regs;
    c->vcpu.evtchn_vector = v->vcpu_info->arch.evtchn_vector;

    c->shared = v->domain->shared_info->arch;
}

int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
{
    struct pt_regs *regs = vcpu_regs (v);
    struct domain *d = v->domain;
    int i, rc, ret;
    unsigned long progress = 0;

    printf("arch_set_info_guest\n");
    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    if (c->flags & VGCF_VMX_GUEST) {
        if (!vmx_enabled) {
            printk("No VMX hardware feature for vmx domain.\n");
            return -EINVAL;
        }

        if (v == d->vcpu[0])
            vmx_setup_platform(d, c);

        vmx_final_setup_guest(v);
    }

    *regs = c->regs;
    d->arch.sys_pgnr = c->sys_pgnr;
    d->arch.initrd_start = c->initrd.start;
    d->arch.initrd_len = c->initrd.size;
    d->arch.cmdline = c->cmdline;
    new_thread(v, regs->cr_iip, 0, 0);

#ifdef CONFIG_IA64_SPLIT_CACHE
    /* Sync d/i cache conservatively */
    if (!running_on_sim) {
        ret = ia64_pal_cache_flush(4, 0, &progress, NULL);
        if ((ret != PAL_STATUS_SUCCESS) && (ret != PAL_STATUS_UNIMPLEMENTED))
            printk("PAL CACHE FLUSH failed for dom0.\n");
        else
            printk("Sync i/d cache for guest SUCC\n");
    }
#endif
    v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector;
    if ( c->vcpu.privregs && copy_from_user(v->arch.privregs,
               c->vcpu.privregs, sizeof(mapped_regs_t))) {
        printk("Bad ctxt address in arch_set_info_guest: 0x%lx\n",
               (unsigned long)c->vcpu.privregs);
        return -EFAULT;
    }

    v->arch.domain_itm_last = -1L;
    d->shared_info->arch = c->shared;

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);
    return 0;
}

void domain_relinquish_resources(struct domain *d)
{
    /* FIXME */
    printf("domain_relinquish_resources: not implemented\n");
}

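/*
 * Build the initial register state so the vcpu begins execution at
 * start_pc.  For dom0 (and for VMX domains) r28 is seeded with the
 * firmware descriptor built by dom_fw_setup(); paravirtualized domains
 * are started at privilege level 2.
 */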
// heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
// and linux/arch/ia64/kernel/process.c:kernel_thread()
void new_thread(struct vcpu *v,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct domain *d = v->domain;
    struct pt_regs *regs;
    struct ia64_boot_param *bp;
    extern char saved_command_line[];

#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    if (d == dom0) start_pc += dom0_start;
#endif

    regs = vcpu_regs (v);
    if (VMX_DOMAIN(v)) {
        /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
        regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
    } else {
        /* parenthesized so the clear-mask applies to the whole value,
           not just to IA64_PSR_BN ('&' binds tighter than '|') */
        regs->cr_ipsr = (ia64_getreg(_IA64_REG_PSR)
              | IA64_PSR_BITS_TO_SET | IA64_PSR_BN)
              & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS);
        regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
    }
    regs->cr_iip = start_pc;
    regs->cr_ifs = 1UL << 63; /* or clear? */
    regs->ar_fpsr = FPSR_DEFAULT;

    if (VMX_DOMAIN(v)) {
        vmx_init_all_rr(v);
        if (d == dom0)
//          VCPU(v,vgr[12]) = dom_fw_setup(d,saved_command_line,256L);
            regs->r28 = dom_fw_setup(d,saved_command_line,256L);
        /* Virtual processor context setup */
        VCPU(v, vpsr) = IA64_PSR_BN;
        VCPU(v, dcr) = 0;
    } else {
        init_all_rr(v);
        if (d == dom0)
            regs->r28 = dom_fw_setup(d,saved_command_line,256L);
        else {
            regs->ar_rsc |= (2 << 2); /* force PL2/3 */
            if (*d->arch.cmdline == '\0') {
#define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
                regs->r28 = dom_fw_setup(d,DEFAULT_CMDLINE,256L);
                printf("domU command line defaulted to: "
                       DEFAULT_CMDLINE "\n");
            }
            else regs->r28 = dom_fw_setup(d,d->arch.cmdline,256L);
        }
        VCPU(v, banknum) = 1;
        VCPU(v, metaphysical_mode) = 1;
        d->shared_info->arch.flags = (d == dom0) ? (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
    }
}

static struct page * map_new_domain0_page(unsigned long mpaddr)
{
    if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
        printk("map_new_domain0_page: bad domain0 mpaddr %p!\n",mpaddr);
        printk("map_new_domain0_page: start=%p,end=%p!\n",dom0_start,dom0_start+dom0_size);
        while(1);
    }
    return pfn_to_page((mpaddr >> PAGE_SHIFT));
}

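/*
 * The metaphysical address space is backed by an ordinary Linux-style
 * four-level page table hung off d->arch.mm: each mapping function below
 * walks pgd -> pud -> pmd -> pte, populating any missing intermediate
 * level on the way down.
 */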
/* allocate new page for domain and map it to the specified metaphysical addr */
struct page * map_new_domain_page(struct domain *d, unsigned long mpaddr)
{
    struct mm_struct *mm = d->arch.mm;
    struct page *p = (struct page *)0;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    extern unsigned long vhpt_paddr, vhpt_pend;

    if (!mm->pgd) {
        printk("map_new_domain_page: domain pgd must exist!\n");
        return(p);
    }
    pgd = pgd_offset(mm,mpaddr);
    if (pgd_none(*pgd))
        pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));

    pud = pud_offset(pgd, mpaddr);
    if (pud_none(*pud))
        pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));

    pmd = pmd_offset(pud, mpaddr);
    if (pmd_none(*pmd))
        pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
//      pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));

    pte = pte_offset_map(pmd, mpaddr);
    if (pte_none(*pte)) {
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
        if (d == dom0) p = map_new_domain0_page(mpaddr);
        else
#endif
        {
            p = alloc_domheap_page(d);
            if (unlikely(!p)) {  /* check before touching the page */
                printf("map_new_domain_page: Can't alloc!!!! Aaaargh!\n");
                return(p);
            }
            // zero out pages for security reasons
            memset(__va(page_to_phys(p)),0,PAGE_SIZE);
        }
        if (unlikely(page_to_phys(p) > vhpt_paddr && page_to_phys(p) < vhpt_pend)) {
            printf("map_new_domain_page: reassigned vhpt page %p!!\n",page_to_phys(p));
        }
        set_pte(pte, pfn_pte(page_to_phys(p) >> PAGE_SHIFT,
                __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
    }
    else printk("map_new_domain_page: mpaddr %lx already mapped!\n",mpaddr);
    return p;
}

/* map a physical address to the specified metaphysical addr */
void map_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr)
{
    struct mm_struct *mm = d->arch.mm;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (!mm->pgd) {
        printk("map_domain_page: domain pgd must exist!\n");
        return;
    }
    pgd = pgd_offset(mm,mpaddr);
    if (pgd_none(*pgd))
        pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));

    pud = pud_offset(pgd, mpaddr);
    if (pud_none(*pud))
        pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));

    pmd = pmd_offset(pud, mpaddr);
    if (pmd_none(*pmd))
        pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
//      pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));

    pte = pte_offset_map(pmd, mpaddr);
    if (pte_none(*pte)) {
        set_pte(pte, pfn_pte(physaddr >> PAGE_SHIFT,
                __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
    }
    else printk("map_domain_page: mpaddr %lx already mapped!\n",mpaddr);
}

#if 0
/* map a physical address with specified I/O flag */
void map_domain_io_page(struct domain *d, unsigned long mpaddr, unsigned long flags)
{
    struct mm_struct *mm = d->arch.mm;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    pte_t io_pte;

    if (!mm->pgd) {
        printk("map_domain_io_page: domain pgd must exist!\n");
        return;
    }
    ASSERT(flags & GPFN_IO_MASK);

    pgd = pgd_offset(mm,mpaddr);
    if (pgd_none(*pgd))
        pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));

    pud = pud_offset(pgd, mpaddr);
    if (pud_none(*pud))
        pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));

    pmd = pmd_offset(pud, mpaddr);
    if (pmd_none(*pmd))
        pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
//      pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));

    pte = pte_offset_map(pmd, mpaddr);
    if (pte_none(*pte)) {
        pte_val(io_pte) = flags;
        set_pte(pte, io_pte);
    }
    else printk("map_domain_io_page: mpaddr %lx already mapped!\n",mpaddr);
}
#endif

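/* debug hook: turn on privop tracing when a magic metaphysical address is touched */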
void mpafoo(unsigned long mpaddr)
{
    extern unsigned long privop_trace;
    if (mpaddr == 0x3800)
        privop_trace = 1;
}

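/*
 * Translate a metaphysical address to a pte value (and thus a machine
 * address).  A legal-but-unmapped address is populated on demand via
 * map_new_domain_page(); contiguous dom0 is translated arithmetically,
 * with no table walk.
 */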
unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
{
    struct mm_struct *mm = d->arch.mm;
    pgd_t *pgd = pgd_offset(mm, mpaddr);
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    if (d == dom0) {
        if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
            //printk("lookup_domain_mpa: bad dom0 mpaddr %p!\n",mpaddr);
            //printk("lookup_domain_mpa: start=%p,end=%p!\n",dom0_start,dom0_start+dom0_size);
            mpafoo(mpaddr);
        }
        pte_t pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
            __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
        pte = &pteval;
        return *(unsigned long *)pte;
    }
#endif
tryagain:
    if (pgd_present(*pgd)) {
        pud = pud_offset(pgd,mpaddr);
        if (pud_present(*pud)) {
            pmd = pmd_offset(pud,mpaddr);
            if (pmd_present(*pmd)) {
                pte = pte_offset_map(pmd,mpaddr);
                if (pte_present(*pte)) {
                    //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
                    return *(unsigned long *)pte;
                } else if (VMX_DOMAIN(d->vcpu[0]))
                    return GPFN_INV_MASK;
            }
        }
    }
    /* if lookup fails and mpaddr is "legal", "create" the page */
    if ((mpaddr >> PAGE_SHIFT) < d->max_pages) {
        if (map_new_domain_page(d,mpaddr)) goto tryagain;
    }
    printk("lookup_domain_mpa: bad mpa %p (> %p)\n",
           mpaddr,d->max_pages<<PAGE_SHIFT);
    mpafoo(mpaddr);
    return 0;
}

// FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
#if 1
unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
{
    unsigned long pte = lookup_domain_mpa(d,mpaddr);
    unsigned long imva;

    pte &= _PAGE_PPN_MASK;
    imva = __va(pte);
    imva |= mpaddr & ~PAGE_MASK;
    return(imva);
}
#else
unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
{
    unsigned long imva = __gpa_to_mpa(d, mpaddr);

    return __va(imva);
}
#endif

// remove following line if not privifying in memory
//#define HAVE_PRIVIFY_MEMORY
#ifndef HAVE_PRIVIFY_MEMORY
#define privify_memory(x,y) do {} while(0)
#endif

// see arch/x86/xxx/domain_build.c
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    return (IS_ELF(*ehdr));
}

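/*
 * Copy from an image that may live either in Xen's own address space or
 * in dom0 user space; in the latter case, loop until __copy_from_user()
 * has copied everything, retrying the uncopied tail.
 */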
static void copy_memory(void *dst, void *src, int size)
{
    int remain;

    if (IS_XEN_ADDRESS(dom0,(unsigned long)src)) {
        memcpy(dst,src,size);
    }
    else {
        printf("About to call __copy_from_user(%p,%p,%d)\n",
               dst,src,size);
        while ((remain = __copy_from_user(dst,src,size)) != 0) {
            printf("incomplete user copy, %d remain of %d\n",
                   remain,size);
            dst += size - remain; src += size - remain;
            size -= remain;
        }
    }
}

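/*
 * Load every PT_LOAD segment of the ELF image into the domain: for
 * contiguous dom0, one straight copy into its reserved chunk; for other
 * domains, page by page, mapping each metaphysical page first and
 * zero-filling whatever memsz extends beyond filesz.
 */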
void loaddomainelfimage(struct domain *d, unsigned long image_start)
{
    char *elfbase = (char *)image_start;
    //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
    Elf_Ehdr ehdr;
    Elf_Phdr phdr;
    int h, filesz, memsz;
    unsigned long elfaddr, dom_mpaddr, dom_imva;
    struct page *p;
    unsigned long pteval;

    copy_memory(&ehdr,(void *)image_start,sizeof(Elf_Ehdr));
    for ( h = 0; h < ehdr.e_phnum; h++ ) {
        copy_memory(&phdr,elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
                    sizeof(Elf_Phdr));
        //if ( !is_loadable_phdr(phdr) )
        if ((phdr.p_type != PT_LOAD)) {
            continue;
        }
        filesz = phdr.p_filesz; memsz = phdr.p_memsz;
        elfaddr = (unsigned long)elfbase + phdr.p_offset;
        dom_mpaddr = phdr.p_paddr;
        //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
        if (d == dom0) {
            if (dom_mpaddr+memsz>dom0_size || dom_mpaddr+filesz>dom0_size) {
                printf("Domain0 doesn't fit in allocated space!\n");
                while(1);
            }
            dom_imva = __va(dom_mpaddr + dom0_start);
            copy_memory((void *)dom_imva,(void *)elfaddr,filesz);
            if (memsz > filesz) memset((void *)(dom_imva+filesz),0,memsz-filesz);
            //FIXME: This test for code seems to find a lot more than objdump -x does
            if (phdr.p_flags & PF_X) privify_memory(dom_imva,filesz);
        }
        else
#endif
        while (memsz > 0) {
#ifdef DOMU_AUTO_RESTART
            pteval = lookup_domain_mpa(d,dom_mpaddr);
            if (pteval) dom_imva = __va(pteval & _PFN_MASK);
            else { printf("loaddomainelfimage: BAD!\n"); while(1); }
#else
            p = map_new_domain_page(d,dom_mpaddr);
            if (unlikely(!p)) BUG();
            dom_imva = __va(page_to_phys(p));
#endif
            if (filesz > 0) {
                if (filesz >= PAGE_SIZE)
                    copy_memory((void *)dom_imva,(void *)elfaddr,PAGE_SIZE);
                else { // copy partial page, zero the rest of page
                    copy_memory((void *)dom_imva,(void *)elfaddr,filesz);
                    memset((void *)(dom_imva+filesz),0,PAGE_SIZE-filesz);
                }
                //FIXME: This test for code seems to find a lot more than objdump -x does
                if (phdr.p_flags & PF_X)
                    privify_memory(dom_imva,PAGE_SIZE);
            }
            else if (memsz > 0) // always zero out entire page
                memset((void *)dom_imva,0,PAGE_SIZE);
            memsz -= PAGE_SIZE; filesz -= PAGE_SIZE;
            elfaddr += PAGE_SIZE; dom_mpaddr += PAGE_SIZE;
        }
    }
}

int
parsedomainelfimage(char *elfbase, unsigned long elfsize, unsigned long *entry)
{
    Elf_Ehdr ehdr;

    copy_memory(&ehdr,elfbase,sizeof(Elf_Ehdr));

    if ( !elf_sanity_check(&ehdr) ) {
        printk("ELF sanity check failed.\n");
        return -EINVAL;
    }

    if ( (ehdr.e_phoff + (ehdr.e_phnum * ehdr.e_phentsize)) > elfsize )
    {
        printk("ELF program headers extend beyond end of image.\n");
        return -EINVAL;
    }

    if ( (ehdr.e_shoff + (ehdr.e_shnum * ehdr.e_shentsize)) > elfsize )
    {
        printk("ELF section headers extend beyond end of image.\n");
        return -EINVAL;
    }

#if 0
    /* Find the section-header strings table. */
    if ( ehdr.e_shstrndx == SHN_UNDEF )
    {
        printk("ELF image has no section-header strings table (shstrtab).\n");
        return -EINVAL;
    }
#endif

    *entry = ehdr.e_entry;
    printf("parsedomainelfimage: entry point = %p\n",*entry);

    return 0;
}

void alloc_dom0(void)
{
#ifdef CONFIG_DOMAIN0_CONTIGUOUS
    if (platform_is_hp_ski()) {
        dom0_size = 128*1024*1024; //FIXME: Should be configurable
    }
    printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));

    /* FIXME: The first trunk (say 256M) should always be assigned to
     * Dom0, since Dom0's physical == machine address for DMA purposes.
     * Some old versions of Linux, like 2.4, assume physical memory exists
     * in the 2nd 64M space.
     */
    dom0_start = alloc_boot_pages(
        dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
    dom0_start <<= PAGE_SHIFT;
    if (!dom0_start) {
        printf("alloc_dom0: can't allocate contiguous memory size=%p\n",
               dom0_size);
        while(1);
    }
    printf("alloc_dom0: dom0_start=%p\n",dom0_start);
#else
    dom0_start = 0;
#endif
}

/*
 * Domain 0 has direct access to all devices.  The main point of this stub,
 * however, is to let alloc_dom_mem handle order > 0 requests: dom0 needs
 * this bit set in order to allocate memory for other domains.
 */
void physdev_init_dom0(struct domain *d)
{
    set_bit(_DOMF_physdev_access, &d->domain_flags);
}

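/*
 * Rough sequence for building dom0, all within the contiguous chunk
 * reserved by alloc_dom0(): parse the ELF image, copy any initrd to just
 * below the top of the chunk (leaving 4MB above it), load the kernel,
 * publish a start_info page, then prime vcpu0 via new_thread().
 */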
unsigned int vmx_dom0 = 0;
int construct_dom0(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pt_pages;
    unsigned long count;
    unsigned long alloc_start, alloc_end;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];

    struct domain_setup_info dsi;
    unsigned long p_start;
    unsigned long pkern_start;
    unsigned long pkern_entry;
    unsigned long pkern_end;
    unsigned long pinitrd_start = 0;
    unsigned long ret, progress = 0;

    //printf("construct_dom0: starting\n");

#ifndef CLONE_DOMAIN0
    /* Sanity! */
    BUG_ON(d != dom0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
#endif

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    alloc_start = dom0_start;
    alloc_end = dom0_start + dom0_size;
    d->tot_pages = d->max_pages = dom0_size/PAGE_SIZE;
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;
    rc = parseelfimage(&dsi);
    if ( rc != 0 )
        return rc;

#ifdef VALIDATE_VT
    /* Temp workaround */
    if (running_on_sim)
        dsi.xen_section_string = (char *)1;

    /* Check whether dom0 is a vti domain */
    if ((!vmx_enabled) && !dsi.xen_section_string) {
        printk("Lack of hardware support for unmodified vmx dom0\n");
        panic("");
    }

    if (vmx_enabled && !dsi.xen_section_string) {
        printk("Dom0 is vmx domain!\n");
        vmx_dom0 = 1;
    }
#endif

    p_start = dsi.v_start;
    pkern_start = dsi.v_kernstart;
    pkern_end = dsi.v_kernend;
    pkern_entry = dsi.v_kernentry;

    //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);

    if ( (p_start & (PAGE_SIZE-1)) != 0 )
    {
        printk("Initial guest OS must load to a page boundary.\n");
        return -EINVAL;
    }

    if (initrd_start && initrd_len) {
        pinitrd_start = (dom0_start+dom0_size) -
                        (PAGE_ALIGN(initrd_len) + 4*1024*1024);

        memcpy(__va(pinitrd_start),(void *)initrd_start,initrd_len);
    }

    printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
           " Kernel image:  %lx->%lx\n"
           " Entry address: %lx\n"
           " Init. ramdisk: %lx len %lx\n",
           pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len);

    if ( (pkern_end - pkern_start) > (d->max_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (pkern_end-pkern_start)>>20, (d->max_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    // if high 3 bits of pkern start are non-zero, error

    // if pkern end is after end of metaphysical memory, error
    // (we should be able to deal with this... later)

#if 0
    strcpy(d->name,"Domain0");
#endif

    /* Mask all upcalls... */
    for ( i = 1; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

#ifdef VALIDATE_VT
    /* Construct a frame-allocation list for the initial domain, since these
     * pages are allocated by the boot allocator and pfns are not set properly
     */
    for ( mfn = (alloc_start>>PAGE_SHIFT);
          mfn < (alloc_end>>PAGE_SHIFT);
          mfn++ )
    {
        page = &frame_table[mfn];
        page_set_owner(page, d);
        page->u.inuse.type_info = 0;
        page->count_info = PGC_allocated | 1;
        list_add_tail(&page->list, &d->page_list);

        /* Construct 1:1 mapping */
        machine_to_phys_mapping[mfn] = mfn;
    }
#endif

    /* Copy the OS image. */
    loaddomainelfimage(d,image_start);

    /* Copy the initial ramdisk. */
    //if ( initrd_len != 0 )
    //    memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    si = (start_info_t *)alloc_xenheap_page();
    memset(si, 0, PAGE_SIZE);
    d->shared_info->arch.start_info_pfn = __pa(si) >> PAGE_SHIFT;
    sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);

#if 0
    si->nr_pages = d->tot_pages;
    si->shared_info = virt_to_phys(d->shared_info);
    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    //si->pt_base = vpt_start;
    //si->nr_pt_frames = nr_pt_pages;
    //si->mfn_list = vphysmap_start;

    if ( initrd_len != 0 )
    {
        //si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%08lx\n",
               si->mod_len, si->mod_start);
    }

    dst = si->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' )
                break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    zap_low_mappings(); /* Do the same for the idle page tables. */
#endif

    /* Give up the VGA console if DOM0 is configured to grab it. */
    if (cmdline != NULL)
        console_endboot(strstr(cmdline, "tty0") != NULL);

    /* VMX specific construction for Dom0, if hardware supports VMX
     * and Dom0 is an unmodified image
     */
    printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
    if (vmx_dom0)
        vmx_final_setup_guest(v);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, pkern_entry, 0, 0);
    physdev_init_dom0(d);
#ifdef CONFIG_IA64_SPLIT_CACHE
    /* Sync d/i cache conservatively */
    if (!running_on_sim) {
        ret = ia64_pal_cache_flush(4, 0, &progress, NULL);
        if ((ret != PAL_STATUS_SUCCESS) && (ret != PAL_STATUS_UNIMPLEMENTED))
            printk("PAL CACHE FLUSH failed for dom0.\n");
        else
            printk("Sync i/d cache for guest SUCC\n");
    }
#endif

    // FIXME: Hack for keyboard input
#ifdef CLONE_DOMAIN0
    if (d == dom0)
#endif
    serial_input_init();
    if (d == dom0) {
        VCPU(v, delivery_mask[0]) = -1L;
        VCPU(v, delivery_mask[1]) = -1L;
        VCPU(v, delivery_mask[2]) = -1L;
        VCPU(v, delivery_mask[3]) = -1L;
    }
    else __set_bit(0x30, VCPU(v, delivery_mask));

    return 0;
}

// FIXME: When dom0 can construct domains, this goes away (or is rewritten)
int construct_domU(struct domain *d,
                   unsigned long image_start, unsigned long image_len,
                   unsigned long initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc;
    struct vcpu *v = d->vcpu[0];
    unsigned long pkern_entry;

#ifndef DOMU_AUTO_RESTART
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
#endif

    printk("*** LOADING DOMAIN %d ***\n",d->domain_id);

    d->max_pages = dom0_size/PAGE_SIZE; // FIXME: use dom0 size
    // FIXME: use domain0 command line
    rc = parsedomainelfimage((char *)image_start, image_len, &pkern_entry);
    printk("parsedomainelfimage returns %d\n",rc);
    if ( rc != 0 ) return rc;

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    /* Copy the OS image. */
    printk("calling loaddomainelfimage(%p,%p)\n",d,image_start);
    loaddomainelfimage(d,image_start);
    printk("loaddomainelfimage returns\n");

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    printk("calling new_thread, entry=%p\n",pkern_entry);
#ifdef DOMU_AUTO_RESTART
    v->domain->arch.image_start = image_start;
    v->domain->arch.image_len = image_len;
    v->domain->arch.entry = pkern_entry;
#endif
    new_thread(v, pkern_entry, 0, 0);
    printk("new_thread returns\n");
#ifdef CONFIG_IA64_SPLIT_CACHE
    /* Sync d/i cache conservatively */
    if (!running_on_sim) {
        unsigned long ret, progress = 0;
        ret = ia64_pal_cache_flush(4, 0, &progress, NULL);
        if ((ret != PAL_STATUS_SUCCESS) && (ret != PAL_STATUS_UNIMPLEMENTED))
            printk("PAL CACHE FLUSH failed for domU.\n");
        else
            printk("Sync i/d cache for guest SUCC\n");
    }
#endif
    __set_bit(0x30, VCPU(v, delivery_mask));

    return 0;
}

#ifdef DOMU_AUTO_RESTART
void reconstruct_domU(struct vcpu *v)
{
    /* re-copy the OS image to reset data values to original */
    printk("reconstruct_domU: restarting domain %d...\n",
           v->domain->domain_id);
    loaddomainelfimage(v->domain,v->domain->arch.image_start);
    new_thread(v, v->domain->arch.entry, 0, 0);
#ifdef CONFIG_IA64_SPLIT_CACHE
    /* Sync d/i cache conservatively */
    if (!running_on_sim) {
        unsigned long ret, progress = 0;
        ret = ia64_pal_cache_flush(4, 0, &progress, NULL);
        if ((ret != PAL_STATUS_SUCCESS) && (ret != PAL_STATUS_UNIMPLEMENTED))
            printk("PAL CACHE FLUSH failed for domU.\n");
        else
            printk("Sync i/d cache for guest SUCC\n");
    }
#endif
}
#endif

void machine_restart(char * __unused)
{
    if (platform_is_hp_ski()) dummy();
    printf("machine_restart called: spinning....\n");
    while(1);
}

void machine_halt(void)
{
    if (platform_is_hp_ski()) dummy();
    printf("machine_halt called: spinning....\n");
    while(1);
}

void dummy_called(char *function)
{
    if (platform_is_hp_ski()) asm("break 0;;");
    printf("dummy called in %s: spinning....\n", function);
    while(1);
}

#if 0
void switch_to(struct vcpu *prev, struct vcpu *next)
{
    struct vcpu *last;

    __switch_to(prev,next,last);
    //set_current(next);
}
#endif

void domain_pend_keyboard_interrupt(int irq)
{
    vcpu_pend_interrupt(dom0->vcpu[0],irq);
}

void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
{
    if ( v->processor == newcpu )
        return;

    set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    v->processor = newcpu;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    ia64_save_fpu(v->arch._thread.fph);
    if (VMX_DOMAIN(v))
        vmx_save_state(v);
    else {
        if (IA64_HAS_EXTRA_STATE(v))
            ia64_save_extra(v);
    }
    // FIXME SMP: Anything else needed here for SMP?
}

// FIXME: It would be nice to print out a nice error message for bad
// values of these boot-time parameters, but it seems we are too early
// in the boot, and attempts to print appear to freeze the system.
#define abort(x...) do {} while(0)
#define warn(x...)  do {} while(0)

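/*
 * Handlers for the "dom0_mem=" and "dom0_align=" boot options (e.g.
 * "dom0_mem=256M dom0_align=64M" on the Xen command line); custom_param()
 * registers each handler against its option name.
 */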
static void parse_dom0_mem(char *s)
{
    unsigned long bytes = parse_size_and_unit(s);

    /* sanity-check the newly parsed value */
    if (bytes < 4 * 1024 * 1024) {
        abort("parse_dom0_mem: too small, boot aborted"
              " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
    }
    if (bytes % dom0_align) {
        dom0_size = ((bytes / dom0_align) + 1) * dom0_align;
        warn("parse_dom0_mem: dom0_size rounded up from"
             " %lx to %lx bytes, due to dom0_align=%lx\n",
             bytes,dom0_size,dom0_align);
    }
    else dom0_size = bytes;
}
custom_param("dom0_mem", parse_dom0_mem);

static void parse_dom0_align(char *s)
{
    unsigned long bytes = parse_size_and_unit(s);

    if (bytes & (bytes - 1)) { /* nonzero iff not a power of two */
        abort("parse_dom0_align: dom0_align must be power of two, "
              "boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
    }
    else if (bytes < PAGE_SIZE) {
        abort("parse_dom0_align: dom0_align must be >= %ld, "
              "boot aborted"
              " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
              PAGE_SIZE);
    }
    else dom0_align = bytes;
    if (dom0_size % dom0_align) {
        dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
        warn("parse_dom0_align: dom0_size rounded up from"
             " %ld to %ld bytes, due to dom0_align=%lx\n",
             bytes,dom0_size,dom0_align);
    }
}
custom_param("dom0_align", parse_dom0_align);