ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 9405:29dfadcc5029

[IA64] Followup to xen time cleanup

Clean up the xen time handler. Tristan had #if 0'd some code because it
seemed redundant; that code is actually the problematic logic behind an
intermittent timer oops in dom0, so delete it outright now.

Also remove the vcpu_wake call, since waking the currently running vcpu
accomplishes nothing meaningful and simply wastes CPU cycles.
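
As a minimal illustration of the point (the handler below is a hypothetical
sketch, not the actual xentime.c code): the timer interrupt already executes
on behalf of the currently running vcpu, so that vcpu is by definition awake,
and raising the schedule softirq is all the handler needs to do.

    /* hypothetical timer tick handler */
    static void timer_tick(struct vcpu *v)
    {
        /* 'v' is the vcpu currently running on this cpu, so it is already
         * awake; calling vcpu_wake(v) here would only burn cycles. */
        raise_softirq(SCHEDULE_SOFTIRQ);
    }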

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
author awilliam@xenbuild.aw
date Mon Mar 27 15:32:08 2006 -0700 (2006-03-27)
parents e45666b8b05f
children 27050b1390cf
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <xen/mm.h>
22 #include <xen/iocap.h>
23 #include <asm/ptrace.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/hw_irq.h>
29 #include <asm/setup.h>
30 //#include <asm/mpspec.h>
31 #include <xen/irq.h>
32 #include <xen/event.h>
33 //#include <xen/shadow.h>
34 #include <xen/console.h>
35 #include <xen/compile.h>
37 #include <xen/elf.h>
38 //#include <asm/page.h>
39 #include <asm/pgalloc.h>
41 #include <asm/asm-offsets.h> /* for IA64_THREAD_INFO_SIZE */
43 #include <asm/vcpu.h> /* for function declarations */
44 #include <public/arch-ia64.h>
45 #include <asm/vmx.h>
46 #include <asm/vmx_vcpu.h>
47 #include <asm/vmx_vpd.h>
48 #include <asm/vmx_phy_mode.h>
49 #include <asm/pal.h>
50 #include <asm/vhpt.h>
51 #include <public/hvm/ioreq.h>
52 #include <public/arch-ia64.h>
53 #include <asm/tlbflush.h>
54 #include <asm/regionreg.h>
55 #include <asm/dom_fw.h>
57 #define CONFIG_DOMAIN0_CONTIGUOUS
58 unsigned long dom0_start = -1L;
59 unsigned long dom0_size = 512*1024*1024;
60 unsigned long dom0_align = 64*1024*1024;
62 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
63 static unsigned int dom0_max_vcpus = 1;
64 integer_param("dom0_max_vcpus", dom0_max_vcpus);
66 // initialized by arch/ia64/setup.c:find_initrd()
67 unsigned long initrd_start = 0, initrd_end = 0;
68 extern unsigned long running_on_sim;
70 #define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))
72 /* FIXME: where should these declarations live? */
73 extern void domain_pend_keyboard_interrupt(int);
74 extern long platform_is_hp_ski(void);
75 extern void sync_split_caches(void);
76 extern void serial_input_init(void);
78 static void init_switch_stack(struct vcpu *v);
80 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
81 void arch_domain_destroy(struct domain *d)
82 {
83 struct page_info *page;
84 struct list_head *ent, *prev;
86 if (d->arch.mm->pgd != NULL)
87 {
88 list_for_each ( ent, &d->arch.mm->pt_list )
89 {
90 page = list_entry(ent, struct page_info, list);
91 prev = ent->prev;
92 list_del(ent);
93 free_xenheap_page(page_to_virt(page));
94 ent = prev;
95 }
96 pgd_free(d->arch.mm->pgd);
97 }
98 if (d->arch.mm != NULL)
99 xfree(d->arch.mm);
100 if (d->shared_info != NULL)
101 free_xenheap_page(d->shared_info);
103 deallocate_rid_range(d);
105 /* Is it really a good idea to do this here? */
106 flush_tlb_all();
108 /* Is it really a good idea to do this here? */
109 vhpt_flush();
110 }
112 static void default_idle(void)
113 {
114 int cpu = smp_processor_id();
115 local_irq_disable();
116 if ( !softirq_pending(cpu))
117 safe_halt();
118 local_irq_enable();
119 }
121 static void continue_cpu_idle_loop(void)
122 {
123 int cpu = smp_processor_id();
124 for ( ; ; )
125 {
126 #ifdef IA64
127 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
128 #else
129 irq_stat[cpu].idle_timestamp = jiffies;
130 #endif
131 while ( !softirq_pending(cpu) )
132 default_idle();
133 add_preempt_count(SOFTIRQ_OFFSET);
134 raise_softirq(SCHEDULE_SOFTIRQ);
135 do_softirq();
136 sub_preempt_count(SOFTIRQ_OFFSET);
137 }
138 }
140 void startup_cpu_idle_loop(void)
141 {
142 /* Just some sanity to ensure that the scheduler is set up okay. */
143 ASSERT(current->domain == IDLE_DOMAIN_ID);
144 raise_softirq(SCHEDULE_SOFTIRQ);
145 #if 0
146 // Do we have to ensure the idle task has a shared page so that, for example,
147 // region registers can be loaded from it? Apparently not...
148 idle0_task.shared_info = (void *)alloc_xenheap_page();
149 memset(idle0_task.shared_info, 0, PAGE_SIZE);
150 /* pin mapping */
151 // FIXME: Does this belong here? Or do only at domain switch time?
152 {
153 /* WARNING: following must be inlined to avoid nested fault */
154 unsigned long psr = ia64_clear_ic();
155 ia64_itr(0x2, IA64_TR_SHARED_INFO, SHAREDINFO_ADDR,
156 pte_val(pfn_pte(ia64_tpa(idle0_task.shared_info) >> PAGE_SHIFT, PAGE_KERNEL)),
157 PAGE_SHIFT);
158 ia64_set_psr(psr);
159 ia64_srlz_i();
160 }
161 #endif
163 continue_cpu_idle_loop();
164 }
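/* Allocate and minimally initialise a vcpu structure (idle vcpu0 reuses the
 * statically allocated one); for non-idle domains this also allocates the
 * privregs area and inherits the rid range and metaphysical region register
 * values from the domain. */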
166 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
167 {
168 struct vcpu *v;
169 struct thread_info *ti;
171 /* Keep the idle vcpu0 statically allocated at compile time, because
172 * some code inherited from Linux still requires it in the early boot phase.
173 */
174 if (is_idle_domain(d) && !vcpu_id)
175 v = idle_vcpu[0];
176 else {
177 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
178 return NULL;
179 memset(v, 0, sizeof(*v));
181 ti = alloc_thread_info(v);
182 /* Clear thread_info to clear some important fields, like
183 * preempt_count
184 */
185 memset(ti, 0, sizeof(struct thread_info));
186 init_switch_stack(v);
187 }
189 if (!is_idle_domain(d)) {
190 v->arch.privregs =
191 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
192 BUG_ON(v->arch.privregs == NULL);
193 memset(v->arch.privregs, 0, PAGE_SIZE);
195 if (!vcpu_id)
196 memset(&d->shared_info->evtchn_mask[0], 0xff,
197 sizeof(d->shared_info->evtchn_mask));
199 v->vcpu_info = &(d->shared_info->vcpu_info[0]);
200 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
201 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
202 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
203 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
204 v->arch.starting_rid = d->arch.starting_rid;
205 v->arch.ending_rid = d->arch.ending_rid;
206 v->arch.breakimm = d->arch.breakimm;
207 }
209 return v;
210 }
212 void free_vcpu_struct(struct vcpu *v)
213 {
214 if (VMX_DOMAIN(v))
215 vmx_relinquish_vcpu_resources(v);
216 else {
217 if (v->arch.privregs != NULL)
218 free_xenheap_pages(v->arch.privregs, get_order(sizeof(mapped_regs_t)));
219 }
221 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
222 }
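/* Build the initial switch_stack/pt_regs frame on the vcpu's kernel stack so
 * that the first context switch to this vcpu resumes at ia64_ret_from_clone. */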
224 static void init_switch_stack(struct vcpu *v)
225 {
226 struct pt_regs *regs = vcpu_regs (v);
227 struct switch_stack *sw = (struct switch_stack *) regs - 1;
228 extern void ia64_ret_from_clone;
230 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
231 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
232 sw->b0 = (unsigned long) &ia64_ret_from_clone;
233 sw->ar_fpsr = FPSR_DEFAULT;
234 v->arch._thread.ksp = (unsigned long) sw - 16;
235 // stay on kernel stack because may get interrupts!
236 // ia64_ret_from_clone (which b0 gets in new_thread) switches
237 // to user stack
238 v->arch._thread.on_ustack = 0;
239 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
240 }
242 int arch_domain_create(struct domain *d)
243 {
244 // the following will eventually need to be negotiated dynamically
245 d->xen_vastart = XEN_START_ADDR;
246 d->xen_vaend = XEN_END_ADDR;
247 d->shared_info_va = SHAREDINFO_ADDR;
249 if (is_idle_domain(d))
250 return 0;
252 if ((d->shared_info = (void *)alloc_xenheap_page()) == NULL)
253 goto fail_nomem;
254 memset(d->shared_info, 0, PAGE_SIZE);
256 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
257 /* We may also need an emulation rid for region4, though it's unlikely
258 * that a guest will issue uncacheable accesses in metaphysical mode.
259 * Still, it may be saner to keep such info here.
260 */
261 if (((d->arch.metaphysical_rr0 = allocate_metaphysical_rr()) == -1UL)
262 || ((d->arch.metaphysical_rr4 = allocate_metaphysical_rr()) == -1UL))
263 BUG();
264 #define DOMAIN_RID_BITS_DEFAULT 18
265 if (!allocate_rid_range(d,DOMAIN_RID_BITS_DEFAULT)) // FIXME
266 BUG();
267 d->arch.breakimm = 0x1000;
268 d->arch.sys_pgnr = 0;
270 if ((d->arch.mm = xmalloc(struct mm_struct)) == NULL)
271 goto fail_nomem;
272 memset(d->arch.mm, 0, sizeof(*d->arch.mm));
273 INIT_LIST_HEAD(&d->arch.mm->pt_list);
275 if ((d->arch.mm->pgd = pgd_alloc(d->arch.mm)) == NULL)
276 goto fail_nomem;
278 printf ("arch_domain_create: domain=%p\n", d);
279 return 0;
281 fail_nomem:
282 if (d->arch.mm != NULL && d->arch.mm->pgd != NULL)
283 pgd_free(d->arch.mm->pgd);
284 if (d->arch.mm != NULL)
285 xfree(d->arch.mm);
286 if (d->shared_info != NULL)
287 free_xenheap_page(d->shared_info);
288 return -ENOMEM;
289 }
291 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
292 {
293 struct pt_regs *regs = vcpu_regs (v);
295 printf("arch_getdomaininfo_ctxt\n");
296 c->regs = *regs;
297 c->vcpu.evtchn_vector = v->vcpu_info->arch.evtchn_vector;
299 c->shared = v->domain->shared_info->arch;
300 }
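/* Apply a vcpu_guest_context supplied by the tools: copy the register state,
 * perform VMX setup when requested, record boot parameters on vcpu0, and mark
 * the vcpu as initialised so final setup is not redone. */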
302 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
303 {
304 struct pt_regs *regs = vcpu_regs (v);
305 struct domain *d = v->domain;
307 printf("arch_set_info_guest\n");
308 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
309 return 0;
310 if (c->flags & VGCF_VMX_GUEST) {
311 if (!vmx_enabled) {
312 printk("No VMX hardware feature for vmx domain.\n");
313 return -EINVAL;
314 }
316 if (v == d->vcpu[0])
317 vmx_setup_platform(d, c);
319 vmx_final_setup_guest(v);
320 }
322 *regs = c->regs;
323 if (v == d->vcpu[0]) {
324 /* Only for first vcpu. */
325 d->arch.sys_pgnr = c->sys_pgnr;
326 d->arch.initrd_start = c->initrd.start;
327 d->arch.initrd_len = c->initrd.size;
328 d->arch.cmdline = c->cmdline;
329 d->shared_info->arch = c->shared;
331 /* FIXME: is this required here? */
332 sync_split_caches();
333 }
334 new_thread(v, regs->cr_iip, 0, 0);
336 v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector;
337 if ( c->vcpu.privregs && copy_from_user(v->arch.privregs,
338 c->vcpu.privregs, sizeof(mapped_regs_t))) {
339 printk("Bad ctxt address in arch_set_info_guest: %p\n",
340 c->vcpu.privregs);
341 return -EFAULT;
342 }
344 v->arch.domain_itm_last = -1L;
346 /* Don't redo final setup */
347 set_bit(_VCPUF_initialised, &v->vcpu_flags);
348 return 0;
349 }
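/* Walk a page list and drop the type/allocation references held on each page
 * so that the pages can be freed as their reference counts fall to zero. */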
351 static void relinquish_memory(struct domain *d, struct list_head *list)
352 {
353 struct list_head *ent;
354 struct page_info *page;
355 #ifndef __ia64__
356 unsigned long x, y;
357 #endif
359 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
360 spin_lock_recursive(&d->page_alloc_lock);
361 ent = list->next;
362 while ( ent != list )
363 {
364 page = list_entry(ent, struct page_info, list);
365 /* Grab a reference to the page so it won't disappear from under us. */
366 if ( unlikely(!get_page(page, d)) )
367 {
368 /* Couldn't get a reference -- someone is freeing this page. */
369 ent = ent->next;
370 continue;
371 }
373 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
374 put_page_and_type(page);
376 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
377 put_page(page);
379 #ifndef __ia64__
380 /*
381 * Forcibly invalidate base page tables at this point to break circular
382 * 'linear page table' references. This is okay because MMU structures
383 * are not shared across domains and this domain is now dead. Thus base
384 * tables are not in use so a non-zero count means circular reference.
385 */
386 y = page->u.inuse.type_info;
387 for ( ; ; )
388 {
389 x = y;
390 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
391 (PGT_base_page_table|PGT_validated)) )
392 break;
394 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
395 if ( likely(y == x) )
396 {
397 free_page_type(page, PGT_base_page_table);
398 break;
399 }
400 }
401 #endif
403 /* Follow the list chain and /then/ potentially free the page. */
404 ent = ent->next;
405 put_page(page);
406 }
408 spin_unlock_recursive(&d->page_alloc_lock);
409 }
411 void domain_relinquish_resources(struct domain *d)
412 {
413 /* Relinquish every page of memory. */
415 /* xenheap_list is not used in ia64. */
416 BUG_ON(!list_empty(&d->xenpage_list));
418 relinquish_memory(d, &d->page_list);
419 }
421 // heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
422 // and linux/arch/ia64/kernel/process.c:kernel_thread()
423 void new_thread(struct vcpu *v,
424 unsigned long start_pc,
425 unsigned long start_stack,
426 unsigned long start_info)
427 {
428 struct domain *d = v->domain;
429 struct pt_regs *regs;
430 extern char dom0_command_line[];
432 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
433 if (d == dom0 && v->vcpu_id == 0) start_pc += dom0_start;
434 #endif
436 regs = vcpu_regs (v);
437 if (VMX_DOMAIN(v)) {
438 /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
439 regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
440 } else {
441 regs->cr_ipsr = ia64_getreg(_IA64_REG_PSR)
442 | IA64_PSR_BITS_TO_SET | IA64_PSR_BN;
443 regs->cr_ipsr &= ~(IA64_PSR_BITS_TO_CLEAR
444 | IA64_PSR_RI | IA64_PSR_IS);
445 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
446 }
447 regs->cr_iip = start_pc;
448 regs->cr_ifs = 1UL << 63; /* or clear? */
449 regs->ar_fpsr = FPSR_DEFAULT;
451 if (VMX_DOMAIN(v)) {
452 vmx_init_all_rr(v);
453 if (d == dom0)
454 regs->r28 = dom_fw_setup(d,dom0_command_line,
455 COMMAND_LINE_SIZE);
456 /* Virtual processor context setup */
457 VCPU(v, vpsr) = IA64_PSR_BN;
458 VCPU(v, dcr) = 0;
459 } else {
460 init_all_rr(v);
461 if (v->vcpu_id == 0) {
462 /* Build the firmware. */
463 if (d == dom0)
464 regs->r28 = dom_fw_setup(d,dom0_command_line,
465 COMMAND_LINE_SIZE);
466 else {
467 const char *cmdline = d->arch.cmdline;
468 int len;
470 if (*cmdline == 0) {
471 #define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
472 cmdline = DEFAULT_CMDLINE;
473 len = sizeof (DEFAULT_CMDLINE);
474 printf("domU command line defaulted to"
475 DEFAULT_CMDLINE "\n");
476 }
477 else
478 len = IA64_COMMAND_LINE_SIZE;
480 regs->r28 = dom_fw_setup (d, cmdline, len);
481 }
482 d->shared_info->arch.flags = (d == dom0) ?
483 (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
484 }
485 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
486 VCPU(v, banknum) = 1;
487 VCPU(v, metaphysical_mode) = 1;
488 }
489 }
492 /* Allocate a new page for domain and map it to the specified metaphysical
493 address. */
494 static struct page_info * assign_new_domain_page(struct domain *d, unsigned long mpaddr)
495 {
496 unsigned long maddr;
497 struct page_info *p;
499 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
500 if (d == dom0) {
501 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
502 /* FIXME: is this true?
503 dom0 memory is not contiguous! */
504 printk("assign_new_domain_page: bad domain0 "
505 "mpaddr=%lx, start=%lx, end=%lx!\n",
506 mpaddr, dom0_start, dom0_start+dom0_size);
507 while(1);
508 }
509 p = mfn_to_page((mpaddr >> PAGE_SHIFT));
510 }
511 else
512 #endif
513 {
514 p = alloc_domheap_page(d);
515 // zero out pages for security reasons
516 if (p) memset(__va(page_to_maddr(p)),0,PAGE_SIZE);
517 }
518 if (unlikely(!p)) {
519 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
520 return(p);
521 }
522 maddr = page_to_maddr (p);
523 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
524 && maddr < __get_cpu_var(vhpt_pend))) {
525 /* FIXME: how can this happen?
526 vhpt is allocated by alloc_domheap_page. */
527 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
528 maddr);
529 }
530 assign_domain_page (d, mpaddr, maddr);
531 return p;
532 }
534 /* map a physical address to the specified metaphysical addr */
535 void assign_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr)
536 {
537 struct mm_struct *mm = d->arch.mm;
538 struct page_info *pt;
539 pgd_t *pgd;
540 pud_t *pud;
541 pmd_t *pmd;
542 pte_t *pte;
544 if (!mm->pgd) {
545 printk("assign_domain_page: domain pgd must exist!\n");
546 return;
547 }
548 pgd = pgd_offset(mm,mpaddr);
549 if (pgd_none(*pgd))
550 {
551 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
552 pt = maddr_to_page(pgd_val(*pgd));
553 list_add_tail(&pt->list, &d->arch.mm->pt_list);
554 }
556 pud = pud_offset(pgd, mpaddr);
557 if (pud_none(*pud))
558 {
559 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
560 pt = maddr_to_page(pud_val(*pud));
561 list_add_tail(&pt->list, &d->arch.mm->pt_list);
562 }
564 pmd = pmd_offset(pud, mpaddr);
565 if (pmd_none(*pmd))
566 {
567 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
568 // pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));
569 pt = maddr_to_page(pmd_val(*pmd));
570 list_add_tail(&pt->list, &d->arch.mm->pt_list);
571 }
573 pte = pte_offset_map(pmd, mpaddr);
574 if (pte_none(*pte)) {
575 set_pte(pte, pfn_pte(physaddr >> PAGE_SHIFT,
576 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
577 }
578 else printk("assign_domain_page: mpaddr %lx already mapped!\n",mpaddr);
579 if((physaddr>>PAGE_SHIFT)<max_page){
580 *(mpt_table + (physaddr>>PAGE_SHIFT))=(mpaddr>>PAGE_SHIFT);
581 }
582 }
583 #if 0
584 /* map a physical address with specified I/O flag */
585 void assign_domain_io_page(struct domain *d, unsigned long mpaddr, unsigned long flags)
586 {
587 struct mm_struct *mm = d->arch.mm;
588 pgd_t *pgd;
589 pud_t *pud;
590 pmd_t *pmd;
591 pte_t *pte;
592 pte_t io_pte;
594 if (!mm->pgd) {
595 printk("assign_domain_page: domain pgd must exist!\n");
596 return;
597 }
598 ASSERT(flags & GPFN_IO_MASK);
600 pgd = pgd_offset(mm,mpaddr);
601 if (pgd_none(*pgd))
602 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
604 pud = pud_offset(pgd, mpaddr);
605 if (pud_none(*pud))
606 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
608 pmd = pmd_offset(pud, mpaddr);
609 if (pmd_none(*pmd))
610 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
611 // pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));
613 pte = pte_offset_map(pmd, mpaddr);
614 if (pte_none(*pte)) {
615 pte_val(io_pte) = flags;
616 set_pte(pte, io_pte);
617 }
618 else printk("assign_domain_page: mpaddr %lx already mapped!\n",mpaddr);
619 }
620 #endif
621 void mpafoo(unsigned long mpaddr)
622 {
623 extern unsigned long privop_trace;
624 if (mpaddr == 0x3800)
625 privop_trace = 1;
626 }
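/* Return the machine PTE value for a domain metaphysical address, walking the
 * domain page tables and allocating a new page on demand when the address is
 * legal but not yet populated. */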
628 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
629 {
630 struct mm_struct *mm = d->arch.mm;
631 pgd_t *pgd = pgd_offset(mm, mpaddr);
632 pud_t *pud;
633 pmd_t *pmd;
634 pte_t *pte;
636 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
637 if (d == dom0) {
638 pte_t pteval;
639 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
640 //printk("lookup_domain_mpa: bad dom0 mpaddr 0x%lx!\n",mpaddr);
641 //printk("lookup_domain_mpa: start=0x%lx,end=0x%lx!\n",dom0_start,dom0_start+dom0_size);
642 mpafoo(mpaddr);
643 }
644 pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
645 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
646 pte = &pteval;
647 return *(unsigned long *)pte;
648 }
649 #endif
650 tryagain:
651 if (pgd_present(*pgd)) {
652 pud = pud_offset(pgd,mpaddr);
653 if (pud_present(*pud)) {
654 pmd = pmd_offset(pud,mpaddr);
655 if (pmd_present(*pmd)) {
656 pte = pte_offset_map(pmd,mpaddr);
657 if (pte_present(*pte)) {
658 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
659 return *(unsigned long *)pte;
660 } else if (VMX_DOMAIN(d->vcpu[0]))
661 return GPFN_INV_MASK;
662 }
663 }
664 }
665 /* if lookup fails and mpaddr is "legal", "create" the page */
666 if ((mpaddr >> PAGE_SHIFT) < d->max_pages) {
667 if (assign_new_domain_page(d,mpaddr)) goto tryagain;
668 }
669 printk("lookup_domain_mpa: bad mpa 0x%lx (> 0x%lx)\n",
670 mpaddr, (unsigned long) d->max_pages<<PAGE_SHIFT);
671 mpafoo(mpaddr);
672 return 0;
673 }
675 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
676 #if 1
677 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
678 {
679 unsigned long pte = lookup_domain_mpa(d,mpaddr);
680 unsigned long imva;
682 pte &= _PAGE_PPN_MASK;
683 imva = (unsigned long) __va(pte);
684 imva |= mpaddr & ~PAGE_MASK;
685 return(imva);
686 }
687 #else
688 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
689 {
690 unsigned long imva = __gpa_to_mpa(d, mpaddr);
692 return __va(imva);
693 }
694 #endif
696 // remove following line if not privifying in memory
697 //#define HAVE_PRIVIFY_MEMORY
698 #ifndef HAVE_PRIVIFY_MEMORY
699 #define privify_memory(x,y) do {} while(0)
700 #endif
702 // see arch/x86/xxx/domain_build.c
703 int elf_sanity_check(Elf_Ehdr *ehdr)
704 {
705 return (IS_ELF(*ehdr));
706 }
708 static void copy_memory(void *dst, void *src, int size)
709 {
710 int remain;
712 if (IS_XEN_ADDRESS(dom0,(unsigned long) src)) {
713 memcpy(dst,src,size);
714 }
715 else {
716 printf("About to call __copy_from_user(%p,%p,%d)\n",
717 dst,src,size);
718 while ((remain = __copy_from_user(dst,src,size)) != 0) {
719 printf("incomplete user copy, %d remain of %d\n",
720 remain,size);
721 dst += size - remain; src += size - remain;
722 size -= remain;
723 }
724 }
725 }
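/* Copy each PT_LOAD segment of the domain's ELF image into domain memory:
 * directly for the contiguous dom0 case, otherwise page by page through
 * assign_new_domain_page(). */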
727 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
728 {
729 char *elfbase = (char *) image_start;
730 //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
731 Elf_Ehdr ehdr;
732 Elf_Phdr phdr;
733 int h, filesz, memsz;
734 unsigned long elfaddr, dom_mpaddr, dom_imva;
735 struct page_info *p;
737 copy_memory(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
738 for ( h = 0; h < ehdr.e_phnum; h++ ) {
739 copy_memory(&phdr,elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
740 sizeof(Elf_Phdr));
741 //if ( !is_loadable_phdr(phdr) )
742 if ((phdr.p_type != PT_LOAD)) {
743 continue;
744 }
745 filesz = phdr.p_filesz; memsz = phdr.p_memsz;
746 elfaddr = (unsigned long) elfbase + phdr.p_offset;
747 dom_mpaddr = phdr.p_paddr;
748 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
749 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
750 if (d == dom0) {
751 if (dom_mpaddr+memsz>dom0_size || dom_mpaddr+filesz>dom0_size) {
752 printf("Domain0 doesn't fit in allocated space!\n");
753 while(1);
754 }
755 dom_imva = (unsigned long) __va(dom_mpaddr + dom0_start);
756 copy_memory((void *) dom_imva, (void *) elfaddr, filesz);
757 if (memsz > filesz) memset((void *) dom_imva+filesz, 0, memsz-filesz);
758 //FIXME: This test for code seems to find a lot more than objdump -x does
759 if (phdr.p_flags & PF_X) privify_memory(dom_imva,filesz);
760 }
761 else
762 #endif
763 while (memsz > 0) {
764 p = assign_new_domain_page(d,dom_mpaddr);
765 if (unlikely(!p)) BUG();
766 dom_imva = (unsigned long) __va(page_to_maddr(p));
767 if (filesz > 0) {
768 if (filesz >= PAGE_SIZE)
769 copy_memory((void *) dom_imva, (void *) elfaddr, PAGE_SIZE);
770 else { // copy partial page, zero the rest of page
771 copy_memory((void *) dom_imva, (void *) elfaddr, filesz);
772 memset((void *) dom_imva+filesz, 0, PAGE_SIZE-filesz);
773 }
774 //FIXME: This test for code seems to find a lot more than objdump -x does
775 if (phdr.p_flags & PF_X)
776 privify_memory(dom_imva,PAGE_SIZE);
777 }
778 else if (memsz > 0) // always zero out entire page
779 memset((void *) dom_imva, 0, PAGE_SIZE);
780 memsz -= PAGE_SIZE; filesz -= PAGE_SIZE;
781 elfaddr += PAGE_SIZE; dom_mpaddr += PAGE_SIZE;
782 }
783 }
784 }
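/* Reserve a contiguous chunk of boot memory (dom0_size bytes, aligned to
 * dom0_align) in which dom0 will be built. */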
786 void alloc_dom0(void)
787 {
788 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
789 if (platform_is_hp_ski()) {
790 dom0_size = 128*1024*1024; //FIXME: Should be configurable
791 }
792 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
794 /* FIXME: The first chunk (say 256M) should always be assigned to
795 * Dom0, since Dom0's physical address == machine address for DMA purposes.
796 * Some older Linux versions, such as 2.4, assume physical memory exists
797 * in the second 64M of the address space.
798 */
799 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
800 dom0_start <<= PAGE_SHIFT;
801 if (!dom0_start) {
802 printf("alloc_dom0: can't allocate contiguous memory size=%lu\n",
803 dom0_size);
804 while(1);
805 }
806 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
807 #else
808 dom0_start = 0;
809 #endif
811 }
814 /*
815 * Domain 0 has unrestricted access to all devices. The main point of
816 * this stub, however, is to allow alloc_dom_mem to handle requests with
817 * order > 0; Dom0 needs that permission in order to allocate memory
818 * for other domains.
819 */
820 static void physdev_init_dom0(struct domain *d)
821 {
822 if (iomem_permit_access(d, 0UL, ~0UL))
823 BUG();
824 if (irqs_permit_access(d, 0, NR_PIRQS-1))
825 BUG();
826 }
828 static unsigned int vmx_dom0 = 0;
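/* Build domain 0: load its kernel ELF image, copy the initial ramdisk, set up
 * the start_info page, allocate its vcpus, and create the boot vcpu context. */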
829 int construct_dom0(struct domain *d,
830 unsigned long image_start, unsigned long image_len,
831 unsigned long initrd_start, unsigned long initrd_len,
832 char *cmdline)
833 {
834 int i, rc;
835 unsigned long alloc_start, alloc_end;
836 start_info_t *si;
837 struct vcpu *v = d->vcpu[0];
839 struct domain_setup_info dsi;
840 unsigned long p_start;
841 unsigned long pkern_start;
842 unsigned long pkern_entry;
843 unsigned long pkern_end;
844 unsigned long pinitrd_start = 0;
845 unsigned long pstart_info;
846 #if 0
847 char *dst;
848 unsigned long nr_pt_pages;
849 unsigned long count;
850 #endif
851 #ifdef VALIDATE_VT
852 unsigned long mfn;
853 struct page_info *page = NULL;
854 #endif
856 //printf("construct_dom0: starting\n");
858 /* Sanity! */
859 BUG_ON(d != dom0);
860 BUG_ON(d->vcpu[0] == NULL);
861 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
863 memset(&dsi, 0, sizeof(struct domain_setup_info));
865 printk("*** LOADING DOMAIN 0 ***\n");
867 alloc_start = dom0_start;
868 alloc_end = dom0_start + dom0_size;
869 d->tot_pages = d->max_pages = dom0_size/PAGE_SIZE;
870 dsi.image_addr = (unsigned long)image_start;
871 dsi.image_len = image_len;
872 rc = parseelfimage(&dsi);
873 if ( rc != 0 )
874 return rc;
876 #ifdef VALIDATE_VT
877 /* Temp workaround */
878 if (running_on_sim)
879 dsi.xen_section_string = (char *)1;
881 /* Check whether dom0 is vti domain */
882 if ((!vmx_enabled) && !dsi.xen_section_string) {
883 printk("Lack of hardware support for unmodified vmx dom0\n");
884 panic("");
885 }
887 if (vmx_enabled && !dsi.xen_section_string) {
888 printk("Dom0 is vmx domain!\n");
889 vmx_dom0 = 1;
890 }
891 #endif
893 p_start = dsi.v_start;
894 pkern_start = dsi.v_kernstart;
895 pkern_end = dsi.v_kernend;
896 pkern_entry = dsi.v_kernentry;
898 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
900 if ( (p_start & (PAGE_SIZE-1)) != 0 )
901 {
902 printk("Initial guest OS must load to a page boundary.\n");
903 return -EINVAL;
904 }
906 if(initrd_start&&initrd_len){
907 pinitrd_start=(dom0_start+dom0_size) -
908 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
910 memcpy(__va(pinitrd_start), (void *) initrd_start, initrd_len);
911 pstart_info = PAGE_ALIGN(pinitrd_start + initrd_len);
912 } else {
913 pstart_info = PAGE_ALIGN(pkern_end);
914 }
916 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
917 " Kernel image: %lx->%lx\n"
918 " Entry address: %lx\n"
919 " Init. ramdisk: %lx len %lx\n"
920 " Start info.: %lx->%lx\n",
921 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
922 pstart_info, pstart_info + PAGE_SIZE);
924 if ( (pkern_end - pkern_start) > (d->max_pages * PAGE_SIZE) )
925 {
926 printk("Initial guest OS requires too much space\n"
927 "(%luMB is greater than %luMB limit)\n",
928 (pkern_end-pkern_start)>>20,
929 (unsigned long) (d->max_pages<<PAGE_SHIFT)>>20);
930 return -ENOMEM;
931 }
933 // if high 3 bits of pkern start are non-zero, error
935 // if pkern end is after end of metaphysical memory, error
936 // (we should be able to deal with this... later)
939 //
941 #if 0
942 strcpy(d->name,"Domain0");
943 #endif
945 /* Mask all upcalls... */
946 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
947 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
949 if (dom0_max_vcpus == 0)
950 dom0_max_vcpus = MAX_VIRT_CPUS;
951 if (dom0_max_vcpus > num_online_cpus())
952 dom0_max_vcpus = num_online_cpus();
953 if (dom0_max_vcpus > MAX_VIRT_CPUS)
954 dom0_max_vcpus = MAX_VIRT_CPUS;
956 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
957 for ( i = 1; i < dom0_max_vcpus; i++ )
958 if (alloc_vcpu(d, i, i) == NULL)
959 printf ("Cannot allocate dom0 vcpu %d\n", i);
961 #ifdef VALIDATE_VT
962 /* Construct a frame-allocation list for the initial domain, since these
963 * pages are allocated by the boot allocator and their pfns are not set properly.
964 */
965 for ( mfn = (alloc_start>>PAGE_SHIFT);
966 mfn < (alloc_end>>PAGE_SHIFT);
967 mfn++ )
968 {
969 page = &frame_table[mfn];
970 page_set_owner(page, d);
971 page->u.inuse.type_info = 0;
972 page->count_info = PGC_allocated | 1;
973 list_add_tail(&page->list, &d->page_list);
975 /* Construct 1:1 mapping */
976 machine_to_phys_mapping[mfn] = mfn;
977 }
979 #endif
981 /* Copy the OS image. */
982 loaddomainelfimage(d,image_start);
984 /* Copy the initial ramdisk. */
985 //if ( initrd_len != 0 )
986 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
989 /* Set up start info area. */
990 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
991 si = __va(pstart_info);
992 memset(si, 0, PAGE_SIZE);
993 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
994 si->nr_pages = d->tot_pages;
996 #if 0
997 si->shared_info = virt_to_maddr(d->shared_info);
998 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
999 //si->pt_base = vpt_start;
1000 //si->nr_pt_frames = nr_pt_pages;
1001 //si->mfn_list = vphysmap_start;
1003 if ( initrd_len != 0 )
1004 {
1005 //si->mod_start = vinitrd_start;
1006 si->mod_len = initrd_len;
1007 printk("Initrd len 0x%lx, start at 0x%08lx\n",
1008 si->mod_len, si->mod_start);
1009 }
1011 dst = si->cmd_line;
1012 if ( cmdline != NULL )
1013 {
1014 for ( i = 0; i < 255; i++ )
1015 {
1016 if ( cmdline[i] == '\0' )
1017 break;
1018 *dst++ = cmdline[i];
1019 }
1020 }
1021 *dst = '\0';
1023 zap_low_mappings(); /* Do the same for the idle page tables. */
1024 #endif
1026 /* Give up the VGA console if DOM0 is configured to grab it. */
1027 if (cmdline != NULL)
1028 console_endboot(strstr(cmdline, "tty0") != NULL);
1030 /* VMX-specific construction for Dom0, if the hardware supports VMX
1031 * and Dom0 is an unmodified image.
1032 */
1033 printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
1034 if (vmx_dom0)
1035 vmx_final_setup_guest(v);
1037 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1039 new_thread(v, pkern_entry, 0, 0);
1040 physdev_init_dom0(d);
1041 sync_split_caches();
1043 // FIXME: Hack for keyboard input
1044 serial_input_init();
1046 return 0;
1047 }
1049 void machine_restart(char * __unused)
1050 {
1051 if (platform_is_hp_ski()) dummy();
1052 printf("machine_restart called: spinning....\n");
1053 while(1);
1054 }
1056 void machine_halt(void)
1057 {
1058 if (platform_is_hp_ski()) dummy();
1059 printf("machine_halt called: spinning....\n");
1060 while(1);
1061 }
1063 void dummy_called(char *function)
1064 {
1065 if (platform_is_hp_ski()) asm("break 0;;");
1066 printf("dummy called in %s: spinning....\n", function);
1067 while(1);
1068 }
1071 #if 0
1072 void switch_to(struct vcpu *prev, struct vcpu *next)
1073 {
1074 struct vcpu *last;
1076 __switch_to(prev,next,last);
1077 //set_current(next);
1078 }
1079 #endif
1081 void domain_pend_keyboard_interrupt(int irq)
1082 {
1083 vcpu_pend_interrupt(dom0->vcpu[0],irq);
1084 }
1086 void sync_vcpu_execstate(struct vcpu *v)
1087 {
1088 ia64_save_fpu(v->arch._thread.fph);
1089 if (VMX_DOMAIN(v))
1090 vmx_save_state(v);
1091 else {
1092 if (IA64_HAS_EXTRA_STATE(v))
1093 ia64_save_extra(v);
1094 }
1095 // FIXME SMP: Anything else needed here for SMP?
1096 }
1098 // FIXME: It would be nice to print out a nice error message for bad
1099 // values of these boot-time parameters, but it seems we are too early
1100 // in the boot and attempts to print freeze the system?
1101 #define abort(x...) do {} while(0)
1102 #define warn(x...) do {} while(0)
1104 static void parse_dom0_mem(char *s)
1105 {
1106 unsigned long bytes = parse_size_and_unit(s);
1108 if (dom0_size < 4 * 1024 * 1024) {
1109 abort("parse_dom0_mem: too small, boot aborted"
1110 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1111 }
1112 if (dom0_size % dom0_align) {
1113 dom0_size = ((dom0_size / dom0_align) + 1) * dom0_align;
1114 warn("parse_dom0_mem: dom0_size rounded up from"
1115 " %lx to %lx bytes, due to dom0_align=%lx\n",
1116 bytes,dom0_size,dom0_align);
1117 }
1118 else dom0_size = bytes;
1119 }
1120 custom_param("dom0_mem", parse_dom0_mem);
1123 static void parse_dom0_align(char *s)
1124 {
1125 unsigned long bytes = parse_size_and_unit(s);
1127 if (bytes & (bytes - 1)) { /* not a power of two */
1128 abort("parse_dom0_align: dom0_align must be power of two, "
1129 "boot aborted"
1130 " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
1131 }
1132 else if (bytes < PAGE_SIZE) {
1133 abort("parse_dom0_align: dom0_align must be >= %ld, "
1134 "boot aborted"
1135 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
1136 PAGE_SIZE);
1137 }
1138 else dom0_align = bytes;
1139 if (dom0_size % dom0_align) {
1140 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
1141 warn("parse_dom0_align: dom0_size rounded up from"
1142 " %ld to %ld bytes, due to dom0_align=%lx\n",
1143 bytes,dom0_size,dom0_align);
1144 }
1145 }
1146 custom_param("dom0_align", parse_dom0_align);