ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 9685:4e8a64d8bd0e

[IA64] regionreg.c: deallocate metaphysical rids

allocate_rid_range also allocates metaphysical rids.
deallocate_rid_range also deallocates mp rids.
init_rid_allocator() added.

Signed-off-by: Tristan Gingold <tristan.gingold@bull.net>
author: awilliam@xenbuild.aw
date: Fri Apr 14 14:13:13 2006 -0600 (2006-04-14)
parents: db2bd8169e9b
children: 96bc87dd7ca9
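
The pairing described above is visible in the file below: arch_domain_create() obtains the domain's rid range through allocate_rid_range(), which with this change also reserves the metaphysical rids behind d->arch.metaphysical_rr0/rr4, and arch_domain_destroy() gives both back through deallocate_rid_range(). A rough sketch of that lifecycle only, with error handling and all other domain setup elided (the actual rid bookkeeping lives in regionreg.c, not here):

/* Illustrative sketch, not the real implementation; see the full
 * functions in the listing below. */
int arch_domain_create(struct domain *d)
{
    /* With this changeset, allocate_rid_range() also picks the
     * metaphysical rids that the vcpu setup later copies from d->arch. */
    if (!allocate_rid_range(d, 0))
        return -ENOMEM;
    return 0;
}

void arch_domain_destroy(struct domain *d)
{
    /* ...and deallocate_rid_range() now releases the metaphysical
     * rids along with the normal rid range. */
    deallocate_rid_range(d);
}
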
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <xen/mm.h>
22 #include <xen/iocap.h>
23 #include <asm/ptrace.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/hw_irq.h>
29 #include <asm/setup.h>
30 //#include <asm/mpspec.h>
31 #include <xen/irq.h>
32 #include <xen/event.h>
33 //#include <xen/shadow.h>
34 #include <xen/console.h>
35 #include <xen/compile.h>
37 #include <xen/elf.h>
38 //#include <asm/page.h>
39 #include <asm/pgalloc.h>
41 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
43 #include <asm/vcpu.h> /* for function declarations */
44 #include <public/arch-ia64.h>
45 #include <asm/vmx.h>
46 #include <asm/vmx_vcpu.h>
47 #include <asm/vmx_vpd.h>
48 #include <asm/vmx_phy_mode.h>
49 #include <asm/pal.h>
50 #include <asm/vhpt.h>
51 #include <public/hvm/ioreq.h>
52 #include <public/arch-ia64.h>
53 #include <asm/tlbflush.h>
54 #include <asm/regionreg.h>
55 #include <asm/dom_fw.h>
57 #define CONFIG_DOMAIN0_CONTIGUOUS
58 unsigned long dom0_start = -1L;
59 unsigned long dom0_size = 512*1024*1024;
60 unsigned long dom0_align = 64*1024*1024;
62 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
63 static unsigned int dom0_max_vcpus = 1;
64 integer_param("dom0_max_vcpus", dom0_max_vcpus);
66 // initialized by arch/ia64/setup.c:find_initrd()
67 unsigned long initrd_start = 0, initrd_end = 0;
68 extern unsigned long running_on_sim;
70 #define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))
72 /* FIXME: where should these declarations live? */
73 extern void domain_pend_keyboard_interrupt(int);
74 extern long platform_is_hp_ski(void);
75 extern void sync_split_caches(void);
76 extern void serial_input_init(void);
78 static void init_switch_stack(struct vcpu *v);
79 void build_physmap_table(struct domain *d);
81 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
82 void arch_domain_destroy(struct domain *d)
83 {
84 struct page_info *page;
85 struct list_head *ent, *prev;
87 if (d->arch.mm->pgd != NULL)
88 {
89 list_for_each ( ent, &d->arch.mm->pt_list )
90 {
91 page = list_entry(ent, struct page_info, list);
92 prev = ent->prev;
93 list_del(ent);
94 free_xenheap_page(page_to_virt(page));
95 ent = prev;
96 }
97 pgd_free(d->arch.mm->pgd);
98 }
99 if (d->arch.mm != NULL)
100 xfree(d->arch.mm);
101 if (d->shared_info != NULL)
102 free_xenheap_page(d->shared_info);
104 deallocate_rid_range(d);
106 /* Is this really needed here? */
107 flush_tlb_all();
109 /* Is this really needed here? */
110 vhpt_flush_all();
111 }
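112 /* Halt the processor until the next interrupt when no softirq is pending. */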
113 static void default_idle(void)
114 {
115 int cpu = smp_processor_id();
116 local_irq_disable();
117 if ( !softirq_pending(cpu))
118 safe_halt();
119 local_irq_enable();
120 }
122 static void continue_cpu_idle_loop(void)
123 {
124 int cpu = smp_processor_id();
125 for ( ; ; )
126 {
127 #ifdef IA64
128 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
129 #else
130 irq_stat[cpu].idle_timestamp = jiffies;
131 #endif
132 while ( !softirq_pending(cpu) )
133 default_idle();
134 add_preempt_count(SOFTIRQ_OFFSET);
135 raise_softirq(SCHEDULE_SOFTIRQ);
136 do_softirq();
137 sub_preempt_count(SOFTIRQ_OFFSET);
138 }
139 }
141 void startup_cpu_idle_loop(void)
142 {
143 /* Just some sanity to ensure that the scheduler is set up okay. */
144 ASSERT(current->domain == IDLE_DOMAIN_ID);
145 raise_softirq(SCHEDULE_SOFTIRQ);
146 #if 0
147 //do we have to ensure the idle task has a shared page so that, for example,
148 //region registers can be loaded from it. Apparently not...
149 idle0_task.shared_info = (void *)alloc_xenheap_page();
150 memset(idle0_task.shared_info, 0, PAGE_SIZE);
151 /* pin mapping */
152 // FIXME: Does this belong here? Or do only at domain switch time?
153 {
154 /* WARNING: following must be inlined to avoid nested fault */
155 unsigned long psr = ia64_clear_ic();
156 ia64_itr(0x2, IA64_TR_SHARED_INFO, SHAREDINFO_ADDR,
157 pte_val(pfn_pte(ia64_tpa(idle0_task.shared_info) >> PAGE_SHIFT, PAGE_KERNEL)),
158 PAGE_SHIFT);
159 ia64_set_psr(psr);
160 ia64_srlz_i();
161 }
162 #endif
164 continue_cpu_idle_loop();
165 }
167 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
168 {
169 struct vcpu *v;
170 struct thread_info *ti;
172 /* Keep idle vcpu0 statically allocated at compile time, because
173 * some code inherited from Linux still requires it during early boot.
174 */
175 if (is_idle_domain(d) && !vcpu_id)
176 v = idle_vcpu[0];
177 else {
178 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
179 return NULL;
180 memset(v, 0, sizeof(*v));
182 ti = alloc_thread_info(v);
183 /* Clear thread_info to clear some important fields, like
184 * preempt_count
185 */
186 memset(ti, 0, sizeof(struct thread_info));
187 init_switch_stack(v);
188 }
190 if (!is_idle_domain(d)) {
191 v->arch.privregs =
192 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
193 BUG_ON(v->arch.privregs == NULL);
194 memset(v->arch.privregs, 0, PAGE_SIZE);
196 if (!vcpu_id)
197 memset(&d->shared_info->evtchn_mask[0], 0xff,
198 sizeof(d->shared_info->evtchn_mask));
200 v->vcpu_info = &(d->shared_info->vcpu_info[0]);
201 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
202 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
203 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
204 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
206 /* Is this correct?
207 It depends on how the domain uses rids.
209 A domain may share rids among its vcpus (e.g. when using a
210 global VHPT). In that case the rids should also be shared
211 among the vcpus and the rid range should be the same.
213 However, a domain may instead use per-cpu rid allocation. In
214 that case we don't want to share rids among vcpus, though we
215 may do so when two vcpus run on the same physical cpu... */
217 v->arch.starting_rid = d->arch.starting_rid;
218 v->arch.ending_rid = d->arch.ending_rid;
219 v->arch.breakimm = d->arch.breakimm;
220 }
222 return v;
223 }
225 void free_vcpu_struct(struct vcpu *v)
226 {
227 if (VMX_DOMAIN(v))
228 vmx_relinquish_vcpu_resources(v);
229 else {
230 if (v->arch.privregs != NULL)
231 free_xenheap_pages(v->arch.privregs, get_order(sizeof(mapped_regs_t)));
232 }
234 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
235 }
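236 /* Set up the initial switch_stack/pt_regs frame so a new vcpu resumes execution in ia64_ret_from_clone. */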
237 static void init_switch_stack(struct vcpu *v)
238 {
239 struct pt_regs *regs = vcpu_regs (v);
240 struct switch_stack *sw = (struct switch_stack *) regs - 1;
241 extern void ia64_ret_from_clone;
243 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
244 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
245 sw->b0 = (unsigned long) &ia64_ret_from_clone;
246 sw->ar_fpsr = FPSR_DEFAULT;
247 v->arch._thread.ksp = (unsigned long) sw - 16;
248 // stay on kernel stack because may get interrupts!
249 // ia64_ret_from_clone (which b0 gets in new_thread) switches
250 // to user stack
251 v->arch._thread.on_ustack = 0;
252 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
253 }
255 int arch_domain_create(struct domain *d)
256 {
257 // the following will eventually need to be negotiated dynamically
258 d->xen_vastart = XEN_START_ADDR;
259 d->xen_vaend = XEN_END_ADDR;
260 d->shared_info_va = SHAREDINFO_ADDR;
262 if (is_idle_domain(d))
263 return 0;
265 if ((d->shared_info = (void *)alloc_xenheap_page()) == NULL)
266 goto fail_nomem;
267 memset(d->shared_info, 0, PAGE_SIZE);
269 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
270 /* We may also need an emulation rid for region 4, though a guest
271 * is unlikely to issue uncacheable accesses in metaphysical mode.
272 * Still, keeping that information here is probably saner.
273 */
274 if (!allocate_rid_range(d,0))
275 goto fail_nomem;
276 d->arch.breakimm = 0x1000;
277 d->arch.sys_pgnr = 0;
279 if ((d->arch.mm = xmalloc(struct mm_struct)) == NULL)
280 goto fail_nomem;
281 memset(d->arch.mm, 0, sizeof(*d->arch.mm));
282 INIT_LIST_HEAD(&d->arch.mm->pt_list);
284 d->arch.physmap_built = 0;
285 if ((d->arch.mm->pgd = pgd_alloc(d->arch.mm)) == NULL)
286 goto fail_nomem;
288 printf ("arch_domain_create: domain=%p\n", d);
289 return 0;
291 fail_nomem:
292 if (d->arch.mm->pgd != NULL)
293 pgd_free(d->arch.mm->pgd);
294 if (d->arch.mm != NULL)
295 xfree(d->arch.mm);
296 if (d->shared_info != NULL)
297 free_xenheap_page(d->shared_info);
298 return -ENOMEM;
299 }
301 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
302 {
303 struct pt_regs *regs = vcpu_regs (v);
305 printf("arch_getdomaininfo_ctxt\n");
306 c->regs = *regs;
307 c->vcpu.evtchn_vector = v->vcpu_info->arch.evtchn_vector;
309 c->shared = v->domain->shared_info->arch;
310 }
312 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
313 {
314 struct pt_regs *regs = vcpu_regs (v);
315 struct domain *d = v->domain;
317 printf("arch_set_info_guest\n");
318 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
319 return 0;
320 if (c->flags & VGCF_VMX_GUEST) {
321 if (!vmx_enabled) {
322 printk("No VMX hardware feature for vmx domain.\n");
323 return -EINVAL;
324 }
326 if (v == d->vcpu[0])
327 vmx_setup_platform(d, c);
329 vmx_final_setup_guest(v);
330 } else if (!d->arch.physmap_built)
331 build_physmap_table(d);
333 *regs = c->regs;
334 if (v == d->vcpu[0]) {
335 /* Only for first vcpu. */
336 d->arch.sys_pgnr = c->sys_pgnr;
337 d->arch.initrd_start = c->initrd.start;
338 d->arch.initrd_len = c->initrd.size;
339 d->arch.cmdline = c->cmdline;
340 d->shared_info->arch = c->shared;
342 /* FIXME: is this required here? */
343 sync_split_caches();
344 }
345 new_thread(v, regs->cr_iip, 0, 0);
347 v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector;
348 if ( c->vcpu.privregs && copy_from_user(v->arch.privregs,
349 c->vcpu.privregs, sizeof(mapped_regs_t))) {
350 printk("Bad ctxt address in arch_set_info_guest: %p\n",
351 c->vcpu.privregs);
352 return -EFAULT;
353 }
355 v->arch.domain_itm_last = -1L;
357 /* Don't redo final setup */
358 set_bit(_VCPUF_initialised, &v->vcpu_flags);
359 return 0;
360 }
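361 /* Drop the type and allocation references held on each page of 'list' so the pages can be freed while the domain is destroyed. */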
362 static void relinquish_memory(struct domain *d, struct list_head *list)
363 {
364 struct list_head *ent;
365 struct page_info *page;
366 #ifndef __ia64__
367 unsigned long x, y;
368 #endif
370 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
371 spin_lock_recursive(&d->page_alloc_lock);
372 ent = list->next;
373 while ( ent != list )
374 {
375 page = list_entry(ent, struct page_info, list);
376 /* Grab a reference to the page so it won't disappear from under us. */
377 if ( unlikely(!get_page(page, d)) )
378 {
379 /* Couldn't get a reference -- someone is freeing this page. */
380 ent = ent->next;
381 continue;
382 }
384 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
385 put_page_and_type(page);
387 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
388 put_page(page);
390 #ifndef __ia64__
391 /*
392 * Forcibly invalidate base page tables at this point to break circular
393 * 'linear page table' references. This is okay because MMU structures
394 * are not shared across domains and this domain is now dead. Thus base
395 * tables are not in use so a non-zero count means circular reference.
396 */
397 y = page->u.inuse.type_info;
398 for ( ; ; )
399 {
400 x = y;
401 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
402 (PGT_base_page_table|PGT_validated)) )
403 break;
405 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
406 if ( likely(y == x) )
407 {
408 free_page_type(page, PGT_base_page_table);
409 break;
410 }
411 }
412 #endif
414 /* Follow the list chain and /then/ potentially free the page. */
415 ent = ent->next;
416 put_page(page);
417 }
419 spin_unlock_recursive(&d->page_alloc_lock);
420 }
422 void domain_relinquish_resources(struct domain *d)
423 {
424 /* Relinquish every page of memory. */
426 /* xenheap_list is not used in ia64. */
427 BUG_ON(!list_empty(&d->xenpage_list));
429 relinquish_memory(d, &d->page_list);
430 }
432 // heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
433 // and linux/arch/ia64/kernel/process.c:kernel_thread()
434 void new_thread(struct vcpu *v,
435 unsigned long start_pc,
436 unsigned long start_stack,
437 unsigned long start_info)
438 {
439 struct domain *d = v->domain;
440 struct pt_regs *regs;
441 extern char dom0_command_line[];
443 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
444 if (d == dom0 && v->vcpu_id == 0) start_pc += dom0_start;
445 #endif
447 regs = vcpu_regs (v);
448 if (VMX_DOMAIN(v)) {
449 /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
450 regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
451 } else {
452 regs->cr_ipsr = ia64_getreg(_IA64_REG_PSR)
453 | IA64_PSR_BITS_TO_SET | IA64_PSR_BN;
454 regs->cr_ipsr &= ~(IA64_PSR_BITS_TO_CLEAR
455 | IA64_PSR_RI | IA64_PSR_IS);
456 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
457 }
458 regs->cr_iip = start_pc;
459 regs->cr_ifs = 1UL << 63; /* or clear? */
460 regs->ar_fpsr = FPSR_DEFAULT;
462 if (VMX_DOMAIN(v)) {
463 vmx_init_all_rr(v);
464 if (d == dom0)
465 regs->r28 = dom_fw_setup(d,dom0_command_line,
466 COMMAND_LINE_SIZE);
467 /* Virtual processor context setup */
468 VCPU(v, vpsr) = IA64_PSR_BN;
469 VCPU(v, dcr) = 0;
470 } else {
471 init_all_rr(v);
472 if (v->vcpu_id == 0) {
473 /* Build the firmware. */
474 if (d == dom0)
475 regs->r28 = dom_fw_setup(d,dom0_command_line,
476 COMMAND_LINE_SIZE);
477 else {
478 const char *cmdline = d->arch.cmdline;
479 int len;
481 if (*cmdline == 0) {
482 #define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
483 cmdline = DEFAULT_CMDLINE;
484 len = sizeof (DEFAULT_CMDLINE);
485 printf("domU command line defaulted to"
486 DEFAULT_CMDLINE "\n");
487 }
488 else
489 len = IA64_COMMAND_LINE_SIZE;
491 regs->r28 = dom_fw_setup (d, cmdline, len);
492 }
493 d->shared_info->arch.flags = (d == dom0) ?
494 (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
495 }
496 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
497 VCPU(v, banknum) = 1;
498 VCPU(v, metaphysical_mode) = 1;
499 VCPU(v, interrupt_mask_addr) =
500 (uint64_t)SHAREDINFO_ADDR + INT_ENABLE_OFFSET(v);
501 VCPU(v, itv) = (1 << 16); /* timer vector masked */
502 }
503 }
506 /* Allocate a new page for domain and map it to the specified metaphysical
507 address. */
508 static struct page_info * assign_new_domain_page(struct domain *d, unsigned long mpaddr)
509 {
510 unsigned long maddr;
511 struct page_info *p;
513 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
514 if (d == dom0) {
515 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
516 /* FIXME: is it true ?
517 dom0 memory is not contiguous! */
518 printk("assign_new_domain_page: bad domain0 "
519 "mpaddr=%lx, start=%lx, end=%lx!\n",
520 mpaddr, dom0_start, dom0_start+dom0_size);
521 while(1);
522 }
523 p = mfn_to_page((mpaddr >> PAGE_SHIFT));
524 }
525 else
526 #endif
527 {
528 p = alloc_domheap_page(d);
529 // zero out pages for security reasons
530 if (p) memset(__va(page_to_maddr(p)),0,PAGE_SIZE);
531 }
532 if (unlikely(!p)) {
533 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
534 return(p);
535 }
536 maddr = page_to_maddr (p);
537 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
538 && maddr < __get_cpu_var(vhpt_pend))) {
539 /* FIXME: how can this happen ?
540 vhpt is allocated by alloc_domheap_page. */
541 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
542 maddr);
543 }
544 assign_domain_page (d, mpaddr, maddr);
545 return p;
546 }
548 /* map a physical address to the specified metaphysical addr */
549 void assign_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr)
550 {
551 struct mm_struct *mm = d->arch.mm;
552 struct page_info *pt;
553 pgd_t *pgd;
554 pud_t *pud;
555 pmd_t *pmd;
556 pte_t *pte;
558 if (!mm->pgd) {
559 printk("assign_domain_page: domain pgd must exist!\n");
560 return;
561 }
562 pgd = pgd_offset(mm,mpaddr);
563 if (pgd_none(*pgd))
564 {
565 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
566 pt = maddr_to_page(pgd_val(*pgd));
567 list_add_tail(&pt->list, &d->arch.mm->pt_list);
568 }
570 pud = pud_offset(pgd, mpaddr);
571 if (pud_none(*pud))
572 {
573 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
574 pt = maddr_to_page(pud_val(*pud));
575 list_add_tail(&pt->list, &d->arch.mm->pt_list);
576 }
578 pmd = pmd_offset(pud, mpaddr);
579 if (pmd_none(*pmd))
580 {
581 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm,mpaddr));
582 // pmd_populate(mm, pmd, pte_alloc_one(mm,mpaddr));
583 pt = maddr_to_page(pmd_val(*pmd));
584 list_add_tail(&pt->list, &d->arch.mm->pt_list);
585 }
587 pte = pte_offset_map(pmd, mpaddr);
588 if (pte_none(*pte)) {
589 set_pte(pte, pfn_pte(physaddr >> PAGE_SHIFT,
590 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
591 }
592 else printk("assign_domain_page: mpaddr %lx already mapped!\n",mpaddr);
593 if((physaddr>>PAGE_SHIFT)<max_page){
594 *(mpt_table + (physaddr>>PAGE_SHIFT))=(mpaddr>>PAGE_SHIFT);
595 }
596 }
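597 /* Map every page already on d->page_list at consecutive metaphysical addresses, starting from 0. */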
598 void build_physmap_table(struct domain *d)
599 {
600 struct list_head *list_ent = d->page_list.next;
601 unsigned long mfn, i = 0;
603 ASSERT(!d->arch.physmap_built);
604 while(list_ent != &d->page_list) {
605 mfn = page_to_mfn(list_entry(
606 list_ent, struct page_info, list));
607 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
609 i++;
610 list_ent = mfn_to_page(mfn)->list.next;
611 }
612 d->arch.physmap_built = 1;
613 }
615 void mpafoo(unsigned long mpaddr)
616 {
617 extern unsigned long privop_trace;
618 if (mpaddr == 0x3800)
619 privop_trace = 1;
620 }
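621 /* Return the raw PTE mapping metaphysical address 'mpaddr' in domain 'd', or 0 (GPFN_INV_MASK for a VMX domain) when nothing is mapped there. */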
622 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
623 {
624 struct mm_struct *mm = d->arch.mm;
625 pgd_t *pgd = pgd_offset(mm, mpaddr);
626 pud_t *pud;
627 pmd_t *pmd;
628 pte_t *pte;
630 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
631 if (d == dom0) {
632 pte_t pteval;
633 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
634 //printk("lookup_domain_mpa: bad dom0 mpaddr 0x%lx!\n",mpaddr);
635 //printk("lookup_domain_mpa: start=0x%lx,end=0x%lx!\n",dom0_start,dom0_start+dom0_size);
636 mpafoo(mpaddr);
637 }
638 pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
639 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
640 pte = &pteval;
641 return *(unsigned long *)pte;
642 }
643 #endif
644 if (pgd_present(*pgd)) {
645 pud = pud_offset(pgd,mpaddr);
646 if (pud_present(*pud)) {
647 pmd = pmd_offset(pud,mpaddr);
648 if (pmd_present(*pmd)) {
649 pte = pte_offset_map(pmd,mpaddr);
650 if (pte_present(*pte)) {
651 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
652 return *(unsigned long *)pte;
653 } else if (VMX_DOMAIN(d->vcpu[0]))
654 return GPFN_INV_MASK;
655 }
656 }
657 }
658 if ((mpaddr >> PAGE_SHIFT) < d->max_pages) {
659 printk("lookup_domain_mpa: non-allocated mpa 0x%lx (< 0x%lx)\n",
660 mpaddr, (unsigned long) d->max_pages<<PAGE_SHIFT);
661 } else
662 printk("lookup_domain_mpa: bad mpa 0x%lx (> 0x%lx)\n",
663 mpaddr, (unsigned long) d->max_pages<<PAGE_SHIFT);
664 mpafoo(mpaddr);
665 return 0;
666 }
668 /* Flush cache of domain d. */
669 void domain_cache_flush (struct domain *d, int sync_only)
670 {
671 struct mm_struct *mm = d->arch.mm;
672 pgd_t *pgd = mm->pgd;
673 unsigned long maddr;
674 int i,j,k, l;
675 int nbr_page = 0;
676 void (*flush_func)(unsigned long start, unsigned long end);
677 extern void flush_dcache_range (unsigned long, unsigned long);
679 if (sync_only)
680 flush_func = &flush_icache_range;
681 else
682 flush_func = &flush_dcache_range;
684 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
685 if (d == dom0) {
686 /* This is not fully correct (because of holes), but it should
687 be enough for now. */
688 (*flush_func)(__va_ul (dom0_start),
689 __va_ul (dom0_start + dom0_size));
690 return;
691 }
692 #endif
693 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
694 pud_t *pud;
695 if (!pgd_present(*pgd))
696 continue;
697 pud = pud_offset(pgd, 0);
698 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
699 pmd_t *pmd;
700 if (!pud_present(*pud))
701 continue;
702 pmd = pmd_offset(pud, 0);
703 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
704 pte_t *pte;
705 if (!pmd_present(*pmd))
706 continue;
707 pte = pte_offset_map(pmd, 0);
708 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
709 if (!pte_present(*pte))
710 continue;
711 /* Convert PTE to maddr. */
712 maddr = __va_ul (pte_val(*pte)
713 & _PAGE_PPN_MASK);
714 (*flush_func)(maddr, maddr+ PAGE_SIZE);
715 nbr_page++;
716 }
717 }
718 }
719 }
720 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
721 }
723 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
724 #if 1
725 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
726 {
727 unsigned long pte = lookup_domain_mpa(d,mpaddr);
728 unsigned long imva;
730 pte &= _PAGE_PPN_MASK;
731 imva = (unsigned long) __va(pte);
732 imva |= mpaddr & ~PAGE_MASK;
733 return(imva);
734 }
735 #else
736 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
737 {
738 unsigned long imva = __gpa_to_mpa(d, mpaddr);
740 return __va(imva);
741 }
742 #endif
744 // remove following line if not privifying in memory
745 //#define HAVE_PRIVIFY_MEMORY
746 #ifndef HAVE_PRIVIFY_MEMORY
747 #define privify_memory(x,y) do {} while(0)
748 #endif
750 // see arch/x86/xxx/domain_build.c
751 int elf_sanity_check(Elf_Ehdr *ehdr)
752 {
753 return (IS_ELF(*ehdr));
754 }
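755 /* Copy 'size' bytes from 'src' to 'dst', where 'src' may be either a Xen virtual address or a guest (user) address. */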
756 static void copy_memory(void *dst, void *src, int size)
757 {
758 int remain;
760 if (IS_XEN_ADDRESS(dom0,(unsigned long) src)) {
761 memcpy(dst,src,size);
762 }
763 else {
764 printf("About to call __copy_from_user(%p,%p,%d)\n",
765 dst,src,size);
766 while ((remain = __copy_from_user(dst,src,size)) != 0) {
767 printf("incomplete user copy, %d remain of %d\n",
768 remain,size);
769 dst += size - remain; src += size - remain;
770 size -= remain;
771 }
772 }
773 }
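774 /* Copy the PT_LOAD segments of the guest ELF image into the domain's metaphysical memory, allocating fresh pages for domU. */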
775 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
776 {
777 char *elfbase = (char *) image_start;
778 //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
779 Elf_Ehdr ehdr;
780 Elf_Phdr phdr;
781 int h, filesz, memsz;
782 unsigned long elfaddr, dom_mpaddr, dom_imva;
783 struct page_info *p;
785 copy_memory(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
786 for ( h = 0; h < ehdr.e_phnum; h++ ) {
787 copy_memory(&phdr,elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
788 sizeof(Elf_Phdr));
789 //if ( !is_loadable_phdr(phdr) )
790 if ((phdr.p_type != PT_LOAD)) {
791 continue;
792 }
793 filesz = phdr.p_filesz; memsz = phdr.p_memsz;
794 elfaddr = (unsigned long) elfbase + phdr.p_offset;
795 dom_mpaddr = phdr.p_paddr;
796 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
797 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
798 if (d == dom0) {
799 if (dom_mpaddr+memsz>dom0_size || dom_mpaddr+filesz>dom0_size) {
800 printf("Domain0 doesn't fit in allocated space!\n");
801 while(1);
802 }
803 dom_imva = (unsigned long) __va(dom_mpaddr + dom0_start);
804 copy_memory((void *) dom_imva, (void *) elfaddr, filesz);
805 if (memsz > filesz) memset((void *) dom_imva+filesz, 0, memsz-filesz);
806 //FIXME: This test for code seems to find a lot more than objdump -x does
807 if (phdr.p_flags & PF_X) privify_memory(dom_imva,filesz);
808 }
809 else
810 #endif
811 while (memsz > 0) {
812 p = assign_new_domain_page(d,dom_mpaddr);
813 if (unlikely(!p)) BUG();
814 dom_imva = (unsigned long) __va(page_to_maddr(p));
815 if (filesz > 0) {
816 if (filesz >= PAGE_SIZE)
817 copy_memory((void *) dom_imva, (void *) elfaddr, PAGE_SIZE);
818 else { // copy partial page, zero the rest of page
819 copy_memory((void *) dom_imva, (void *) elfaddr, filesz);
820 memset((void *) dom_imva+filesz, 0, PAGE_SIZE-filesz);
821 }
822 //FIXME: This test for code seems to find a lot more than objdump -x does
823 if (phdr.p_flags & PF_X)
824 privify_memory(dom_imva,PAGE_SIZE);
825 }
826 else if (memsz > 0) // always zero out entire page
827 memset((void *) dom_imva, 0, PAGE_SIZE);
828 memsz -= PAGE_SIZE; filesz -= PAGE_SIZE;
829 elfaddr += PAGE_SIZE; dom_mpaddr += PAGE_SIZE;
830 }
831 }
832 }
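833 /* Reserve a physically contiguous chunk of boot memory for dom0 (when CONFIG_DOMAIN0_CONTIGUOUS is defined). */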
834 void alloc_dom0(void)
835 {
836 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
837 if (platform_is_hp_ski()) {
838 dom0_size = 128*1024*1024; //FIXME: Should be configurable
839 }
840 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
842 /* FIXME: The first chunk (say 256M) should always be assigned to
843 * Dom0, since Dom0's physical addresses equal machine addresses
844 * for DMA purposes. Some older Linux versions, like 2.4, assume
845 * physical memory exists in the second 64M of the address space.
846 */
847 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
848 dom0_start <<= PAGE_SHIFT;
849 if (!dom0_start) {
850 printf("alloc_dom0: can't allocate contiguous memory size=%lu\n",
851 dom0_size);
852 while(1);
853 }
854 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
855 #else
856 dom0_start = 0;
857 #endif
859 }
862 /*
863 * Domain 0 has unrestricted access to all devices. The main point
864 * of this stub, however, is to allow alloc_dom_mem to handle
865 * order > 0 requests: Dom0 needs that bit set in order to
866 * allocate memory for other domains.
867 */
868 static void physdev_init_dom0(struct domain *d)
869 {
870 if (iomem_permit_access(d, 0UL, ~0UL))
871 BUG();
872 if (irqs_permit_access(d, 0, NR_PIRQS-1))
873 BUG();
874 }
876 static unsigned int vmx_dom0 = 0;
877 int construct_dom0(struct domain *d,
878 unsigned long image_start, unsigned long image_len,
879 unsigned long initrd_start, unsigned long initrd_len,
880 char *cmdline)
881 {
882 int i, rc;
883 unsigned long alloc_start, alloc_end;
884 start_info_t *si;
885 struct vcpu *v = d->vcpu[0];
887 struct domain_setup_info dsi;
888 unsigned long p_start;
889 unsigned long pkern_start;
890 unsigned long pkern_entry;
891 unsigned long pkern_end;
892 unsigned long pinitrd_start = 0;
893 unsigned long pstart_info;
894 #if 0
895 char *dst;
896 unsigned long nr_pt_pages;
897 unsigned long count;
898 #endif
899 #ifdef VALIDATE_VT
900 unsigned long mfn;
901 struct page_info *page = NULL;
902 #endif
904 //printf("construct_dom0: starting\n");
906 /* Sanity! */
907 BUG_ON(d != dom0);
908 BUG_ON(d->vcpu[0] == NULL);
909 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
911 memset(&dsi, 0, sizeof(struct domain_setup_info));
913 printk("*** LOADING DOMAIN 0 ***\n");
915 alloc_start = dom0_start;
916 alloc_end = dom0_start + dom0_size;
917 d->tot_pages = d->max_pages = dom0_size/PAGE_SIZE;
918 dsi.image_addr = (unsigned long)image_start;
919 dsi.image_len = image_len;
920 rc = parseelfimage(&dsi);
921 if ( rc != 0 )
922 return rc;
924 #ifdef VALIDATE_VT
925 /* Temp workaround */
926 if (running_on_sim)
927 dsi.xen_section_string = (char *)1;
929 /* Check whether dom0 is vti domain */
930 if ((!vmx_enabled) && !dsi.xen_section_string) {
931 printk("Lack of hardware support for unmodified vmx dom0\n");
932 panic("");
933 }
935 if (vmx_enabled && !dsi.xen_section_string) {
936 printk("Dom0 is vmx domain!\n");
937 vmx_dom0 = 1;
938 }
939 #endif
941 p_start = dsi.v_start;
942 pkern_start = dsi.v_kernstart;
943 pkern_end = dsi.v_kernend;
944 pkern_entry = dsi.v_kernentry;
946 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
948 if ( (p_start & (PAGE_SIZE-1)) != 0 )
949 {
950 printk("Initial guest OS must load to a page boundary.\n");
951 return -EINVAL;
952 }
954 if(initrd_start&&initrd_len){
955 pinitrd_start=(dom0_start+dom0_size) -
956 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
958 memcpy(__va(pinitrd_start), (void *) initrd_start, initrd_len);
959 pstart_info = PAGE_ALIGN(pinitrd_start + initrd_len);
960 } else {
961 pstart_info = PAGE_ALIGN(pkern_end);
962 }
964 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
965 " Kernel image: %lx->%lx\n"
966 " Entry address: %lx\n"
967 " Init. ramdisk: %lx len %lx\n"
968 " Start info.: %lx->%lx\n",
969 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
970 pstart_info, pstart_info + PAGE_SIZE);
972 if ( (pkern_end - pkern_start) > (d->max_pages * PAGE_SIZE) )
973 {
974 printk("Initial guest OS requires too much space\n"
975 "(%luMB is greater than %luMB limit)\n",
976 (pkern_end-pkern_start)>>20,
977 (unsigned long) (d->max_pages<<PAGE_SHIFT)>>20);
978 return -ENOMEM;
979 }
981 // if high 3 bits of pkern start are non-zero, error
983 // if pkern end is after end of metaphysical memory, error
984 // (we should be able to deal with this... later)
987 //
989 #if 0
990 strcpy(d->name,"Domain0");
991 #endif
993 /* Mask all upcalls... */
994 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
995 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
997 if (dom0_max_vcpus == 0)
998 dom0_max_vcpus = MAX_VIRT_CPUS;
999 if (dom0_max_vcpus > num_online_cpus())
1000 dom0_max_vcpus = num_online_cpus();
1001 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1002 dom0_max_vcpus = MAX_VIRT_CPUS;
1004 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1005 for ( i = 1; i < dom0_max_vcpus; i++ )
1006 if (alloc_vcpu(d, i, i) == NULL)
1007 printf ("Cannot allocate dom0 vcpu %d\n", i);
1009 #ifdef VALIDATE_VT
1010 /* Construct a frame-allocation list for the initial domain, since
1011 * these pages are allocated by the boot allocator and their pfns
1012 * are not set up properly. */
1013 for ( mfn = (alloc_start>>PAGE_SHIFT);
1014 mfn < (alloc_end>>PAGE_SHIFT);
1015 mfn++ )
1016 {
1017 page = &frame_table[mfn];
1018 page_set_owner(page, d);
1019 page->u.inuse.type_info = 0;
1020 page->count_info = PGC_allocated | 1;
1021 list_add_tail(&page->list, &d->page_list);
1023 /* Construct 1:1 mapping */
1024 machine_to_phys_mapping[mfn] = mfn;
1025 }
1027 #endif
1029 /* Copy the OS image. */
1030 loaddomainelfimage(d,image_start);
1032 /* Copy the initial ramdisk. */
1033 //if ( initrd_len != 0 )
1034 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1037 /* Set up start info area. */
1038 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1039 si = __va(pstart_info);
1040 memset(si, 0, PAGE_SIZE);
1041 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
1042 si->nr_pages = d->tot_pages;
1044 #if 0
1045 si->shared_info = virt_to_maddr(d->shared_info);
1046 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
1047 //si->pt_base = vpt_start;
1048 //si->nr_pt_frames = nr_pt_pages;
1049 //si->mfn_list = vphysmap_start;
1051 if ( initrd_len != 0 )
1052 {
1053 //si->mod_start = vinitrd_start;
1054 si->mod_len = initrd_len;
1055 printk("Initrd len 0x%lx, start at 0x%08lx\n",
1056 si->mod_len, si->mod_start);
1057 }
1059 dst = si->cmd_line;
1060 if ( cmdline != NULL )
1061 {
1062 for ( i = 0; i < 255; i++ )
1063 {
1064 if ( cmdline[i] == '\0' )
1065 break;
1066 *dst++ = cmdline[i];
1067 }
1068 }
1069 *dst = '\0';
1071 zap_low_mappings(); /* Do the same for the idle page tables. */
1072 #endif
1074 /* Give up the VGA console if DOM0 is configured to grab it. */
1075 if (cmdline != NULL)
1076 console_endboot(strstr(cmdline, "tty0") != NULL);
1078 /* VMX specific construction for Dom0, if hardware supports VMX
1079 * and Dom0 is unmodified image
1080 */
1081 printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
1082 if (vmx_dom0)
1083 vmx_final_setup_guest(v);
1085 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1087 new_thread(v, pkern_entry, 0, 0);
1088 physdev_init_dom0(d);
1089 sync_split_caches();
1091 // FIXME: Hack for keyboard input
1092 //serial_input_init();
1094 return 0;
1095 }
1097 void machine_restart(char * __unused)
1098 {
1099 if (platform_is_hp_ski()) dummy();
1100 printf("machine_restart called: spinning....\n");
1101 while(1);
1102 }
1104 void machine_halt(void)
1105 {
1106 if (platform_is_hp_ski()) dummy();
1107 printf("machine_halt called: spinning....\n");
1108 while(1);
1109 }
1111 void dummy_called(char *function)
1112 {
1113 if (platform_is_hp_ski()) asm("break 0;;");
1114 printf("dummy called in %s: spinning....\n", function);
1115 while(1);
1116 }
1119 #if 0
1120 void switch_to(struct vcpu *prev, struct vcpu *next)
1121 {
1122 struct vcpu *last;
1124 __switch_to(prev,next,last);
1125 //set_current(next);
1126 }
1127 #endif
1129 void domain_pend_keyboard_interrupt(int irq)
1130 {
1131 vcpu_pend_interrupt(dom0->vcpu[0],irq);
1132 }
1134 void sync_vcpu_execstate(struct vcpu *v)
1135 {
1136 ia64_save_fpu(v->arch._thread.fph);
1137 if (VMX_DOMAIN(v))
1138 vmx_save_state(v);
1139 else {
1140 if (IA64_HAS_EXTRA_STATE(v))
1141 ia64_save_extra(v);
1142 }
1143 // FIXME SMP: Anything else needed here for SMP?
1144 }
1146 // FIXME: It would be nice to print out a nice error message for bad
1147 // values of these boot-time parameters, but it seems we are too early
1148 // in the boot and attempts to print freeze the system?
1149 #define abort(x...) do {} while(0)
1150 #define warn(x...) do {} while(0)
1152 static void parse_dom0_mem(char *s)
1153 {
1154 unsigned long bytes = parse_size_and_unit(s);
1156 if (dom0_size < 4 * 1024 * 1024) {
1157 abort("parse_dom0_mem: too small, boot aborted"
1158 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1160 if (dom0_size % dom0_align) {
1161 dom0_size = ((dom0_size / dom0_align) + 1) * dom0_align;
1162 warn("parse_dom0_mem: dom0_size rounded up from"
1163 " %lx to %lx bytes, due to dom0_align=%lx\n",
1164 bytes,dom0_size,dom0_align);
1165 }
1166 else dom0_size = bytes;
1167 }
1168 custom_param("dom0_mem", parse_dom0_mem);
1171 static void parse_dom0_align(char *s)
1172 {
1173 unsigned long bytes = parse_size_and_unit(s);
1175 if ((bytes - 1) ^ bytes) { /* not a power of two */
1176 abort("parse_dom0_align: dom0_align must be power of two, "
1177 "boot aborted"
1178 " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
1180 else if (bytes < PAGE_SIZE) {
1181 abort("parse_dom0_align: dom0_align must be >= %ld, "
1182 "boot aborted"
1183 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
1184 PAGE_SIZE);
1185 }
1186 else dom0_align = bytes;
1187 if (dom0_size % dom0_align) {
1188 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
1189 warn("parse_dom0_align: dom0_size rounded up from"
1190 " %ld to %ld bytes, due to dom0_align=%lx\n",
1191 bytes,dom0_size,dom0_align);
1192 }
1193 }
1194 custom_param("dom0_align", parse_dom0_align);