ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 10151:2cab08ac143b

[IA64] support DOMID_XEN and DOMID_IO for foreign domain page mapping

Add support for mapping foreign-domain pages owned by DOMID_XEN and DOMID_IO.
This patch is needed for xentrace and xenmon.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author awilliam@xenbuild.aw
date Wed May 24 10:39:55 2006 -0600 (2006-05-24)
parents d2f6e3d70f22
children 40959bc0a269
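The effect of the change is easiest to see in dom0vp_add_physmap() in the listing below: when find_domain_by_id() returns NULL, the hypercall now falls back to the anonymous dom_xen/dom_io domains created by alloc_dom_xen_and_dom_io(), so a privileged tool such as xentrace or xenmon can ask dom0 to map pages owned by Xen itself or by the I/O domain. The stand-alone sketch that follows only illustrates that dispatch shape; the types, constant values and helper names are simplified stand-ins for illustration, not the real Xen definitions.

#include <stdio.h>

/* Simplified stand-ins for Xen's domid_t and the special domain IDs
 * (the real definitions live in the public Xen headers). */
typedef unsigned short domid_t;
#define DOMID_IO  ((domid_t)0x7FF1)
#define DOMID_XEN ((domid_t)0x7FF2)

struct domain { domid_t domain_id; };

static struct domain dom_xen_stub = { DOMID_XEN };
static struct domain dom_io_stub  = { DOMID_IO };

/* Hypothetical helper mirroring the fallback added to dom0vp_add_physmap():
 * an ordinary domid would be resolved by find_domain_by_id(); the two
 * special IDs resolve to the anonymous owners instead of failing. */
static struct domain *resolve_page_owner(domid_t domid)
{
    switch (domid) {
    case DOMID_XEN:
        return &dom_xen_stub;
    case DOMID_IO:
        return &dom_io_stub;
    default:
        return NULL; /* stands in for find_domain_by_id(domid) */
    }
}

int main(void)
{
    struct domain *rd = resolve_page_owner(DOMID_XEN);
    printf("resolved owner: 0x%x\n", rd ? rd->domain_id : 0u);
    return 0;
}

In the real hypercall the resolved owner is then used for get_page() and assign_domain_page_replace(), and the p2m table of the caller is deliberately left untouched because the page still belongs to the special domain.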
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <xen/mm.h>
22 #include <xen/iocap.h>
23 #include <asm/ptrace.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/hw_irq.h>
29 #include <asm/setup.h>
30 //#include <asm/mpspec.h>
31 #include <xen/irq.h>
32 #include <xen/event.h>
33 //#include <xen/shadow.h>
34 #include <xen/console.h>
35 #include <xen/compile.h>
37 #include <xen/elf.h>
38 //#include <asm/page.h>
39 #include <asm/pgalloc.h>
41 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
43 #include <asm/vcpu.h> /* for function declarations */
44 #include <public/arch-ia64.h>
45 #include <asm/vmx.h>
46 #include <asm/vmx_vcpu.h>
47 #include <asm/vmx_vpd.h>
48 #include <asm/vmx_phy_mode.h>
49 #include <asm/pal.h>
50 #include <asm/vhpt.h>
51 #include <public/hvm/ioreq.h>
52 #include <public/arch-ia64.h>
53 #include <asm/tlbflush.h>
54 #include <asm/regionreg.h>
55 #include <asm/dom_fw.h>
57 #ifndef CONFIG_XEN_IA64_DOM0_VP
58 #define CONFIG_DOMAIN0_CONTIGUOUS
59 #endif
60 unsigned long dom0_start = -1L;
61 unsigned long dom0_size = 512*1024*1024;
62 unsigned long dom0_align = 64*1024*1024;
64 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
65 static unsigned int dom0_max_vcpus = 1;
66 integer_param("dom0_max_vcpus", dom0_max_vcpus);
68 // initialized by arch/ia64/setup.c:find_initrd()
69 unsigned long initrd_start = 0, initrd_end = 0;
70 extern unsigned long running_on_sim;
72 #define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))
74 /* FIXME: where these declarations should be there ? */
75 extern long platform_is_hp_ski(void);
76 extern void serial_input_init(void);
77 static void init_switch_stack(struct vcpu *v);
78 void build_physmap_table(struct domain *d);
80 static void try_to_clear_PGC_allocate(struct domain* d,
81 struct page_info* page);
83 #ifdef CONFIG_XEN_IA64_DOM0_VP
84 static struct domain *dom_xen, *dom_io;
86 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
87 void
88 alloc_dom_xen_and_dom_io(void)
89 {
90 /*
91 * Initialise our DOMID_XEN domain.
92 * Any Xen-heap pages that we will allow to be mapped will have
93 * their domain field set to dom_xen.
94 */
95 dom_xen = alloc_domain();
96 BUG_ON(dom_xen == NULL);
97 spin_lock_init(&dom_xen->page_alloc_lock);
98 INIT_LIST_HEAD(&dom_xen->page_list);
99 INIT_LIST_HEAD(&dom_xen->xenpage_list);
100 atomic_set(&dom_xen->refcnt, 1);
101 dom_xen->domain_id = DOMID_XEN;
103 /*
104 * Initialise our DOMID_IO domain.
105 * This domain owns I/O pages that are within the range of the page_info
106 * array. Mappings occur at the priv of the caller.
107 */
108 dom_io = alloc_domain();
109 BUG_ON(dom_io == NULL);
110 spin_lock_init(&dom_io->page_alloc_lock);
111 INIT_LIST_HEAD(&dom_io->page_list);
112 INIT_LIST_HEAD(&dom_io->xenpage_list);
113 atomic_set(&dom_io->refcnt, 1);
114 dom_io->domain_id = DOMID_IO;
115 }
116 #endif
118 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
119 void arch_domain_destroy(struct domain *d)
120 {
121 BUG_ON(d->arch.mm.pgd != NULL);
122 if (d->shared_info != NULL)
123 free_xenheap_page(d->shared_info);
125 domain_flush_destroy (d);
127 deallocate_rid_range(d);
128 }
130 static void default_idle(void)
131 {
132 int cpu = smp_processor_id();
133 local_irq_disable();
134 if ( !softirq_pending(cpu))
135 safe_halt();
136 local_irq_enable();
137 }
139 static void continue_cpu_idle_loop(void)
140 {
141 int cpu = smp_processor_id();
142 for ( ; ; )
143 {
144 #ifdef IA64
145 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
146 #else
147 irq_stat[cpu].idle_timestamp = jiffies;
148 #endif
149 while ( !softirq_pending(cpu) )
150 default_idle();
151 add_preempt_count(SOFTIRQ_OFFSET);
152 raise_softirq(SCHEDULE_SOFTIRQ);
153 do_softirq();
154 sub_preempt_count(SOFTIRQ_OFFSET);
155 }
156 }
158 void startup_cpu_idle_loop(void)
159 {
160 /* Just some sanity to ensure that the scheduler is set up okay. */
161 ASSERT(current->domain == IDLE_DOMAIN_ID);
162 raise_softirq(SCHEDULE_SOFTIRQ);
164 continue_cpu_idle_loop();
165 }
167 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
168 {
169 struct vcpu *v;
170 struct thread_info *ti;
172 /* Still keep idle vcpu0 static allocated at compilation, due
173 * to some code from Linux still requires it in early phase.
174 */
175 if (is_idle_domain(d) && !vcpu_id)
176 v = idle_vcpu[0];
177 else {
178 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
179 return NULL;
180 memset(v, 0, sizeof(*v));
182 ti = alloc_thread_info(v);
183 /* Clear thread_info to clear some important fields, like
184 * preempt_count
185 */
186 memset(ti, 0, sizeof(struct thread_info));
187 init_switch_stack(v);
188 }
190 if (!is_idle_domain(d)) {
191 v->arch.privregs =
192 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
193 BUG_ON(v->arch.privregs == NULL);
194 memset(v->arch.privregs, 0, PAGE_SIZE);
196 if (!vcpu_id)
197 memset(&d->shared_info->evtchn_mask[0], 0xff,
198 sizeof(d->shared_info->evtchn_mask));
200 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
201 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
202 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
203 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
205 /* Is it correct ?
206 It depends on the domain rid usage.
208 A domain may share rid among its processor (eg having a
209 global VHPT). In this case, we should also share rid
210 among vcpus and the rid range should be the same.
212 However a domain may have per cpu rid allocation. In
213 this case we don't want to share rid among vcpus, but we may
214 do it if two vcpus are on the same cpu... */
216 v->arch.starting_rid = d->arch.starting_rid;
217 v->arch.ending_rid = d->arch.ending_rid;
218 v->arch.breakimm = d->arch.breakimm;
219 }
221 return v;
222 }
224 void free_vcpu_struct(struct vcpu *v)
225 {
226 if (VMX_DOMAIN(v))
227 vmx_relinquish_vcpu_resources(v);
228 else {
229 if (v->arch.privregs != NULL)
230 free_xenheap_pages(v->arch.privregs, get_order(sizeof(mapped_regs_t)));
231 }
233 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
234 }
236 static void init_switch_stack(struct vcpu *v)
237 {
238 struct pt_regs *regs = vcpu_regs (v);
239 struct switch_stack *sw = (struct switch_stack *) regs - 1;
240 extern void ia64_ret_from_clone;
242 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
243 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
244 sw->b0 = (unsigned long) &ia64_ret_from_clone;
245 sw->ar_fpsr = FPSR_DEFAULT;
246 v->arch._thread.ksp = (unsigned long) sw - 16;
247 // stay on kernel stack because may get interrupts!
248 // ia64_ret_from_clone (which b0 gets in new_thread) switches
249 // to user stack
250 v->arch._thread.on_ustack = 0;
251 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
252 }
254 int arch_domain_create(struct domain *d)
255 {
256 // the following will eventually need to be negotiated dynamically
257 d->xen_vastart = XEN_START_ADDR;
258 d->xen_vaend = XEN_END_ADDR;
259 d->arch.shared_info_va = SHAREDINFO_ADDR;
260 d->arch.breakimm = 0x1000;
262 if (is_idle_domain(d))
263 return 0;
265 if ((d->shared_info = (void *)alloc_xenheap_page()) == NULL)
266 goto fail_nomem;
267 memset(d->shared_info, 0, PAGE_SIZE);
269 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
270 /* We may also need emulation rid for region4, though it's unlikely
271 * to see guest issue uncacheable access in metaphysical mode. But
272 * keep such info here may be more sane.
273 */
274 if (!allocate_rid_range(d,0))
275 goto fail_nomem;
276 d->arch.sys_pgnr = 0;
278 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
280 d->arch.physmap_built = 0;
281 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
282 goto fail_nomem;
284 printf ("arch_domain_create: domain=%p\n", d);
285 return 0;
287 fail_nomem:
288 if (d->arch.mm.pgd != NULL)
289 pgd_free(d->arch.mm.pgd);
290 if (d->shared_info != NULL)
291 free_xenheap_page(d->shared_info);
292 return -ENOMEM;
293 }
295 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
296 {
297 c->regs = *vcpu_regs (v);
298 c->shared = v->domain->shared_info->arch;
299 }
301 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
302 {
303 struct pt_regs *regs = vcpu_regs (v);
304 struct domain *d = v->domain;
306 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
307 return 0;
308 if (c->flags & VGCF_VMX_GUEST) {
309 if (!vmx_enabled) {
310 printk("No VMX hardware feature for vmx domain.\n");
311 return -EINVAL;
312 }
314 if (v == d->vcpu[0])
315 vmx_setup_platform(d, c);
317 vmx_final_setup_guest(v);
318 } else if (!d->arch.physmap_built)
319 build_physmap_table(d);
321 *regs = c->regs;
322 if (v == d->vcpu[0]) {
323 /* Only for first vcpu. */
324 d->arch.sys_pgnr = c->sys_pgnr;
325 d->arch.initrd_start = c->initrd.start;
326 d->arch.initrd_len = c->initrd.size;
327 d->arch.cmdline = c->cmdline;
328 d->shared_info->arch = c->shared;
330 /* Cache synchronization seems to be done by the linux kernel
331 during mmap/unmap operation. However be conservative. */
332 domain_cache_flush (d, 1);
333 }
334 new_thread(v, regs->cr_iip, 0, 0);
336 if ( c->privregs && copy_from_user(v->arch.privregs,
337 c->privregs, sizeof(mapped_regs_t))) {
338 printk("Bad ctxt address in arch_set_info_guest: %p\n",
339 c->privregs);
340 return -EFAULT;
341 }
343 v->arch.domain_itm_last = -1L;
345 /* Don't redo final setup */
346 set_bit(_VCPUF_initialised, &v->vcpu_flags);
347 return 0;
348 }
350 static void relinquish_memory(struct domain *d, struct list_head *list)
351 {
352 struct list_head *ent;
353 struct page_info *page;
354 #ifndef __ia64__
355 unsigned long x, y;
356 #endif
358 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
359 spin_lock_recursive(&d->page_alloc_lock);
360 ent = list->next;
361 while ( ent != list )
362 {
363 page = list_entry(ent, struct page_info, list);
364 /* Grab a reference to the page so it won't disappear from under us. */
365 if ( unlikely(!get_page(page, d)) )
366 {
367 /* Couldn't get a reference -- someone is freeing this page. */
368 ent = ent->next;
369 continue;
370 }
372 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
373 put_page_and_type(page);
375 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
376 put_page(page);
378 #ifndef __ia64__
379 /*
380 * Forcibly invalidate base page tables at this point to break circular
381 * 'linear page table' references. This is okay because MMU structures
382 * are not shared across domains and this domain is now dead. Thus base
383 * tables are not in use so a non-zero count means circular reference.
384 */
385 y = page->u.inuse.type_info;
386 for ( ; ; )
387 {
388 x = y;
389 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
390 (PGT_base_page_table|PGT_validated)) )
391 break;
393 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
394 if ( likely(y == x) )
395 {
396 free_page_type(page, PGT_base_page_table);
397 break;
398 }
399 }
400 #endif
402 /* Follow the list chain and /then/ potentially free the page. */
403 ent = ent->next;
404 #ifdef CONFIG_XEN_IA64_DOM0_VP
405 #if 1
406 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
407 #else
408 //XXX this should be done at traversing the P2M table.
409 if (page_get_owner(page) == d)
410 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
411 #endif
412 #endif
413 put_page(page);
414 }
416 spin_unlock_recursive(&d->page_alloc_lock);
417 }
419 static void
420 relinquish_pte(struct domain* d, pte_t* pte)
421 {
422 unsigned long mfn = pte_pfn(*pte);
423 struct page_info* page;
425 // vmx domain use bit[58:56] to distinguish io region from memory.
426 // see vmx_build_physmap_table() in vmx_init.c
427 if (((mfn << PAGE_SHIFT) & GPFN_IO_MASK) != GPFN_MEM)
428 return;
430 // domain might map IO space or acpi table pages. check it.
431 if (!mfn_valid(mfn))
432 return;
433 page = mfn_to_page(mfn);
434 // struct page_info corresponding to mfn may exist or not depending
435 // on CONFIG_VIRTUAL_FRAME_TABLE.
436 // This check is too easy.
437 // The right way is to check whether this page is of io area or acpi pages
438 if (page_get_owner(page) == NULL) {
439 BUG_ON(page->count_info != 0);
440 return;
441 }
443 #ifdef CONFIG_XEN_IA64_DOM0_VP
444 if (page_get_owner(page) == d) {
445 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
446 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
447 }
448 #endif
449 try_to_clear_PGC_allocate(d, page);
450 put_page(page);
451 }
453 static void
454 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
455 {
456 unsigned long i;
457 pte_t* pte = pte_offset_map(pmd, offset);
459 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
460 if (!pte_present(*pte))
461 continue;
463 relinquish_pte(d, pte);
464 }
465 pte_free_kernel(pte_offset_map(pmd, offset));
466 }
468 static void
469 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
470 {
471 unsigned long i;
472 pmd_t *pmd = pmd_offset(pud, offset);
474 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
475 if (!pmd_present(*pmd))
476 continue;
478 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
479 }
480 pmd_free(pmd_offset(pud, offset));
481 }
483 static void
484 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
485 {
486 unsigned long i;
487 pud_t *pud = pud_offset(pgd, offset);
489 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
490 if (!pud_present(*pud))
491 continue;
493 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
494 }
495 pud_free(pud_offset(pgd, offset));
496 }
498 static void
499 relinquish_mm(struct domain* d)
500 {
501 struct mm_struct* mm = &d->arch.mm;
502 unsigned long i;
503 pgd_t* pgd;
505 if (mm->pgd == NULL)
506 return;
508 pgd = pgd_offset(mm, 0);
509 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
510 if (!pgd_present(*pgd))
511 continue;
513 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
514 }
515 pgd_free(mm->pgd);
516 mm->pgd = NULL;
517 }
519 void domain_relinquish_resources(struct domain *d)
520 {
521 /* Relinquish every page of memory. */
523 // release pages by traversing d->arch.mm.
524 relinquish_mm(d);
526 relinquish_memory(d, &d->xenpage_list);
527 relinquish_memory(d, &d->page_list);
528 }
530 // heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
531 // and linux/arch/ia64/kernel/process.c:kernel_thread()
532 void new_thread(struct vcpu *v,
533 unsigned long start_pc,
534 unsigned long start_stack,
535 unsigned long start_info)
536 {
537 struct domain *d = v->domain;
538 struct pt_regs *regs;
539 extern char dom0_command_line[];
541 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
542 if (d == dom0 && v->vcpu_id == 0) start_pc += dom0_start;
543 #endif
545 regs = vcpu_regs (v);
546 if (VMX_DOMAIN(v)) {
547 /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
548 regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
549 } else {
550 regs->cr_ipsr = ia64_getreg(_IA64_REG_PSR)
551 | IA64_PSR_BITS_TO_SET | IA64_PSR_BN;
552 regs->cr_ipsr &= ~(IA64_PSR_BITS_TO_CLEAR
553 | IA64_PSR_RI | IA64_PSR_IS);
554 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
555 }
556 regs->cr_iip = start_pc;
557 regs->cr_ifs = 1UL << 63; /* or clear? */
558 regs->ar_fpsr = FPSR_DEFAULT;
560 if (VMX_DOMAIN(v)) {
561 vmx_init_all_rr(v);
562 if (d == dom0)
563 regs->r28 = dom_fw_setup(d,dom0_command_line,
564 COMMAND_LINE_SIZE);
565 /* Virtual processor context setup */
566 VCPU(v, vpsr) = IA64_PSR_BN;
567 VCPU(v, dcr) = 0;
568 } else {
569 init_all_rr(v);
570 if (v->vcpu_id == 0) {
571 /* Build the firmware. */
572 if (d == dom0)
573 regs->r28 = dom_fw_setup(d,dom0_command_line,
574 COMMAND_LINE_SIZE);
575 else {
576 const char *cmdline = d->arch.cmdline;
577 int len;
579 if (*cmdline == 0) {
580 #define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
581 cmdline = DEFAULT_CMDLINE;
582 len = sizeof (DEFAULT_CMDLINE);
583 printf("domU command line defaulted to"
584 DEFAULT_CMDLINE "\n");
585 }
586 else
587 len = IA64_COMMAND_LINE_SIZE;
589 regs->r28 = dom_fw_setup (d, cmdline, len);
590 }
591 d->shared_info->arch.flags = (d == dom0) ?
592 (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
593 }
594 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
595 VCPU(v, banknum) = 1;
596 VCPU(v, metaphysical_mode) = 1;
597 VCPU(v, interrupt_mask_addr) =
598 (uint64_t)SHAREDINFO_ADDR + INT_ENABLE_OFFSET(v);
599 VCPU(v, itv) = (1 << 16); /* timer vector masked */
600 }
601 }
603 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
604 void
605 share_xen_page_with_guest(struct page_info *page,
606 struct domain *d, int readonly)
607 {
608 if ( page_get_owner(page) == d )
609 return;
611 #if 1
612 if (readonly) {
613 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
614 }
615 #endif
617 // alloc_xenheap_pages() doesn't initialize page owner.
618 //BUG_ON(page_get_owner(page) != NULL);
619 #if 0
620 if (get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY) {
621 printk("%s:%d page 0x%p mfn 0x%lx gpfn 0x%lx\n", __func__, __LINE__,
622 page, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)));
623 }
624 #endif
625 // grant_table_destroy() release these pages.
626 // but it doesn't clear m2p entry. So there might remain stale entry.
627 // We clear such a stale entry here.
628 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
630 spin_lock(&d->page_alloc_lock);
632 #ifndef __ia64__
633 /* The incremented type count pins as writable or read-only. */
634 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
635 page->u.inuse.type_info |= PGT_validated | 1;
636 #endif
638 page_set_owner(page, d);
639 wmb(); /* install valid domain ptr before updating refcnt. */
640 ASSERT(page->count_info == 0);
641 page->count_info |= PGC_allocated | 1;
643 if ( unlikely(d->xenheap_pages++ == 0) )
644 get_knownalive_domain(d);
645 list_add_tail(&page->list, &d->xenpage_list);
647 spin_unlock(&d->page_alloc_lock);
648 }
650 void
651 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
652 {
653 share_xen_page_with_guest(page, dom_xen, readonly);
654 }
656 //XXX !xxx_present() should be used instead of !xxx_none()?
657 static pte_t*
658 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
659 {
660 struct mm_struct *mm = &d->arch.mm;
661 pgd_t *pgd;
662 pud_t *pud;
663 pmd_t *pmd;
665 BUG_ON(mm->pgd == NULL);
666 pgd = pgd_offset(mm, mpaddr);
667 if (pgd_none(*pgd)) {
668 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
669 }
671 pud = pud_offset(pgd, mpaddr);
672 if (pud_none(*pud)) {
673 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
674 }
676 pmd = pmd_offset(pud, mpaddr);
677 if (pmd_none(*pmd)) {
678 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
679 }
681 return pte_offset_map(pmd, mpaddr);
682 }
684 //XXX xxx_none() should be used instead of !xxx_present()?
685 static pte_t*
686 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
687 {
688 struct mm_struct *mm = &d->arch.mm;
689 pgd_t *pgd;
690 pud_t *pud;
691 pmd_t *pmd;
693 BUG_ON(mm->pgd == NULL);
694 pgd = pgd_offset(mm, mpaddr);
695 if (!pgd_present(*pgd))
696 goto not_present;
698 pud = pud_offset(pgd, mpaddr);
699 if (!pud_present(*pud))
700 goto not_present;
702 pmd = pmd_offset(pud, mpaddr);
703 if (!pmd_present(*pmd))
704 goto not_present;
706 return pte_offset_map(pmd, mpaddr);
708 not_present:
709 return NULL;
710 }
712 #ifdef CONFIG_XEN_IA64_DOM0_VP
713 static pte_t*
714 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
715 {
716 struct mm_struct *mm = &d->arch.mm;
717 pgd_t *pgd;
718 pud_t *pud;
719 pmd_t *pmd;
721 BUG_ON(mm->pgd == NULL);
722 pgd = pgd_offset(mm, mpaddr);
723 if (pgd_none(*pgd))
724 goto not_present;
726 pud = pud_offset(pgd, mpaddr);
727 if (pud_none(*pud))
728 goto not_present;
730 pmd = pmd_offset(pud, mpaddr);
731 if (pmd_none(*pmd))
732 goto not_present;
734 return pte_offset_map(pmd, mpaddr);
736 not_present:
737 return NULL;
738 }
739 #endif
741 /* Allocate a new page for domain and map it to the specified metaphysical
742 address. */
743 struct page_info *
744 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
745 {
746 struct page_info *p = NULL;
747 unsigned long maddr;
748 int ret;
750 BUG_ON(!pte_none(*pte));
752 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
753 if (d == dom0) {
754 #if 0
755 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
756 /* FIXME: is it true ?
757 dom0 memory is not contiguous! */
758 panic("assign_new_domain_page: bad domain0 "
759 "mpaddr=%lx, start=%lx, end=%lx!\n",
760 mpaddr, dom0_start, dom0_start+dom0_size);
761 }
762 #endif
763 p = mfn_to_page((mpaddr >> PAGE_SHIFT));
764 return p;
765 }
766 #endif
768 p = alloc_domheap_page(d);
769 if (unlikely(!p)) {
770 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
771 return(p);
772 }
774 // zero out pages for security reasons
775 clear_page(page_to_virt(p));
776 maddr = page_to_maddr (p);
777 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
778 && maddr < __get_cpu_var(vhpt_pend))) {
779 /* FIXME: how can this happen ?
780 vhpt is allocated by alloc_domheap_page. */
781 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
782 maddr);
783 }
785 ret = get_page(p, d);
786 BUG_ON(ret == 0);
787 set_pte(pte, pfn_pte(maddr >> PAGE_SHIFT,
788 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
790 mb ();
791 //XXX CONFIG_XEN_IA64_DOM0_VP
792 // TODO racy
793 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
794 return p;
795 }
797 struct page_info *
798 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
799 {
800 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
801 pte_t dummy_pte = __pte(0);
802 return __assign_new_domain_page(d, mpaddr, &dummy_pte);
803 #else
804 struct page_info *p = NULL;
805 pte_t *pte;
807 pte = lookup_alloc_domain_pte(d, mpaddr);
808 if (pte_none(*pte)) {
809 p = __assign_new_domain_page(d, mpaddr, pte);
810 } else {
811 DPRINTK("%s: d 0x%p mpaddr %lx already mapped!\n",
812 __func__, d, mpaddr);
813 }
815 return p;
816 #endif
817 }
819 void
820 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
821 {
822 #ifndef CONFIG_DOMAIN0_CONTIGUOUS
823 pte_t *pte;
825 BUG_ON(d != dom0);
826 pte = lookup_alloc_domain_pte(d, mpaddr);
827 if (pte_none(*pte)) {
828 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
829 if (p == NULL) {
830 panic("%s: can't allocate page for dom0", __func__);
831 }
832 }
833 #endif
834 }
836 /* map a physical address to the specified metaphysical addr */
837 void
838 __assign_domain_page(struct domain *d,
839 unsigned long mpaddr, unsigned long physaddr)
840 {
841 pte_t *pte;
843 pte = lookup_alloc_domain_pte(d, mpaddr);
844 if (pte_none(*pte)) {
845 set_pte(pte,
846 pfn_pte(physaddr >> PAGE_SHIFT,
847 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
848 mb ();
849 } else
850 printk("%s: mpaddr %lx already mapped!\n", __func__, mpaddr);
851 }
853 /* get_page() and map a physical address to the specified metaphysical addr */
854 void
855 assign_domain_page(struct domain *d,
856 unsigned long mpaddr, unsigned long physaddr)
857 {
858 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
859 int ret;
861 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
862 ret = get_page(page, d);
863 BUG_ON(ret == 0);
864 __assign_domain_page(d, mpaddr, physaddr);
866 //XXX CONFIG_XEN_IA64_DOM0_VP
867 // TODO racy
868 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
869 }
871 #ifdef CONFIG_XEN_IA64_DOM0_VP
872 static void
873 assign_domain_same_page(struct domain *d,
874 unsigned long mpaddr, unsigned long size)
875 {
876 //XXX optimization
877 unsigned long end = mpaddr + size;
878 for (; mpaddr < end; mpaddr += PAGE_SIZE) {
879 __assign_domain_page(d, mpaddr, mpaddr);
880 }
881 }
883 static int
884 efi_mmio(unsigned long physaddr, unsigned long size)
885 {
886 void *efi_map_start, *efi_map_end;
887 u64 efi_desc_size;
888 void* p;
890 efi_map_start = __va(ia64_boot_param->efi_memmap);
891 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
892 efi_desc_size = ia64_boot_param->efi_memdesc_size;
894 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
895 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
896 unsigned long start = md->phys_addr;
897 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
899 if (start <= physaddr && physaddr < end) {
900 if ((physaddr + size) > end) {
901 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
902 __func__, __LINE__, physaddr, size);
903 return 0;
904 }
906 // for io space
907 if (md->type == EFI_MEMORY_MAPPED_IO ||
908 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
909 return 1;
910 }
912 // for runtime
913 // see efi_enter_virtual_mode(void)
914 // in linux/arch/ia64/kernel/efi.c
915 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
916 !(md->attribute & EFI_MEMORY_WB)) {
917 return 1;
918 }
920 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
921 __func__, __LINE__, physaddr, size);
922 return 0;
923 }
925 if (physaddr < start) {
926 break;
927 }
928 }
930 return 1;
931 }
933 unsigned long
934 assign_domain_mmio_page(struct domain *d,
935 unsigned long mpaddr, unsigned long size)
936 {
937 if (size == 0) {
938 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
939 __func__, d, mpaddr, size);
940 }
941 if (!efi_mmio(mpaddr, size)) {
942 DPRINTK("%s:%d domain %p mpaddr 0x%lx size = 0x%lx\n",
943 __func__, __LINE__, d, mpaddr, size);
944 return -EINVAL;
945 }
946 assign_domain_same_page(d, mpaddr, size);
947 return mpaddr;
948 }
950 unsigned long
951 assign_domain_mach_page(struct domain *d,
952 unsigned long mpaddr, unsigned long size)
953 {
954 assign_domain_same_page(d, mpaddr, size);
955 return mpaddr;
956 }
958 //XXX sledge hammer.
959 // flush finer range.
960 void
961 domain_page_flush(struct domain* d, unsigned long mpaddr,
962 unsigned long old_mfn, unsigned long new_mfn)
963 {
964 domain_flush_vtlb_all();
965 }
966 #endif
968 //XXX heavily depends on the struct page_info layout.
969 //
970 // if (page_get_owner(page) == d &&
971 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
972 // put_page(page);
973 // }
974 static void
975 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
976 {
977 u32 _d, _nd;
978 u64 x, nx, y;
980 _d = pickle_domptr(d);
981 y = *((u64*)&page->count_info);
982 do {
983 x = y;
984 _nd = x >> 32;
985 nx = x - 1;
986 __clear_bit(_PGC_allocated, &nx);
988 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
989 struct domain* nd = unpickle_domptr(_nd);
990 if (nd == NULL) {
991 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
992 "sd=%p 0x%x,"
993 " caf=%016lx, taf=%" PRtype_info "\n",
994 (void *) page_to_mfn(page),
995 d, d->domain_id, _d,
996 nd, _nd,
997 x,
998 page->u.inuse.type_info);
999 }
1000 break;
1001 }
1003 BUG_ON((nx & PGC_count_mask) < 1);
1004 y = cmpxchg((u64*)&page->count_info, x, nx);
1005 } while (unlikely(y != x));
1006 }
1008 #ifdef CONFIG_XEN_IA64_DOM0_VP
1009 static void
1010 zap_domain_page_one(struct domain *d, unsigned long mpaddr, int do_put_page)
1011 {
1012 struct mm_struct *mm = &d->arch.mm;
1013 pte_t *pte;
1014 pte_t old_pte;
1015 unsigned long mfn;
1016 struct page_info *page;
1018 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1019 if (pte == NULL)
1020 return;
1021 if (pte_none(*pte))
1022 return;
1024 // update pte
1025 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1026 mfn = pte_pfn(old_pte);
1027 page = mfn_to_page(mfn);
1028 BUG_ON((page->count_info & PGC_count_mask) == 0);
1030 if (page_get_owner(page) == d) {
1031 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1032 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1033 }
1035 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
1037 if (do_put_page) {
1038 try_to_clear_PGC_allocate(d, page);
1039 put_page(page);
1040 }
1041 }
1042 #endif
1044 void build_physmap_table(struct domain *d)
1045 {
1046 struct list_head *list_ent = d->page_list.next;
1047 unsigned long mfn, i = 0;
1049 ASSERT(!d->arch.physmap_built);
1050 while(list_ent != &d->page_list) {
1051 mfn = page_to_mfn(list_entry(
1052 list_ent, struct page_info, list));
1053 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
1055 i++;
1056 list_ent = mfn_to_page(mfn)->list.next;
1057 }
1058 d->arch.physmap_built = 1;
1059 }
1061 void mpafoo(unsigned long mpaddr)
1062 {
1063 extern unsigned long privop_trace;
1064 if (mpaddr == 0x3800)
1065 privop_trace = 1;
1066 }
1068 #ifdef CONFIG_XEN_IA64_DOM0_VP
1069 unsigned long
1070 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
1071 {
1072 pte_t *pte;
1074 pte = lookup_noalloc_domain_pte(d, mpaddr);
1075 if (pte == NULL)
1076 goto not_present;
1078 if (pte_present(*pte))
1079 return (pte->pte & _PFN_MASK);
1080 else if (VMX_DOMAIN(d->vcpu[0]))
1081 return GPFN_INV_MASK;
1083 not_present:
1084 return INVALID_MFN;
1085 }
1087 unsigned long
1088 __lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
1089 {
1090 unsigned long machine = ____lookup_domain_mpa(d, mpaddr);
1091 if (machine != INVALID_MFN)
1092 return machine;
1094 printk("%s: d 0x%p id %d current 0x%p id %d\n",
1095 __func__, d, d->domain_id, current, current->vcpu_id);
1096 printk("%s: bad mpa 0x%lx (max_pages 0x%lx)\n",
1097 __func__, mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
1098 return INVALID_MFN;
1099 }
1100 #endif
1102 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
1103 {
1104 pte_t *pte;
1106 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1107 if (d == dom0) {
1108 pte_t pteval;
1109 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
1110 //printk("lookup_domain_mpa: bad dom0 mpaddr 0x%lx!\n",mpaddr);
1111 //printk("lookup_domain_mpa: start=0x%lx,end=0x%lx!\n",dom0_start,dom0_start+dom0_size);
1112 mpafoo(mpaddr);
1113 }
1114 pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
1115 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
1116 pte = &pteval;
1117 return *(unsigned long *)pte;
1118 }
1119 #endif
1120 pte = lookup_noalloc_domain_pte(d, mpaddr);
1121 if (pte != NULL) {
1122 if (pte_present(*pte)) {
1123 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
1124 return *(unsigned long *)pte;
1125 } else if (VMX_DOMAIN(d->vcpu[0]))
1126 return GPFN_INV_MASK;
1127 }
1129 printk("%s: d 0x%p id %d current 0x%p id %d\n",
1130 __func__, d, d->domain_id, current, current->vcpu_id);
1131 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
1132 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
1133 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
1134 else
1135 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
1136 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
1137 mpafoo(mpaddr);
1138 return 0;
1139 }
1141 #ifdef CONFIG_XEN_IA64_DOM0_VP
1142 //XXX SMP
1143 unsigned long
1144 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1145 unsigned int extent_order)
1146 {
1147 unsigned long ret = 0;
1148 if (extent_order != 0) {
1149 //XXX
1150 ret = -ENOSYS;
1151 goto out;
1152 }
1154 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1);
1156 out:
1157 return ret;
1158 }
1160 // caller must get_page(mfn_to_page(mfn)) before
1161 // caller must call set_gpfn_from_mfn().
1162 static void
1163 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1164 unsigned long mfn, unsigned int flags)
1165 {
1166 struct mm_struct *mm = &d->arch.mm;
1167 pte_t* pte;
1168 pte_t old_pte;
1169 pte_t npte;
1171 pte = lookup_alloc_domain_pte(d, mpaddr);
1173 // update pte
1174 npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
1175 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1176 if (!pte_none(old_pte)) {
1177 unsigned long old_mfn;
1178 struct page_info* old_page;
1180 // XXX should previous underlying page be removed?
1181 // or should error be returned because it is a due to a domain?
1182 old_mfn = pte_pfn(old_pte);//XXX
1183 old_page = mfn_to_page(old_mfn);
1185 if (page_get_owner(old_page) == d) {
1186 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1187 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1188 }
1190 domain_page_flush(d, mpaddr, old_mfn, mfn);
1192 try_to_clear_PGC_allocate(d, old_page);
1193 put_page(old_page);
1194 } else {
1195 BUG_ON(!mfn_valid(mfn));
1196 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1197 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1198 }
1199 }
1201 unsigned long
1202 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1203 unsigned int flags, domid_t domid)
1204 {
1205 int error = 0;
1207 struct domain* rd;
1208 rd = find_domain_by_id(domid);
1209 if (unlikely(rd == NULL)) {
1210 switch (domid) {
1211 case DOMID_XEN:
1212 rd = dom_xen;
1213 break;
1214 case DOMID_IO:
1215 rd = dom_io;
1216 break;
1217 default:
1218 DPRINTK("d 0x%p domid %d "
1219 "pgfn 0x%lx mfn 0x%lx flags 0x%lx domid %d\n",
1220 d, d->domain_id, gpfn, mfn, flags, domid);
1221 error = -ESRCH;
1222 goto out0;
1223 }
1224 BUG_ON(rd == NULL);
1225 get_knownalive_domain(rd);
1226 }
1228 if (unlikely(rd == d)) {
1229 error = -EINVAL;
1230 goto out1;
1231 }
1232 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1233 error = -EINVAL;
1234 goto out1;
1235 }
1237 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, 0/* flags:XXX */);
1238 //don't update p2m table because this page belongs to rd, not d.
1239 out1:
1240 put_domain(rd);
1241 out0:
1242 return error;
1243 }
1245 // grant table host mapping
1246 // mpaddr: host_addr: pseudo physical address
1247 // mfn: frame: machine page frame
1248 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1249 int
1250 create_grant_host_mapping(unsigned long gpaddr,
1251 unsigned long mfn, unsigned int flags)
1252 {
1253 struct domain* d = current->domain;
1254 struct page_info* page;
1255 int ret;
1257 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1258 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1259 return GNTST_general_error;
1260 }
1261 if (flags & GNTMAP_readonly) {
1262 #if 0
1263 DPRINTK("%s: GNTMAP_readonly is not implemented yet. flags %x\n",
1264 __func__, flags);
1265 #endif
1266 flags &= ~GNTMAP_readonly;
1267 }
1269 page = mfn_to_page(mfn);
1270 ret = get_page(page, page_get_owner(page));
1271 BUG_ON(ret == 0);
1272 assign_domain_page_replace(d, gpaddr, mfn, flags);
1274 return GNTST_okay;
1275 }
1277 // grant table host unmapping
1278 int
1279 destroy_grant_host_mapping(unsigned long gpaddr,
1280 unsigned long mfn, unsigned int flags)
1281 {
1282 struct domain* d = current->domain;
1283 pte_t* pte;
1284 pte_t old_pte;
1285 unsigned long old_mfn = INVALID_MFN;
1286 struct page_info* old_page;
1288 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1289 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1290 return GNTST_general_error;
1291 }
1292 if (flags & GNTMAP_readonly) {
1293 #if 0
1294 DPRINTK("%s: GNTMAP_readonly is not implemented yet. flags %x\n",
1295 __func__, flags);
1296 #endif
1297 flags &= ~GNTMAP_readonly;
1298 }
1300 pte = lookup_noalloc_domain_pte(d, gpaddr);
1301 if (pte == NULL || !pte_present(*pte) || pte_pfn(*pte) != mfn)
1302 return GNTST_general_error;//XXX GNTST_bad_pseudo_phys_addr
1304 // update pte
1305 old_pte = ptep_get_and_clear(&d->arch.mm, gpaddr, pte);
1306 if (pte_present(old_pte)) {
1307 old_mfn = pte_pfn(old_pte);//XXX
1308 }
1309 domain_page_flush(d, gpaddr, old_mfn, INVALID_MFN);
1311 old_page = mfn_to_page(old_mfn);
1312 BUG_ON(page_get_owner(old_page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1313 put_page(old_page);
1315 return GNTST_okay;
1316 }
1318 //XXX needs refcount patch
1319 //XXX heavily depends on the struct page layout.
1320 //XXX SMP
1321 int
1322 steal_page_for_grant_transfer(struct domain *d, struct page_info *page)
1323 {
1324 #if 0 /* if big endian */
1325 # error "implement big endian version of steal_page_for_grant_transfer()"
1326 #endif
1327 u32 _d, _nd;
1328 u64 x, nx, y;
1329 unsigned long mpaddr = get_gpfn_from_mfn(page_to_mfn(page)) << PAGE_SHIFT;
1330 struct page_info *new;
1332 zap_domain_page_one(d, mpaddr, 0);
1333 put_page(page);
1335 spin_lock(&d->page_alloc_lock);
1337 /*
1338 * The tricky bit: atomically release ownership while there is just one
1339 * benign reference to the page (PGC_allocated). If that reference
1340 * disappears then the deallocation routine will safely spin.
1341 */
1342 _d = pickle_domptr(d);
1343 y = *((u64*)&page->count_info);
1344 do {
1345 x = y;
1346 nx = x & 0xffffffff;
1347 // page->count_info: untouched
1348 // page->u.inused._domain = 0;
1349 _nd = x >> 32;
1351 if (unlikely((x & (PGC_count_mask | PGC_allocated)) !=
1352 (1 | PGC_allocated)) ||
1353 unlikely(_nd != _d)) {
1354 struct domain* nd = unpickle_domptr(_nd);
1355 if (nd == NULL) {
1356 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1357 "sd=%p 0x%x,"
1358 " caf=%016lx, taf=%" PRtype_info "\n",
1359 (void *) page_to_mfn(page),
1360 d, d->domain_id, _d,
1361 nd, _nd,
1362 x,
1363 page->u.inuse.type_info);
1364 } else {
1365 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1366 "sd=%p(%u) 0x%x,"
1367 " caf=%016lx, taf=%" PRtype_info "\n",
1368 (void *) page_to_mfn(page),
1369 d, d->domain_id, _d,
1370 nd, nd->domain_id, _nd,
1371 x,
1372 page->u.inuse.type_info);
1373 }
1374 spin_unlock(&d->page_alloc_lock);
1375 return -1;
1376 }
1378 y = cmpxchg((u64*)&page->count_info, x, nx);
1379 } while (unlikely(y != x));
1381 /*
1382 * Unlink from 'd'. At least one reference remains (now anonymous), so
1383 * noone else is spinning to try to delete this page from 'd'.
1384 */
1385 d->tot_pages--;
1386 list_del(&page->list);
1388 spin_unlock(&d->page_alloc_lock);
1390 #if 1
1391 //XXX Until net_rx_action() fix
1392 // assign new page for this mpaddr
1393 new = assign_new_domain_page(d, mpaddr);
1394 BUG_ON(new == NULL);//XXX
1395 #endif
1397 return 0;
1398 }
1400 void
1401 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1402 unsigned long mfn)
1403 {
1404 int ret;
1406 ret = get_page(mfn_to_page(mfn), d);
1407 BUG_ON(ret == 0);
1408 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, 0/* XXX */);
1409 set_gpfn_from_mfn(mfn, gpfn);//XXX SMP
1411 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1412 }
1414 void
1415 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1416 unsigned long mfn)
1417 {
1418 BUG_ON(mfn == 0);//XXX
1419 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1);
1420 }
1421 #endif
1423 /* Flush cache of domain d. */
1424 void domain_cache_flush (struct domain *d, int sync_only)
1425 {
1426 struct mm_struct *mm = &d->arch.mm;
1427 pgd_t *pgd = mm->pgd;
1428 unsigned long maddr;
1429 int i,j,k, l;
1430 int nbr_page = 0;
1431 void (*flush_func)(unsigned long start, unsigned long end);
1432 extern void flush_dcache_range (unsigned long, unsigned long);
1434 if (sync_only)
1435 flush_func = &flush_icache_range;
1436 else
1437 flush_func = &flush_dcache_range;
1439 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1440 if (d == dom0) {
1441 /* This is not fully correct (because of hole), but it should
1442 be enough for now. */
1443 (*flush_func)(__va_ul (dom0_start),
1444 __va_ul (dom0_start + dom0_size));
1445 return;
1446 }
1447 #endif
1448 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1449 pud_t *pud;
1450 if (!pgd_present(*pgd))
1451 continue;
1452 pud = pud_offset(pgd, 0);
1453 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1454 pmd_t *pmd;
1455 if (!pud_present(*pud))
1456 continue;
1457 pmd = pmd_offset(pud, 0);
1458 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1459 pte_t *pte;
1460 if (!pmd_present(*pmd))
1461 continue;
1462 pte = pte_offset_map(pmd, 0);
1463 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1464 if (!pte_present(*pte))
1465 continue;
1466 /* Convert PTE to maddr. */
1467 maddr = __va_ul (pte_val(*pte)
1468 & _PAGE_PPN_MASK);
1469 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1470 nbr_page++;
1471 }
1472 }
1473 }
1474 }
1475 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1476 }
1478 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
1479 #if 1
1480 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
1481 {
1482 unsigned long pte = lookup_domain_mpa(d,mpaddr);
1483 unsigned long imva;
1485 pte &= _PAGE_PPN_MASK;
1486 imva = (unsigned long) __va(pte);
1487 imva |= mpaddr & ~PAGE_MASK;
1488 return(imva);
1489 }
1490 #else
1491 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
1492 {
1493 unsigned long imva = __gpa_to_mpa(d, mpaddr);
1495 return __va(imva);
1496 }
1497 #endif
1499 // remove following line if not privifying in memory
1500 //#define HAVE_PRIVIFY_MEMORY
1501 #ifndef HAVE_PRIVIFY_MEMORY
1502 #define privify_memory(x,y) do {} while(0)
1503 #endif
1505 // see arch/x86/xxx/domain_build.c
1506 int elf_sanity_check(Elf_Ehdr *ehdr)
1507 {
1508 if (!(IS_ELF(*ehdr)))
1509 {
1510 printk("DOM0 image is not a Xen-compatible Elf image.\n");
1511 return 0;
1512 }
1513 return 1;
1514 }
1516 static void copy_memory(void *dst, void *src, int size)
1517 {
1518 int remain;
1520 if (IS_XEN_ADDRESS(dom0,(unsigned long) src)) {
1521 memcpy(dst,src,size);
1522 }
1523 else {
1524 printf("About to call __copy_from_user(%p,%p,%d)\n",
1525 dst,src,size);
1526 while ((remain = __copy_from_user(dst,src,size)) != 0) {
1527 printf("incomplete user copy, %d remain of %d\n",
1528 remain,size);
1529 dst += size - remain; src += size - remain;
1530 size -= remain;
1531 }
1532 }
1533 }
1535 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
1536 {
1537 char *elfbase = (char *) image_start;
1538 //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
1539 Elf_Ehdr ehdr;
1540 Elf_Phdr phdr;
1541 int h, filesz, memsz;
1542 unsigned long elfaddr, dom_mpaddr, dom_imva;
1543 struct page_info *p;
1545 copy_memory(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
1546 for ( h = 0; h < ehdr.e_phnum; h++ ) {
1547 copy_memory(&phdr,
1548 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
1549 sizeof(Elf_Phdr));
1550 if ((phdr.p_type != PT_LOAD))
1551 continue;
1553 filesz = phdr.p_filesz;
1554 memsz = phdr.p_memsz;
1555 elfaddr = (unsigned long) elfbase + phdr.p_offset;
1556 dom_mpaddr = phdr.p_paddr;
1558 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
1559 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1560 if (d == dom0) {
1561 if (dom_mpaddr+memsz>dom0_size)
1562 panic("Dom0 doesn't fit in memory space!\n");
1563 dom_imva = __va_ul(dom_mpaddr + dom0_start);
1564 copy_memory((void *)dom_imva, (void *)elfaddr, filesz);
1565 if (memsz > filesz)
1566 memset((void *)dom_imva+filesz, 0,
1567 memsz-filesz);
1568 //FIXME: This test for code seems to find a lot more than objdump -x does
1569 if (phdr.p_flags & PF_X) {
1570 privify_memory(dom_imva,filesz);
1571 flush_icache_range (dom_imva, dom_imva+filesz);
1572 }
1573 }
1574 else
1575 #endif
1576 while (memsz > 0) {
1577 p = assign_new_domain_page(d,dom_mpaddr);
1578 BUG_ON (unlikely(p == NULL));
1579 dom_imva = __va_ul(page_to_maddr(p));
1580 if (filesz > 0) {
1581 if (filesz >= PAGE_SIZE)
1582 copy_memory((void *) dom_imva,
1583 (void *) elfaddr,
1584 PAGE_SIZE);
1585 else {
1586 // copy partial page
1587 copy_memory((void *) dom_imva,
1588 (void *) elfaddr, filesz);
1589 // zero the rest of page
1590 memset((void *) dom_imva+filesz, 0,
1591 PAGE_SIZE-filesz);
1592 }
1593 //FIXME: This test for code seems to find a lot more than objdump -x does
1594 if (phdr.p_flags & PF_X) {
1595 privify_memory(dom_imva,PAGE_SIZE);
1596 flush_icache_range(dom_imva,
1597 dom_imva+PAGE_SIZE);
1598 }
1599 }
1600 else if (memsz > 0) {
1601 /* always zero out entire page */
1602 memset((void *) dom_imva, 0, PAGE_SIZE);
1603 }
1604 memsz -= PAGE_SIZE;
1605 filesz -= PAGE_SIZE;
1606 elfaddr += PAGE_SIZE;
1607 dom_mpaddr += PAGE_SIZE;
1608 }
1609 }
1610 }
1612 void alloc_dom0(void)
1613 {
1614 if (platform_is_hp_ski()) {
1615 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1616 }
1617 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1618 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
1620 /* FIXME: The first trunk (say 256M) should always be assigned to
1621 * Dom0, since Dom0's physical == machine address for DMA purpose.
1622 * Some old version linux, like 2.4, assumes physical memory existing
1623 * in 2nd 64M space.
1624 */
1625 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
1626 dom0_start <<= PAGE_SHIFT;
1627 if (!dom0_start) {
1628 panic("alloc_dom0: can't allocate contiguous memory size=%lu\n",
1629 dom0_size);
1630 }
1631 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
1632 #else
1633 // no need to allocate pages for now
1634 // pages are allocated by map_new_domain_page() via loaddomainelfimage()
1635 dom0_start = 0;
1636 #endif
1637 }
1641 /*
1642 * Domain 0 has direct access to all devices absolutely. However
1643 * the major point of this stub here, is to allow alloc_dom_mem
1644 * handled with order > 0 request. Dom0 requires that bit set to
1645 * allocate memory for other domains.
1646 */
1647 static void physdev_init_dom0(struct domain *d)
1648 {
1649 if (iomem_permit_access(d, 0UL, ~0UL))
1650 BUG();
1651 if (irqs_permit_access(d, 0, NR_IRQS-1))
1652 BUG();
1653 }
1655 static unsigned int vmx_dom0 = 0;
1656 int construct_dom0(struct domain *d,
1657 unsigned long image_start, unsigned long image_len,
1658 unsigned long initrd_start, unsigned long initrd_len,
1659 char *cmdline)
1660 {
1661 int i, rc;
1662 unsigned long alloc_start, alloc_end;
1663 start_info_t *si;
1664 struct vcpu *v = d->vcpu[0];
1665 unsigned long max_pages;
1667 struct domain_setup_info dsi;
1668 unsigned long p_start;
1669 unsigned long pkern_start;
1670 unsigned long pkern_entry;
1671 unsigned long pkern_end;
1672 unsigned long pinitrd_start = 0;
1673 unsigned long pstart_info;
1674 struct page_info *start_info_page;
1676 #ifdef VALIDATE_VT
1677 unsigned long mfn;
1678 struct page_info *page = NULL;
1679 #endif
1681 //printf("construct_dom0: starting\n");
1683 /* Sanity! */
1684 BUG_ON(d != dom0);
1685 BUG_ON(d->vcpu[0] == NULL);
1686 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
1688 memset(&dsi, 0, sizeof(struct domain_setup_info));
1690 printk("*** LOADING DOMAIN 0 ***\n");
1692 alloc_start = dom0_start;
1693 alloc_end = dom0_start + dom0_size;
1694 max_pages = dom0_size / PAGE_SIZE;
1695 d->max_pages = max_pages;
1696 #ifndef CONFIG_XEN_IA64_DOM0_VP
1697 d->tot_pages = d->max_pages;
1698 #else
1699 d->tot_pages = 0;
1700 #endif
1701 dsi.image_addr = (unsigned long)image_start;
1702 dsi.image_len = image_len;
1703 rc = parseelfimage(&dsi);
1704 if ( rc != 0 )
1705 return rc;
1707 #ifdef VALIDATE_VT
1708 /* Temp workaround */
1709 if (running_on_sim)
1710 dsi.xen_section_string = (char *)1;
1712 /* Check whether dom0 is vti domain */
1713 if ((!vmx_enabled) && !dsi.xen_section_string) {
1714 printk("Lack of hardware support for unmodified vmx dom0\n");
1715 panic("");
1716 }
1718 if (vmx_enabled && !dsi.xen_section_string) {
1719 printk("Dom0 is vmx domain!\n");
1720 vmx_dom0 = 1;
1721 }
1722 #endif
1724 p_start = dsi.v_start;
1725 pkern_start = dsi.v_kernstart;
1726 pkern_end = dsi.v_kernend;
1727 pkern_entry = dsi.v_kernentry;
1729 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1731 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1733 printk("Initial guest OS must load to a page boundary.\n");
1734 return -EINVAL;
1735 }
1737 pstart_info = PAGE_ALIGN(pkern_end);
1738 if(initrd_start && initrd_len){
1739 unsigned long offset;
1741 pinitrd_start= (dom0_start + dom0_size) -
1742 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
1743 if (pinitrd_start <= pstart_info)
1744 panic("%s:enough memory is not assigned to dom0", __func__);
1746 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1747 struct page_info *p;
1748 p = assign_new_domain_page(d, pinitrd_start + offset);
1749 if (p == NULL)
1750 panic("%s: can't allocate page for initrd image", __func__);
1751 if (initrd_len < offset + PAGE_SIZE)
1752 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1753 initrd_len - offset);
1754 else
1755 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1756 }
1757 }
1759 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1760 " Kernel image: %lx->%lx\n"
1761 " Entry address: %lx\n"
1762 " Init. ramdisk: %lx len %lx\n"
1763 " Start info.: %lx->%lx\n",
1764 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1765 pstart_info, pstart_info + PAGE_SIZE);
1767 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1769 printk("Initial guest OS requires too much space\n"
1770 "(%luMB is greater than %luMB limit)\n",
1771 (pkern_end-pkern_start)>>20,
1772 (max_pages <<PAGE_SHIFT)>>20);
1773 return -ENOMEM;
1774 }
1776 // if high 3 bits of pkern start are non-zero, error
1778 // if pkern end is after end of metaphysical memory, error
1779 // (we should be able to deal with this... later)
1781 /* Mask all upcalls... */
1782 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1783 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1785 if (dom0_max_vcpus == 0)
1786 dom0_max_vcpus = MAX_VIRT_CPUS;
1787 if (dom0_max_vcpus > num_online_cpus())
1788 dom0_max_vcpus = num_online_cpus();
1789 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1790 dom0_max_vcpus = MAX_VIRT_CPUS;
1792 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1793 for ( i = 1; i < dom0_max_vcpus; i++ )
1794 if (alloc_vcpu(d, i, i) == NULL)
1795 printf ("Cannot allocate dom0 vcpu %d\n", i);
1797 #if defined(VALIDATE_VT) && !defined(CONFIG_XEN_IA64_DOM0_VP)
1798 /* Construct a frame-allocation list for the initial domain, since these
1799 * pages are allocated by boot allocator and pfns are not set properly
1800 */
1801 for ( mfn = (alloc_start>>PAGE_SHIFT);
1802 mfn < (alloc_end>>PAGE_SHIFT);
1803 mfn++ )
1804 {
1805 page = mfn_to_page(mfn);
1806 page_set_owner(page, d);
1807 page->u.inuse.type_info = 0;
1808 page->count_info = PGC_allocated | 1;
1809 list_add_tail(&page->list, &d->page_list);
1811 /* Construct 1:1 mapping */
1812 set_gpfn_from_mfn(mfn, mfn);
1813 }
1814 #endif
1816 /* Copy the OS image. */
1817 loaddomainelfimage(d,image_start);
1819 /* Copy the initial ramdisk. */
1820 //if ( initrd_len != 0 )
1821 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1824 /* Set up start info area. */
1825 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1826 start_info_page = assign_new_domain_page(d, pstart_info);
1827 if (start_info_page == NULL)
1828 panic("can't allocate start info page");
1829 si = page_to_virt(start_info_page);
1830 memset(si, 0, PAGE_SIZE);
1831 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
1832 si->nr_pages = max_pages;
1834 /* Give up the VGA console if DOM0 is configured to grab it. */
1835 if (cmdline != NULL)
1836 console_endboot(strstr(cmdline, "tty0") != NULL);
1838 /* VMX specific construction for Dom0, if hardware supports VMX
1839 * and Dom0 is unmodified image
1840 */
1841 printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
1842 if (vmx_dom0)
1843 vmx_final_setup_guest(v);
1845 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1847 new_thread(v, pkern_entry, 0, 0);
1848 physdev_init_dom0(d);
1850 // dom0 doesn't need build_physmap_table()
1851 // see arch_set_info_guest()
1852 // instead we allocate pages manually.
1853 for (i = 0; i < max_pages; i++) {
1854 assign_new_domain0_page(d, i << PAGE_SHIFT);
1855 }
1856 d->arch.physmap_built = 1;
1858 // FIXME: Hack for keyboard input
1859 //serial_input_init();
1861 return 0;
1862 }
1864 void machine_restart(char * __unused)
1865 {
1866 if (platform_is_hp_ski()) dummy();
1867 printf("machine_restart called: spinning....\n");
1868 while(1);
1869 }
1871 void machine_halt(void)
1872 {
1873 if (platform_is_hp_ski()) dummy();
1874 printf("machine_halt called: spinning....\n");
1875 while(1);
1876 }
1878 void dummy_called(char *function)
1879 {
1880 if (platform_is_hp_ski()) asm("break 0;;");
1881 printf("dummy called in %s: spinning....\n", function);
1882 while(1);
1883 }
1885 void domain_pend_keyboard_interrupt(int irq)
1886 {
1887 vcpu_pend_interrupt(dom0->vcpu[0],irq);
1888 }
1890 void sync_vcpu_execstate(struct vcpu *v)
1891 {
1892 // __ia64_save_fpu(v->arch._thread.fph);
1893 // if (VMX_DOMAIN(v))
1894 // vmx_save_state(v);
1895 // FIXME SMP: Anything else needed here for SMP?
1896 }
1898 // FIXME: It would be nice to print out a nice error message for bad
1899 // values of these boot-time parameters, but it seems we are too early
1900 // in the boot and attempts to print freeze the system?
1901 #define abort(x...) do {} while(0)
1902 #define warn(x...) do {} while(0)
1904 static void parse_dom0_mem(char *s)
1905 {
1906 unsigned long bytes = parse_size_and_unit(s);
1908 if (dom0_size < 4 * 1024 * 1024) {
1909 abort("parse_dom0_mem: too small, boot aborted"
1910 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1912 if (dom0_size % dom0_align) {
1913 dom0_size = ((dom0_size / dom0_align) + 1) * dom0_align;
1914 warn("parse_dom0_mem: dom0_size rounded up from"
1915 " %lx to %lx bytes, due to dom0_align=%lx\n",
1916 bytes,dom0_size,dom0_align);
1917 }
1918 else dom0_size = bytes;
1919 }
1920 custom_param("dom0_mem", parse_dom0_mem);
1923 static void parse_dom0_align(char *s)
1924 {
1925 unsigned long bytes = parse_size_and_unit(s);
1927 if ((bytes - 1) ^ bytes) { /* not a power of two */
1928 abort("parse_dom0_align: dom0_align must be power of two, "
1929 "boot aborted"
1930 " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
1932 else if (bytes < PAGE_SIZE) {
1933 abort("parse_dom0_align: dom0_align must be >= %ld, "
1934 "boot aborted"
1935 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
1936 PAGE_SIZE);
1937 }
1938 else dom0_align = bytes;
1939 if (dom0_size % dom0_align) {
1940 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
1941 warn("parse_dom0_align: dom0_size rounded up from"
1942 " %ld to %ld bytes, due to dom0_align=%lx\n",
1943 bytes,dom0_size,dom0_align);
1944 }
1946 custom_param("dom0_align", parse_dom0_align);