ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 10017:2de2952715d9

[IA64] Fix xm pause/unpause bug

This small patch fixes a domain pause/unpause bug.
The current xm pause operation calls sync_vcpu_execstate to
synchronize vcpu state, but because the pause is issued from the
control panel it ends up saving dom0's fpu and other registers
into the paused VTi domain or domU. Since sync_vcpu_execstate is
called after vcpu_sleep, which has already saved all state when the
vcpu was scheduled out, and since there is currently no lazy state
to save on the IPF side, sync_vcpu_execstate need do nothing now.

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Zhang xiantao <xiantao.zhang@intel.com>
author awilliam@xenbuild.aw
date Wed May 17 15:52:55 2006 -0600 (2006-05-17)
parents 303406dd9e3b
children 41e7549d7df9
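
For context, a minimal sketch of the call ordering the description relies on
(editor's note: vcpu_pause()/vcpu_sleep_sync() live in the generic scheduler
code and are cited from memory, so their exact shape in this tree may differ):

    /* xm pause, roughly:
     *   vcpu_pause(v)
     *     atomic_inc(&v->pausecnt);
     *     vcpu_sleep_sync(v);      -- spins until v is scheduled out; the
     *                                 context switch saves all of v's state
     *     sync_vcpu_execstate(v);  -- arch hook, runs only after the above
     *
     * Since the context switch has already saved everything and ia64 keeps
     * no lazy state, the ia64 hook can simply be a no-op:
     */
    void sync_vcpu_execstate(struct vcpu *v)
    {
        /* nothing to do -- state was saved when v was scheduled out */
    }

This is what the (now effectively empty) sync_vcpu_execstate() near the end of
this file implements.
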
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <xen/mm.h>
22 #include <xen/iocap.h>
23 #include <asm/ptrace.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/hw_irq.h>
29 #include <asm/setup.h>
30 //#include <asm/mpspec.h>
31 #include <xen/irq.h>
32 #include <xen/event.h>
33 //#include <xen/shadow.h>
34 #include <xen/console.h>
35 #include <xen/compile.h>
37 #include <xen/elf.h>
38 //#include <asm/page.h>
39 #include <asm/pgalloc.h>
41 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
43 #include <asm/vcpu.h> /* for function declarations */
44 #include <public/arch-ia64.h>
45 #include <asm/vmx.h>
46 #include <asm/vmx_vcpu.h>
47 #include <asm/vmx_vpd.h>
48 #include <asm/vmx_phy_mode.h>
49 #include <asm/pal.h>
50 #include <asm/vhpt.h>
51 #include <public/hvm/ioreq.h>
52 #include <public/arch-ia64.h>
53 #include <asm/tlbflush.h>
54 #include <asm/regionreg.h>
55 #include <asm/dom_fw.h>
57 #ifndef CONFIG_XEN_IA64_DOM0_VP
58 #define CONFIG_DOMAIN0_CONTIGUOUS
59 #endif
60 unsigned long dom0_start = -1L;
61 unsigned long dom0_size = 512*1024*1024;
62 unsigned long dom0_align = 64*1024*1024;
64 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
65 static unsigned int dom0_max_vcpus = 1;
66 integer_param("dom0_max_vcpus", dom0_max_vcpus);
68 // initialized by arch/ia64/setup.c:find_initrd()
69 unsigned long initrd_start = 0, initrd_end = 0;
70 extern unsigned long running_on_sim;
72 #define IS_XEN_ADDRESS(d,a) ((a >= d->xen_vastart) && (a <= d->xen_vaend))
74 /* FIXME: where should these declarations live? */
75 extern long platform_is_hp_ski(void);
76 extern void serial_input_init(void);
77 static void init_switch_stack(struct vcpu *v);
78 void build_physmap_table(struct domain *d);
80 static void try_to_clear_PGC_allocate(struct domain* d,
81 struct page_info* page);
83 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
84 void arch_domain_destroy(struct domain *d)
85 {
86 BUG_ON(d->arch.mm->pgd != NULL);
87 if (d->arch.mm != NULL)
88 xfree(d->arch.mm);
89 if (d->shared_info != NULL)
90 free_xenheap_page(d->shared_info);
92 domain_flush_destroy (d);
94 deallocate_rid_range(d);
95 }
97 static void default_idle(void)
98 {
99 int cpu = smp_processor_id();
100 local_irq_disable();
101 if ( !softirq_pending(cpu))
102 safe_halt();
103 local_irq_enable();
104 }
106 static void continue_cpu_idle_loop(void)
107 {
108 int cpu = smp_processor_id();
109 for ( ; ; )
110 {
111 #ifdef IA64
112 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
113 #else
114 irq_stat[cpu].idle_timestamp = jiffies;
115 #endif
116 while ( !softirq_pending(cpu) )
117 default_idle();
118 add_preempt_count(SOFTIRQ_OFFSET);
119 raise_softirq(SCHEDULE_SOFTIRQ);
120 do_softirq();
121 sub_preempt_count(SOFTIRQ_OFFSET);
122 }
123 }
125 void startup_cpu_idle_loop(void)
126 {
127 /* Just some sanity to ensure that the scheduler is set up okay. */
128 ASSERT(current->domain == IDLE_DOMAIN_ID);
129 raise_softirq(SCHEDULE_SOFTIRQ);
131 continue_cpu_idle_loop();
132 }
134 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
135 {
136 struct vcpu *v;
137 struct thread_info *ti;
139 /* Keep idle vcpu0 statically allocated at compile time, because some
140 * code inherited from Linux still requires it in the early boot phase.
141 */
142 if (is_idle_domain(d) && !vcpu_id)
143 v = idle_vcpu[0];
144 else {
145 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
146 return NULL;
147 memset(v, 0, sizeof(*v));
149 ti = alloc_thread_info(v);
150 /* Clear thread_info to clear some important fields, like
151 * preempt_count
152 */
153 memset(ti, 0, sizeof(struct thread_info));
154 init_switch_stack(v);
155 }
157 if (!is_idle_domain(d)) {
158 v->arch.privregs =
159 alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
160 BUG_ON(v->arch.privregs == NULL);
161 memset(v->arch.privregs, 0, PAGE_SIZE);
163 if (!vcpu_id)
164 memset(&d->shared_info->evtchn_mask[0], 0xff,
165 sizeof(d->shared_info->evtchn_mask));
167 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
168 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
169 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
170 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
172 /* Is this correct?
173 It depends on the domain's rid usage.
175 A domain may share rids among its processors (e.g. when using a
176 global VHPT). In this case we should also share rids among
177 vcpus, and the rid ranges should be the same.
179 However, a domain may use per-cpu rid allocation. In this
180 case we don't want to share rids among vcpus, but we may
181 do it if two vcpus are on the same cpu... */
183 v->arch.starting_rid = d->arch.starting_rid;
184 v->arch.ending_rid = d->arch.ending_rid;
185 v->arch.breakimm = d->arch.breakimm;
186 }
188 return v;
189 }
191 void free_vcpu_struct(struct vcpu *v)
192 {
193 if (VMX_DOMAIN(v))
194 vmx_relinquish_vcpu_resources(v);
195 else {
196 if (v->arch.privregs != NULL)
197 free_xenheap_pages(v->arch.privregs, get_order(sizeof(mapped_regs_t)));
198 }
200 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
201 }
203 static void init_switch_stack(struct vcpu *v)
204 {
205 struct pt_regs *regs = vcpu_regs (v);
206 struct switch_stack *sw = (struct switch_stack *) regs - 1;
207 extern void ia64_ret_from_clone;
209 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
210 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
211 sw->b0 = (unsigned long) &ia64_ret_from_clone;
212 sw->ar_fpsr = FPSR_DEFAULT;
213 v->arch._thread.ksp = (unsigned long) sw - 16;
214 // stay on the kernel stack because we may get interrupts!
215 // ia64_ret_from_clone (which b0 gets in new_thread) switches
216 // to user stack
217 v->arch._thread.on_ustack = 0;
218 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
219 }
221 int arch_domain_create(struct domain *d)
222 {
223 // the following will eventually need to be negotiated dynamically
224 d->xen_vastart = XEN_START_ADDR;
225 d->xen_vaend = XEN_END_ADDR;
226 d->arch.shared_info_va = SHAREDINFO_ADDR;
227 d->arch.breakimm = 0x1000;
229 if (is_idle_domain(d))
230 return 0;
232 if ((d->shared_info = (void *)alloc_xenheap_page()) == NULL)
233 goto fail_nomem;
234 memset(d->shared_info, 0, PAGE_SIZE);
236 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
237 /* We may also need an emulation rid for region4, though it's unlikely
238 * that a guest issues uncacheable accesses in metaphysical mode.
239 * Keeping that information here may be saner anyway.
240 */
241 if (!allocate_rid_range(d,0))
242 goto fail_nomem;
243 d->arch.sys_pgnr = 0;
245 if ((d->arch.mm = xmalloc(struct mm_struct)) == NULL)
246 goto fail_nomem;
247 memset(d->arch.mm, 0, sizeof(*d->arch.mm));
249 d->arch.physmap_built = 0;
250 if ((d->arch.mm->pgd = pgd_alloc(d->arch.mm)) == NULL)
251 goto fail_nomem;
253 printf ("arch_domain_create: domain=%p\n", d);
254 return 0;
256 fail_nomem:
257 if (d->arch.mm->pgd != NULL)
258 pgd_free(d->arch.mm->pgd);
259 if (d->arch.mm != NULL)
260 xfree(d->arch.mm);
261 if (d->shared_info != NULL)
262 free_xenheap_page(d->shared_info);
263 return -ENOMEM;
264 }
266 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
267 {
268 c->regs = *vcpu_regs (v);
269 c->shared = v->domain->shared_info->arch;
270 }
272 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
273 {
274 struct pt_regs *regs = vcpu_regs (v);
275 struct domain *d = v->domain;
277 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
278 return 0;
279 if (c->flags & VGCF_VMX_GUEST) {
280 if (!vmx_enabled) {
281 printk("No VMX hardware feature for vmx domain.\n");
282 return -EINVAL;
283 }
285 if (v == d->vcpu[0])
286 vmx_setup_platform(d, c);
288 vmx_final_setup_guest(v);
289 } else if (!d->arch.physmap_built)
290 build_physmap_table(d);
292 *regs = c->regs;
293 if (v == d->vcpu[0]) {
294 /* Only for first vcpu. */
295 d->arch.sys_pgnr = c->sys_pgnr;
296 d->arch.initrd_start = c->initrd.start;
297 d->arch.initrd_len = c->initrd.size;
298 d->arch.cmdline = c->cmdline;
299 d->shared_info->arch = c->shared;
301 /* Cache synchronization seems to be done by the Linux kernel
302 during mmap/unmap operations. However, be conservative. */
303 domain_cache_flush (d, 1);
304 }
305 new_thread(v, regs->cr_iip, 0, 0);
307 if ( c->privregs && copy_from_user(v->arch.privregs,
308 c->privregs, sizeof(mapped_regs_t))) {
309 printk("Bad ctxt address in arch_set_info_guest: %p\n",
310 c->privregs);
311 return -EFAULT;
312 }
314 v->arch.domain_itm_last = -1L;
316 /* Don't redo final setup */
317 set_bit(_VCPUF_initialised, &v->vcpu_flags);
318 return 0;
319 }
321 static void relinquish_memory(struct domain *d, struct list_head *list)
322 {
323 struct list_head *ent;
324 struct page_info *page;
325 #ifndef __ia64__
326 unsigned long x, y;
327 #endif
329 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
330 spin_lock_recursive(&d->page_alloc_lock);
331 ent = list->next;
332 while ( ent != list )
333 {
334 page = list_entry(ent, struct page_info, list);
335 /* Grab a reference to the page so it won't disappear from under us. */
336 if ( unlikely(!get_page(page, d)) )
337 {
338 /* Couldn't get a reference -- someone is freeing this page. */
339 ent = ent->next;
340 continue;
341 }
343 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
344 put_page_and_type(page);
346 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
347 put_page(page);
349 #ifndef __ia64__
350 /*
351 * Forcibly invalidate base page tables at this point to break circular
352 * 'linear page table' references. This is okay because MMU structures
353 * are not shared across domains and this domain is now dead. Thus base
354 * tables are not in use so a non-zero count means circular reference.
355 */
356 y = page->u.inuse.type_info;
357 for ( ; ; )
358 {
359 x = y;
360 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
361 (PGT_base_page_table|PGT_validated)) )
362 break;
364 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
365 if ( likely(y == x) )
366 {
367 free_page_type(page, PGT_base_page_table);
368 break;
369 }
370 }
371 #endif
373 /* Follow the list chain and /then/ potentially free the page. */
374 ent = ent->next;
375 #ifdef CONFIG_XEN_IA64_DOM0_VP
376 #if 1
377 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
378 #else
379 //XXX this should be done when traversing the P2M table.
380 if (page_get_owner(page) == d)
381 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
382 #endif
383 #endif
384 put_page(page);
385 }
387 spin_unlock_recursive(&d->page_alloc_lock);
388 }
390 static void
391 relinquish_pte(struct domain* d, pte_t* pte)
392 {
393 unsigned long mfn = pte_pfn(*pte);
394 struct page_info* page;
396 // vmx domains use bits [58:56] to distinguish I/O regions from memory.
397 // see vmx_build_physmap_table() in vmx_init.c
398 if (((mfn << PAGE_SHIFT) & GPFN_IO_MASK) != GPFN_MEM)
399 return;
401 // domain might map IO space or acpi table pages. check it.
402 if (!mfn_valid(mfn))
403 return;
404 page = mfn_to_page(mfn);
405 // A struct page_info for this mfn may or may not exist, depending
406 // on CONFIG_VIRTUAL_FRAME_TABLE.
407 // This check is too simplistic; the right way is to check whether
408 // the page belongs to an I/O area or to the ACPI tables.
409 if (page_get_owner(page) == NULL) {
410 BUG_ON(page->count_info != 0);
411 return;
412 }
414 #ifdef CONFIG_XEN_IA64_DOM0_VP
415 if (page_get_owner(page) == d) {
416 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
417 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
418 }
419 #endif
420 try_to_clear_PGC_allocate(d, page);
421 put_page(page);
422 }
424 static void
425 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
426 {
427 unsigned long i;
428 pte_t* pte = pte_offset_map(pmd, offset);
430 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
431 if (!pte_present(*pte))
432 continue;
434 relinquish_pte(d, pte);
435 }
436 pte_free_kernel(pte_offset_map(pmd, offset));
437 }
439 static void
440 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
441 {
442 unsigned long i;
443 pmd_t *pmd = pmd_offset(pud, offset);
445 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
446 if (!pmd_present(*pmd))
447 continue;
449 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
450 }
451 pmd_free(pmd_offset(pud, offset));
452 }
454 static void
455 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
456 {
457 unsigned long i;
458 pud_t *pud = pud_offset(pgd, offset);
460 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
461 if (!pud_present(*pud))
462 continue;
464 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
465 }
466 pud_free(pud_offset(pgd, offset));
467 }
469 static void
470 relinquish_mm(struct domain* d)
471 {
472 struct mm_struct* mm = d->arch.mm;
473 unsigned long i;
474 pgd_t* pgd;
476 if (mm->pgd == NULL)
477 return;
479 pgd = pgd_offset(mm, 0);
480 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
481 if (!pgd_present(*pgd))
482 continue;
484 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
485 }
486 pgd_free(mm->pgd);
487 mm->pgd = NULL;
488 }
490 void domain_relinquish_resources(struct domain *d)
491 {
492 /* Relinquish every page of memory. */
494 // release pages by traversing d->arch.mm.
495 relinquish_mm(d);
497 relinquish_memory(d, &d->xenpage_list);
498 relinquish_memory(d, &d->page_list);
499 }
501 // heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread()
502 // and linux/arch/ia64/kernel/process.c:kernel_thread()
503 void new_thread(struct vcpu *v,
504 unsigned long start_pc,
505 unsigned long start_stack,
506 unsigned long start_info)
507 {
508 struct domain *d = v->domain;
509 struct pt_regs *regs;
510 extern char dom0_command_line[];
512 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
513 if (d == dom0 && v->vcpu_id == 0) start_pc += dom0_start;
514 #endif
516 regs = vcpu_regs (v);
517 if (VMX_DOMAIN(v)) {
518 /* dt/rt/it:1;i/ic:1, si:1, vm/bn:1, ac:1 */
519 regs->cr_ipsr = 0x501008826008; /* Need to be expanded as macro */
520 } else {
521 regs->cr_ipsr = ia64_getreg(_IA64_REG_PSR)
522 | IA64_PSR_BITS_TO_SET | IA64_PSR_BN;
523 regs->cr_ipsr &= ~(IA64_PSR_BITS_TO_CLEAR
524 | IA64_PSR_RI | IA64_PSR_IS);
525 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2
526 }
527 regs->cr_iip = start_pc;
528 regs->cr_ifs = 1UL << 63; /* or clear? */
529 regs->ar_fpsr = FPSR_DEFAULT;
531 if (VMX_DOMAIN(v)) {
532 vmx_init_all_rr(v);
533 if (d == dom0)
534 regs->r28 = dom_fw_setup(d,dom0_command_line,
535 COMMAND_LINE_SIZE);
536 /* Virtual processor context setup */
537 VCPU(v, vpsr) = IA64_PSR_BN;
538 VCPU(v, dcr) = 0;
539 } else {
540 init_all_rr(v);
541 if (v->vcpu_id == 0) {
542 /* Build the firmware. */
543 if (d == dom0)
544 regs->r28 = dom_fw_setup(d,dom0_command_line,
545 COMMAND_LINE_SIZE);
546 else {
547 const char *cmdline = d->arch.cmdline;
548 int len;
550 if (*cmdline == 0) {
551 #define DEFAULT_CMDLINE "nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1"
552 cmdline = DEFAULT_CMDLINE;
553 len = sizeof (DEFAULT_CMDLINE);
554 printf("domU command line defaulted to"
555 DEFAULT_CMDLINE "\n");
556 }
557 else
558 len = IA64_COMMAND_LINE_SIZE;
560 regs->r28 = dom_fw_setup (d, cmdline, len);
561 }
562 d->shared_info->arch.flags = (d == dom0) ?
563 (SIF_INITDOMAIN|SIF_PRIVILEGED) : 0;
564 }
565 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
566 VCPU(v, banknum) = 1;
567 VCPU(v, metaphysical_mode) = 1;
568 VCPU(v, interrupt_mask_addr) =
569 (uint64_t)SHAREDINFO_ADDR + INT_ENABLE_OFFSET(v);
570 VCPU(v, itv) = (1 << 16); /* timer vector masked */
571 }
572 }
574 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
575 void
576 share_xen_page_with_guest(struct page_info *page,
577 struct domain *d, int readonly)
578 {
579 if ( page_get_owner(page) == d )
580 return;
582 #if 1
583 if (readonly) {
584 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
585 }
586 #endif
588 // alloc_xenheap_pages() doesn't initialize page owner.
589 //BUG_ON(page_get_owner(page) != NULL);
590 #if 0
591 if (get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY) {
592 printk("%s:%d page 0x%p mfn 0x%lx gpfn 0x%lx\n", __func__, __LINE__,
593 page, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)));
594 }
595 #endif
596 // grant_table_destroy() releases these pages,
597 // but it doesn't clear the m2p entry, so a stale entry may remain.
598 // We clear any such stale entry here.
599 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
601 spin_lock(&d->page_alloc_lock);
603 #ifndef __ia64__
604 /* The incremented type count pins as writable or read-only. */
605 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
606 page->u.inuse.type_info |= PGT_validated | 1;
607 #endif
609 page_set_owner(page, d);
610 wmb(); /* install valid domain ptr before updating refcnt. */
611 ASSERT(page->count_info == 0);
612 page->count_info |= PGC_allocated | 1;
614 if ( unlikely(d->xenheap_pages++ == 0) )
615 get_knownalive_domain(d);
616 list_add_tail(&page->list, &d->xenpage_list);
618 spin_unlock(&d->page_alloc_lock);
619 }
621 //XXX !xxx_present() should be used instead of !xxx_none()?
622 static pte_t*
623 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
624 {
625 struct mm_struct *mm = d->arch.mm;
626 pgd_t *pgd;
627 pud_t *pud;
628 pmd_t *pmd;
630 BUG_ON(mm->pgd == NULL);
631 pgd = pgd_offset(mm, mpaddr);
632 if (pgd_none(*pgd)) {
633 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
634 }
636 pud = pud_offset(pgd, mpaddr);
637 if (pud_none(*pud)) {
638 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
639 }
641 pmd = pmd_offset(pud, mpaddr);
642 if (pmd_none(*pmd)) {
643 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
644 }
646 return pte_offset_map(pmd, mpaddr);
647 }
649 //XXX xxx_none() should be used instead of !xxx_present()?
650 static pte_t*
651 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
652 {
653 struct mm_struct *mm = d->arch.mm;
654 pgd_t *pgd;
655 pud_t *pud;
656 pmd_t *pmd;
658 BUG_ON(mm->pgd == NULL);
659 pgd = pgd_offset(mm, mpaddr);
660 if (!pgd_present(*pgd))
661 goto not_present;
663 pud = pud_offset(pgd, mpaddr);
664 if (!pud_present(*pud))
665 goto not_present;
667 pmd = pmd_offset(pud, mpaddr);
668 if (!pmd_present(*pmd))
669 goto not_present;
671 return pte_offset_map(pmd, mpaddr);
673 not_present:
674 return NULL;
675 }
677 #ifdef CONFIG_XEN_IA64_DOM0_VP
678 static pte_t*
679 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
680 {
681 struct mm_struct *mm = d->arch.mm;
682 pgd_t *pgd;
683 pud_t *pud;
684 pmd_t *pmd;
686 BUG_ON(mm->pgd == NULL);
687 pgd = pgd_offset(mm, mpaddr);
688 if (pgd_none(*pgd))
689 goto not_present;
691 pud = pud_offset(pgd, mpaddr);
692 if (pud_none(*pud))
693 goto not_present;
695 pmd = pmd_offset(pud, mpaddr);
696 if (pmd_none(*pmd))
697 goto not_present;
699 return pte_offset_map(pmd, mpaddr);
701 not_present:
702 return NULL;
703 }
704 #endif
706 /* Allocate a new page for domain and map it to the specified metaphysical
707 address. */
708 struct page_info *
709 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
710 {
711 struct page_info *p = NULL;
712 unsigned long maddr;
713 int ret;
715 BUG_ON(!pte_none(*pte));
717 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
718 if (d == dom0) {
719 #if 0
720 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
721 /* FIXME: is it true ?
722 dom0 memory is not contiguous! */
723 panic("assign_new_domain_page: bad domain0 "
724 "mpaddr=%lx, start=%lx, end=%lx!\n",
725 mpaddr, dom0_start, dom0_start+dom0_size);
726 }
727 #endif
728 p = mfn_to_page((mpaddr >> PAGE_SHIFT));
729 return p;
730 }
731 #endif
733 p = alloc_domheap_page(d);
734 if (unlikely(!p)) {
735 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
736 return(p);
737 }
739 // zero out pages for security reasons
740 clear_page(page_to_virt(p));
741 maddr = page_to_maddr (p);
742 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
743 && maddr < __get_cpu_var(vhpt_pend))) {
744 /* FIXME: how can this happen ?
745 vhpt is allocated by alloc_domheap_page. */
746 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
747 maddr);
748 }
750 ret = get_page(p, d);
751 BUG_ON(ret == 0);
752 set_pte(pte, pfn_pte(maddr >> PAGE_SHIFT,
753 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
755 //XXX CONFIG_XEN_IA64_DOM0_VP
756 // TODO racy
757 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
758 return p;
759 }
761 struct page_info *
762 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
763 {
764 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
765 pte_t dummy_pte = __pte(0);
766 return __assign_new_domain_page(d, mpaddr, &dummy_pte);
767 #else
768 struct page_info *p = NULL;
769 pte_t *pte;
771 pte = lookup_alloc_domain_pte(d, mpaddr);
772 if (pte_none(*pte)) {
773 p = __assign_new_domain_page(d, mpaddr, pte);
774 } else {
775 DPRINTK("%s: d 0x%p mpaddr %lx already mapped!\n",
776 __func__, d, mpaddr);
777 }
779 return p;
780 #endif
781 }
783 void
784 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
785 {
786 #ifndef CONFIG_DOMAIN0_CONTIGUOUS
787 pte_t *pte;
789 BUG_ON(d != dom0);
790 pte = lookup_alloc_domain_pte(d, mpaddr);
791 if (pte_none(*pte)) {
792 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
793 if (p == NULL) {
794 panic("%s: can't allocate page for dom0", __func__);
795 }
796 }
797 #endif
798 }
800 /* map a physical address to the specified metaphysical addr */
801 void
802 __assign_domain_page(struct domain *d,
803 unsigned long mpaddr, unsigned long physaddr)
804 {
805 pte_t *pte;
807 pte = lookup_alloc_domain_pte(d, mpaddr);
808 if (pte_none(*pte)) {
809 set_pte(pte,
810 pfn_pte(physaddr >> PAGE_SHIFT,
811 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
812 } else
813 printk("%s: mpaddr %lx already mapped!\n", __func__, mpaddr);
814 }
816 /* get_page() and map a physical address to the specified metaphysical addr */
817 void
818 assign_domain_page(struct domain *d,
819 unsigned long mpaddr, unsigned long physaddr)
820 {
821 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
822 int ret;
824 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
825 ret = get_page(page, d);
826 BUG_ON(ret == 0);
827 __assign_domain_page(d, mpaddr, physaddr);
829 //XXX CONFIG_XEN_IA64_DOM0_VP
830 // TODO racy
831 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
832 }
834 #ifdef CONFIG_XEN_IA64_DOM0_VP
835 static void
836 assign_domain_same_page(struct domain *d,
837 unsigned long mpaddr, unsigned long size)
838 {
839 //XXX optimization
840 unsigned long end = mpaddr + size;
841 for (; mpaddr < end; mpaddr += PAGE_SIZE) {
842 __assign_domain_page(d, mpaddr, mpaddr);
843 }
844 }
846 unsigned long
847 assign_domain_mmio_page(struct domain *d,
848 unsigned long mpaddr, unsigned long size)
849 {
850 if (size == 0) {
851 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
852 __func__, d, mpaddr, size);
853 }
854 assign_domain_same_page(d, mpaddr, size);
855 return mpaddr;
856 }
858 unsigned long
859 assign_domain_mach_page(struct domain *d,
860 unsigned long mpaddr, unsigned long size)
861 {
862 assign_domain_same_page(d, mpaddr, size);
863 return mpaddr;
864 }
866 //XXX sledge hammer.
867 // Should flush a finer range.
868 void
869 domain_page_flush(struct domain* d, unsigned long mpaddr,
870 unsigned long old_mfn, unsigned long new_mfn)
871 {
872 domain_flush_vtlb_all();
873 }
874 #endif
876 //XXX heavily depends on the struct page_info layout.
877 //
878 // if (page_get_owner(page) == d &&
879 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
880 // put_page(page);
881 // }
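// (Editor's note) The function below is a lock-free variant of the snippet
// above: it loads count_info (low 32 bits) and the pickled owner (high 32
// bits) as one 64-bit value, computes "count - 1 with PGC_allocated
// cleared" while keeping the owner bits, and retries with cmpxchg, so the
// ownership check and the reference drop happen atomically. That is why it
// depends on the exact struct page_info layout.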
882 static void
883 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
884 {
885 u32 _d, _nd;
886 u64 x, nx, y;
888 _d = pickle_domptr(d);
889 y = *((u64*)&page->count_info);
890 do {
891 x = y;
892 _nd = x >> 32;
893 nx = x - 1;
894 __clear_bit(_PGC_allocated, &nx);
896 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
897 struct domain* nd = unpickle_domptr(_nd);
898 if (nd == NULL) {
899 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
900 "sd=%p 0x%x,"
901 " caf=%016lx, taf=%" PRtype_info "\n",
902 (void *) page_to_mfn(page),
903 d, d->domain_id, _d,
904 nd, _nd,
905 x,
906 page->u.inuse.type_info);
907 }
908 break;
909 }
911 BUG_ON((nx & PGC_count_mask) < 1);
912 y = cmpxchg((u64*)&page->count_info, x, nx);
913 } while (unlikely(y != x));
914 }
916 #ifdef CONFIG_XEN_IA64_DOM0_VP
917 static void
918 zap_domain_page_one(struct domain *d, unsigned long mpaddr, int do_put_page)
919 {
920 struct mm_struct *mm = d->arch.mm;
921 pte_t *pte;
922 pte_t old_pte;
923 unsigned long mfn;
924 struct page_info *page;
926 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
927 if (pte == NULL)
928 return;
929 if (pte_none(*pte))
930 return;
932 // update pte
933 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
934 mfn = pte_pfn(old_pte);
935 page = mfn_to_page(mfn);
936 BUG_ON((page->count_info & PGC_count_mask) == 0);
938 if (page_get_owner(page) == d) {
939 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
940 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
941 }
943 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
945 if (do_put_page) {
946 try_to_clear_PGC_allocate(d, page);
947 put_page(page);
948 }
949 }
950 #endif
952 void build_physmap_table(struct domain *d)
953 {
954 struct list_head *list_ent = d->page_list.next;
955 unsigned long mfn, i = 0;
957 ASSERT(!d->arch.physmap_built);
958 while(list_ent != &d->page_list) {
959 mfn = page_to_mfn(list_entry(
960 list_ent, struct page_info, list));
961 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
963 i++;
964 list_ent = mfn_to_page(mfn)->list.next;
965 }
966 d->arch.physmap_built = 1;
967 }
969 void mpafoo(unsigned long mpaddr)
970 {
971 extern unsigned long privop_trace;
972 if (mpaddr == 0x3800)
973 privop_trace = 1;
974 }
976 #ifdef CONFIG_XEN_IA64_DOM0_VP
977 unsigned long
978 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
979 {
980 pte_t *pte;
982 pte = lookup_noalloc_domain_pte(d, mpaddr);
983 if (pte == NULL)
984 goto not_present;
986 if (pte_present(*pte))
987 return (pte->pte & _PFN_MASK);
988 else if (VMX_DOMAIN(d->vcpu[0]))
989 return GPFN_INV_MASK;
991 not_present:
992 return INVALID_MFN;
993 }
995 unsigned long
996 __lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
997 {
998 unsigned long machine = ____lookup_domain_mpa(d, mpaddr);
999 if (machine != INVALID_MFN)
1000 return machine;
1002 printk("%s: d 0x%p id %d current 0x%p id %d\n",
1003 __func__, d, d->domain_id, current, current->vcpu_id);
1004 printk("%s: bad mpa 0x%lx (max_pages 0x%lx)\n",
1005 __func__, mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
1006 return INVALID_MFN;
1007 }
1008 #endif
1010 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
1011 {
1012 pte_t *pte;
1014 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1015 if (d == dom0) {
1016 pte_t pteval;
1017 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
1018 //printk("lookup_domain_mpa: bad dom0 mpaddr 0x%lx!\n",mpaddr);
1019 //printk("lookup_domain_mpa: start=0x%lx,end=0x%lx!\n",dom0_start,dom0_start+dom0_size);
1020 mpafoo(mpaddr);
1021 }
1022 pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
1023 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
1024 pte = &pteval;
1025 return *(unsigned long *)pte;
1026 }
1027 #endif
1028 pte = lookup_noalloc_domain_pte(d, mpaddr);
1029 if (pte != NULL) {
1030 if (pte_present(*pte)) {
1031 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
1032 return *(unsigned long *)pte;
1033 } else if (VMX_DOMAIN(d->vcpu[0]))
1034 return GPFN_INV_MASK;
1035 }
1037 printk("%s: d 0x%p id %d current 0x%p id %d\n",
1038 __func__, d, d->domain_id, current, current->vcpu_id);
1039 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
1040 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
1041 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
1042 else
1043 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
1044 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
1045 mpafoo(mpaddr);
1046 return 0;
1047 }
1049 #ifdef CONFIG_XEN_IA64_DOM0_VP
1050 //XXX SMP
1051 unsigned long
1052 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1053 unsigned int extent_order)
1054 {
1055 unsigned long ret = 0;
1056 if (extent_order != 0) {
1057 //XXX
1058 ret = -ENOSYS;
1059 goto out;
1060 }
1062 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1);
1064 out:
1065 return ret;
1066 }
1068 // caller must get_page(mfn_to_page(mfn)) before
1069 // caller must call set_gpfn_from_mfn().
1070 static void
1071 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1072 unsigned long mfn, unsigned int flags)
1073 {
1074 struct mm_struct *mm = d->arch.mm;
1075 pte_t* pte;
1076 pte_t old_pte;
1078 pte = lookup_alloc_domain_pte(d, mpaddr);
1080 // update pte
1081 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1082 set_pte(pte, pfn_pte(mfn,
1083 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
1084 if (!pte_none(old_pte)) {
1085 unsigned long old_mfn;
1086 struct page_info* old_page;
1088 // XXX should the previous underlying page be removed,
1089 // or should an error be returned because this is the domain's fault?
1090 old_mfn = pte_pfn(old_pte);//XXX
1091 old_page = mfn_to_page(old_mfn);
1093 if (page_get_owner(old_page) == d) {
1094 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1095 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1096 }
1098 domain_page_flush(d, mpaddr, old_mfn, mfn);
1100 try_to_clear_PGC_allocate(d, old_page);
1101 put_page(old_page);
1102 } else {
1103 BUG_ON(!mfn_valid(mfn));
1104 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1105 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1106 }
1107 }
1109 unsigned long
1110 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1111 unsigned int flags, domid_t domid)
1112 {
1113 int error = 0;
1115 struct domain* rd;
1116 rd = find_domain_by_id(domid);
1117 if (unlikely(rd == NULL)) {
1118 error = -EINVAL;
1119 goto out0;
1120 }
1121 if (unlikely(rd == d)) {
1122 error = -EINVAL;
1123 goto out1;
1124 }
1125 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1126 error = -EINVAL;
1127 goto out1;
1128 }
1130 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, 0/* flags:XXX */);
1131 //don't update p2m table because this page belongs to rd, not d.
1132 out1:
1133 put_domain(rd);
1134 out0:
1135 return error;
1136 }
1138 // grant table host mapping
1139 // mpaddr: host_addr: pseudo physical address
1140 // mfn: frame: machine page frame
1141 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1142 int
1143 create_grant_host_mapping(unsigned long gpaddr,
1144 unsigned long mfn, unsigned int flags)
1145 {
1146 struct domain* d = current->domain;
1147 struct page_info* page;
1148 int ret;
1150 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1151 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1152 return GNTST_general_error;
1153 }
1154 if (flags & GNTMAP_readonly) {
1155 #if 0
1156 DPRINTK("%s: GNTMAP_readonly is not implemented yet. flags %x\n",
1157 __func__, flags);
1158 #endif
1159 flags &= ~GNTMAP_readonly;
1160 }
1162 page = mfn_to_page(mfn);
1163 ret = get_page(page, page_get_owner(page));
1164 BUG_ON(ret == 0);
1165 assign_domain_page_replace(d, gpaddr, mfn, flags);
1167 return GNTST_okay;
1168 }
1170 // grant table host unmapping
1171 int
1172 destroy_grant_host_mapping(unsigned long gpaddr,
1173 unsigned long mfn, unsigned int flags)
1174 {
1175 struct domain* d = current->domain;
1176 pte_t* pte;
1177 pte_t old_pte;
1178 unsigned long old_mfn = INVALID_MFN;
1179 struct page_info* old_page;
1181 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1182 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1183 return GNTST_general_error;
1184 }
1185 if (flags & GNTMAP_readonly) {
1186 #if 0
1187 DPRINTK("%s: GNTMAP_readonly is not implemented yet. flags %x\n",
1188 __func__, flags);
1189 #endif
1190 flags &= ~GNTMAP_readonly;
1191 }
1193 pte = lookup_noalloc_domain_pte(d, gpaddr);
1194 if (pte == NULL || !pte_present(*pte) || pte_pfn(*pte) != mfn)
1195 return GNTST_general_error;//XXX GNTST_bad_pseudo_phys_addr
1197 // update pte
1198 old_pte = ptep_get_and_clear(d->arch.mm, gpaddr, pte);
1199 if (pte_present(old_pte)) {
1200 old_mfn = pte_pfn(old_pte);//XXX
1202 domain_page_flush(d, gpaddr, old_mfn, INVALID_MFN);
1204 old_page = mfn_to_page(old_mfn);
1205 BUG_ON(page_get_owner(old_page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1206 put_page(old_page);
1207 }
1208 return GNTST_okay;
1209 }
1211 //XXX needs refcount patch
1212 //XXX heavily depends on the struct page layout.
1213 //XXX SMP
1214 int
1215 steal_page_for_grant_transfer(struct domain *d, struct page_info *page)
1216 {
1217 #if 0 /* if big endian */
1218 # error "implement big endian version of steal_page_for_grant_transfer()"
1219 #endif
1220 u32 _d, _nd;
1221 u64 x, nx, y;
1222 unsigned long mpaddr = get_gpfn_from_mfn(page_to_mfn(page)) << PAGE_SHIFT;
1223 struct page_info *new;
1225 zap_domain_page_one(d, mpaddr, 0);
1226 put_page(page);
1228 spin_lock(&d->page_alloc_lock);
1230 /*
1231 * The tricky bit: atomically release ownership while there is just one
1232 * benign reference to the page (PGC_allocated). If that reference
1233 * disappears then the deallocation routine will safely spin.
1234 */
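/* (Editor's note) Concretely: count_info and the pickled owner are read and
 * cmpxchg'd as a single 64-bit word (count in the low half, owner in the
 * high half). The new value keeps the count but zeroes the owner, and the
 * exchange only succeeds while the count is exactly 1 with PGC_allocated
 * set and the owner is still 'd'. */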
1235 _d = pickle_domptr(d);
1236 y = *((u64*)&page->count_info);
1237 do {
1238 x = y;
1239 nx = x & 0xffffffff;
1240 // page->count_info: untouched
1241 // page->u.inused._domain = 0;
1242 _nd = x >> 32;
1244 if (unlikely((x & (PGC_count_mask | PGC_allocated)) !=
1245 (1 | PGC_allocated)) ||
1246 unlikely(_nd != _d)) {
1247 struct domain* nd = unpickle_domptr(_nd);
1248 if (nd == NULL) {
1249 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1250 "sd=%p 0x%x,"
1251 " caf=%016lx, taf=%" PRtype_info "\n",
1252 (void *) page_to_mfn(page),
1253 d, d->domain_id, _d,
1254 nd, _nd,
1255 x,
1256 page->u.inuse.type_info);
1257 } else {
1258 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1259 "sd=%p(%u) 0x%x,"
1260 " caf=%016lx, taf=%" PRtype_info "\n",
1261 (void *) page_to_mfn(page),
1262 d, d->domain_id, _d,
1263 nd, nd->domain_id, _nd,
1264 x,
1265 page->u.inuse.type_info);
1266 }
1267 spin_unlock(&d->page_alloc_lock);
1268 return -1;
1269 }
1271 y = cmpxchg((u64*)&page->count_info, x, nx);
1272 } while (unlikely(y != x));
1274 /*
1275 * Unlink from 'd'. At least one reference remains (now anonymous), so
1276 * no one else is spinning to try to delete this page from 'd'.
1277 */
1278 d->tot_pages--;
1279 list_del(&page->list);
1281 spin_unlock(&d->page_alloc_lock);
1283 #if 1
1284 //XXX Until net_rx_action() fix
1285 // assign new page for this mpaddr
1286 new = assign_new_domain_page(d, mpaddr);
1287 BUG_ON(new == NULL);//XXX
1288 #endif
1290 return 0;
1291 }
1293 void
1294 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1295 unsigned long mfn)
1296 {
1297 int ret;
1299 ret = get_page(mfn_to_page(mfn), d);
1300 BUG_ON(ret == 0);
1301 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, 0/* XXX */);
1302 set_gpfn_from_mfn(mfn, gpfn);//XXX SMP
1304 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1305 }
1307 void
1308 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1309 unsigned long mfn)
1310 {
1311 BUG_ON(mfn == 0);//XXX
1312 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1);
1313 }
1314 #endif
1316 /* Flush cache of domain d. */
1317 void domain_cache_flush (struct domain *d, int sync_only)
1318 {
1319 struct mm_struct *mm = d->arch.mm;
1320 pgd_t *pgd = mm->pgd;
1321 unsigned long maddr;
1322 int i,j,k, l;
1323 int nbr_page = 0;
1324 void (*flush_func)(unsigned long start, unsigned long end);
1325 extern void flush_dcache_range (unsigned long, unsigned long);
1327 if (sync_only)
1328 flush_func = &flush_icache_range;
1329 else
1330 flush_func = &flush_dcache_range;
1332 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1333 if (d == dom0) {
1334 /* This is not fully correct (because of hole), but it should
1335 be enough for now. */
1336 (*flush_func)(__va_ul (dom0_start),
1337 __va_ul (dom0_start + dom0_size));
1338 return;
1339 }
1340 #endif
1341 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1342 pud_t *pud;
1343 if (!pgd_present(*pgd))
1344 continue;
1345 pud = pud_offset(pgd, 0);
1346 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1347 pmd_t *pmd;
1348 if (!pud_present(*pud))
1349 continue;
1350 pmd = pmd_offset(pud, 0);
1351 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1352 pte_t *pte;
1353 if (!pmd_present(*pmd))
1354 continue;
1355 pte = pte_offset_map(pmd, 0);
1356 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1357 if (!pte_present(*pte))
1358 continue;
1359 /* Convert PTE to maddr. */
1360 maddr = __va_ul (pte_val(*pte)
1361 & _PAGE_PPN_MASK);
1362 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1363 nbr_page++;
1364 }
1365 }
1366 }
1367 }
1368 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1369 }
1371 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
1372 #if 1
1373 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
1374 {
1375 unsigned long pte = lookup_domain_mpa(d,mpaddr);
1376 unsigned long imva;
1378 pte &= _PAGE_PPN_MASK;
1379 imva = (unsigned long) __va(pte);
1380 imva |= mpaddr & ~PAGE_MASK;
1381 return(imva);
1382 }
1383 #else
1384 unsigned long domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
1385 {
1386 unsigned long imva = __gpa_to_mpa(d, mpaddr);
1388 return __va(imva);
1389 }
1390 #endif
1392 // remove following line if not privifying in memory
1393 //#define HAVE_PRIVIFY_MEMORY
1394 #ifndef HAVE_PRIVIFY_MEMORY
1395 #define privify_memory(x,y) do {} while(0)
1396 #endif
1398 // see arch/x86/xxx/domain_build.c
1399 int elf_sanity_check(Elf_Ehdr *ehdr)
1400 {
1401 if (!(IS_ELF(*ehdr)))
1402 {
1403 printk("DOM0 image is not a Xen-compatible Elf image.\n");
1404 return 0;
1405 }
1406 return 1;
1407 }
1409 static void copy_memory(void *dst, void *src, int size)
1410 {
1411 int remain;
1413 if (IS_XEN_ADDRESS(dom0,(unsigned long) src)) {
1414 memcpy(dst,src,size);
1415 }
1416 else {
1417 printf("About to call __copy_from_user(%p,%p,%d)\n",
1418 dst,src,size);
1419 while ((remain = __copy_from_user(dst,src,size)) != 0) {
1420 printf("incomplete user copy, %d remain of %d\n",
1421 remain,size);
1422 dst += size - remain; src += size - remain;
1423 size -= remain;
1424 }
1425 }
1426 }
1428 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
1429 {
1430 char *elfbase = (char *) image_start;
1431 //Elf_Ehdr *ehdr = (Elf_Ehdr *)image_start;
1432 Elf_Ehdr ehdr;
1433 Elf_Phdr phdr;
1434 int h, filesz, memsz;
1435 unsigned long elfaddr, dom_mpaddr, dom_imva;
1436 struct page_info *p;
1438 copy_memory(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
1439 for ( h = 0; h < ehdr.e_phnum; h++ ) {
1440 copy_memory(&phdr,
1441 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
1442 sizeof(Elf_Phdr));
1443 if ((phdr.p_type != PT_LOAD))
1444 continue;
1446 filesz = phdr.p_filesz;
1447 memsz = phdr.p_memsz;
1448 elfaddr = (unsigned long) elfbase + phdr.p_offset;
1449 dom_mpaddr = phdr.p_paddr;
1451 //printf("p_offset: %x, size=%x\n",elfaddr,filesz);
1452 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1453 if (d == dom0) {
1454 if (dom_mpaddr+memsz>dom0_size)
1455 panic("Dom0 doesn't fit in memory space!\n");
1456 dom_imva = __va_ul(dom_mpaddr + dom0_start);
1457 copy_memory((void *)dom_imva, (void *)elfaddr, filesz);
1458 if (memsz > filesz)
1459 memset((void *)dom_imva+filesz, 0,
1460 memsz-filesz);
1461 //FIXME: This test for code seems to find a lot more than objdump -x does
1462 if (phdr.p_flags & PF_X) {
1463 privify_memory(dom_imva,filesz);
1464 flush_icache_range (dom_imva, dom_imva+filesz);
1465 }
1466 }
1467 else
1468 #endif
1469 while (memsz > 0) {
1470 p = assign_new_domain_page(d,dom_mpaddr);
1471 BUG_ON (unlikely(p == NULL));
1472 dom_imva = __va_ul(page_to_maddr(p));
1473 if (filesz > 0) {
1474 if (filesz >= PAGE_SIZE)
1475 copy_memory((void *) dom_imva,
1476 (void *) elfaddr,
1477 PAGE_SIZE);
1478 else {
1479 // copy partial page
1480 copy_memory((void *) dom_imva,
1481 (void *) elfaddr, filesz);
1482 // zero the rest of page
1483 memset((void *) dom_imva+filesz, 0,
1484 PAGE_SIZE-filesz);
1485 }
1486 //FIXME: This test for code seems to find a lot more than objdump -x does
1487 if (phdr.p_flags & PF_X) {
1488 privify_memory(dom_imva,PAGE_SIZE);
1489 flush_icache_range(dom_imva,
1490 dom_imva+PAGE_SIZE);
1491 }
1492 }
1493 else if (memsz > 0) {
1494 /* always zero out entire page */
1495 memset((void *) dom_imva, 0, PAGE_SIZE);
1496 }
1497 memsz -= PAGE_SIZE;
1498 filesz -= PAGE_SIZE;
1499 elfaddr += PAGE_SIZE;
1500 dom_mpaddr += PAGE_SIZE;
1501 }
1502 }
1505 void alloc_dom0(void)
1506 {
1507 if (platform_is_hp_ski()) {
1508 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1509 }
1510 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1511 printf("alloc_dom0: starting (initializing %lu MB...)\n",dom0_size/(1024*1024));
1513 /* FIXME: The first chunk (say 256M) should always be assigned to
1514 * Dom0, since Dom0's physical addresses == machine addresses for DMA
1515 * purposes. Some older Linux versions, like 2.4, assume physical
1516 * memory exists in the second 64M of the address space.
1517 */
1518 dom0_start = alloc_boot_pages(dom0_size >> PAGE_SHIFT, dom0_align >> PAGE_SHIFT);
1519 dom0_start <<= PAGE_SHIFT;
1520 if (!dom0_start) {
1521 panic("alloc_dom0: can't allocate contiguous memory size=%lu\n",
1522 dom0_size);
1523 }
1524 printf("alloc_dom0: dom0_start=0x%lx\n", dom0_start);
1525 #else
1526 // no need to allocate pages for now
1527 // pages are allocated by map_new_domain_page() via loaddomainelfimage()
1528 dom0_start = 0;
1529 #endif
1530 }
1534 /*
1535 * Domain 0 has direct access to all devices absolutely. However
1536 * the major point of this stub here, is to allow alloc_dom_mem
1537 * handled with order > 0 request. Dom0 requires that bit set to
1538 * allocate memory for other domains.
1539 */
1540 static void physdev_init_dom0(struct domain *d)
1541 {
1542 if (iomem_permit_access(d, 0UL, ~0UL))
1543 BUG();
1544 if (irqs_permit_access(d, 0, NR_IRQS-1))
1545 BUG();
1546 }
1548 static unsigned int vmx_dom0 = 0;
1549 int construct_dom0(struct domain *d,
1550 unsigned long image_start, unsigned long image_len,
1551 unsigned long initrd_start, unsigned long initrd_len,
1552 char *cmdline)
1553 {
1554 int i, rc;
1555 unsigned long alloc_start, alloc_end;
1556 start_info_t *si;
1557 struct vcpu *v = d->vcpu[0];
1558 unsigned long max_pages;
1560 struct domain_setup_info dsi;
1561 unsigned long p_start;
1562 unsigned long pkern_start;
1563 unsigned long pkern_entry;
1564 unsigned long pkern_end;
1565 unsigned long pinitrd_start = 0;
1566 unsigned long pstart_info;
1567 struct page_info *start_info_page;
1569 #ifdef VALIDATE_VT
1570 unsigned long mfn;
1571 struct page_info *page = NULL;
1572 #endif
1574 //printf("construct_dom0: starting\n");
1576 /* Sanity! */
1577 BUG_ON(d != dom0);
1578 BUG_ON(d->vcpu[0] == NULL);
1579 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
1581 memset(&dsi, 0, sizeof(struct domain_setup_info));
1583 printk("*** LOADING DOMAIN 0 ***\n");
1585 alloc_start = dom0_start;
1586 alloc_end = dom0_start + dom0_size;
1587 max_pages = dom0_size / PAGE_SIZE;
1588 d->max_pages = max_pages;
1589 #ifndef CONFIG_XEN_IA64_DOM0_VP
1590 d->tot_pages = d->max_pages;
1591 #else
1592 d->tot_pages = 0;
1593 #endif
1594 dsi.image_addr = (unsigned long)image_start;
1595 dsi.image_len = image_len;
1596 rc = parseelfimage(&dsi);
1597 if ( rc != 0 )
1598 return rc;
1600 #ifdef VALIDATE_VT
1601 /* Temp workaround */
1602 if (running_on_sim)
1603 dsi.xen_section_string = (char *)1;
1605 /* Check whether dom0 is vti domain */
1606 if ((!vmx_enabled) && !dsi.xen_section_string) {
1607 printk("Lack of hardware support for unmodified vmx dom0\n");
1608 panic("");
1609 }
1611 if (vmx_enabled && !dsi.xen_section_string) {
1612 printk("Dom0 is vmx domain!\n");
1613 vmx_dom0 = 1;
1614 }
1615 #endif
1617 p_start = dsi.v_start;
1618 pkern_start = dsi.v_kernstart;
1619 pkern_end = dsi.v_kernend;
1620 pkern_entry = dsi.v_kernentry;
1622 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1624 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1625 {
1626 printk("Initial guest OS must load to a page boundary.\n");
1627 return -EINVAL;
1628 }
1630 pstart_info = PAGE_ALIGN(pkern_end);
1631 if(initrd_start && initrd_len){
1632 unsigned long offset;
1634 pinitrd_start= (dom0_start + dom0_size) -
1635 (PAGE_ALIGN(initrd_len) + 4*1024*1024);
1636 if (pinitrd_start <= pstart_info)
1637 panic("%s:enough memory is not assigned to dom0", __func__);
1639 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1640 struct page_info *p;
1641 p = assign_new_domain_page(d, pinitrd_start + offset);
1642 if (p == NULL)
1643 panic("%s: can't allocate page for initrd image", __func__);
1644 if (initrd_len < offset + PAGE_SIZE)
1645 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1646 initrd_len - offset);
1647 else
1648 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1649 }
1650 }
1652 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1653 " Kernel image: %lx->%lx\n"
1654 " Entry address: %lx\n"
1655 " Init. ramdisk: %lx len %lx\n"
1656 " Start info.: %lx->%lx\n",
1657 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1658 pstart_info, pstart_info + PAGE_SIZE);
1660 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1661 {
1662 printk("Initial guest OS requires too much space\n"
1663 "(%luMB is greater than %luMB limit)\n",
1664 (pkern_end-pkern_start)>>20,
1665 (max_pages <<PAGE_SHIFT)>>20);
1666 return -ENOMEM;
1667 }
1669 // if high 3 bits of pkern start are non-zero, error
1671 // if pkern end is after end of metaphysical memory, error
1672 // (we should be able to deal with this... later)
1674 /* Mask all upcalls... */
1675 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1676 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1678 if (dom0_max_vcpus == 0)
1679 dom0_max_vcpus = MAX_VIRT_CPUS;
1680 if (dom0_max_vcpus > num_online_cpus())
1681 dom0_max_vcpus = num_online_cpus();
1682 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1683 dom0_max_vcpus = MAX_VIRT_CPUS;
1685 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1686 for ( i = 1; i < dom0_max_vcpus; i++ )
1687 if (alloc_vcpu(d, i, i) == NULL)
1688 printf ("Cannot allocate dom0 vcpu %d\n", i);
1690 #if defined(VALIDATE_VT) && !defined(CONFIG_XEN_IA64_DOM0_VP)
1691 /* Construct a frame-allocation list for the initial domain, since these
1692 * pages are allocated by boot allocator and pfns are not set properly
1693 */
1694 for ( mfn = (alloc_start>>PAGE_SHIFT);
1695 mfn < (alloc_end>>PAGE_SHIFT);
1696 mfn++ )
1697 {
1698 page = mfn_to_page(mfn);
1699 page_set_owner(page, d);
1700 page->u.inuse.type_info = 0;
1701 page->count_info = PGC_allocated | 1;
1702 list_add_tail(&page->list, &d->page_list);
1704 /* Construct 1:1 mapping */
1705 set_gpfn_from_mfn(mfn, mfn);
1706 }
1707 #endif
1709 /* Copy the OS image. */
1710 loaddomainelfimage(d,image_start);
1712 /* Copy the initial ramdisk. */
1713 //if ( initrd_len != 0 )
1714 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1717 /* Set up start info area. */
1718 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1719 start_info_page = assign_new_domain_page(d, pstart_info);
1720 if (start_info_page == NULL)
1721 panic("can't allocate start info page");
1722 si = page_to_virt(start_info_page);
1723 memset(si, 0, PAGE_SIZE);
1724 sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION);
1725 si->nr_pages = max_pages;
1727 /* Give up the VGA console if DOM0 is configured to grab it. */
1728 if (cmdline != NULL)
1729 console_endboot(strstr(cmdline, "tty0") != NULL);
1731 /* VMX specific construction for Dom0, if hardware supports VMX
1732 * and Dom0 is unmodified image
1733 */
1734 printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d);
1735 if (vmx_dom0)
1736 vmx_final_setup_guest(v);
1738 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1740 new_thread(v, pkern_entry, 0, 0);
1741 physdev_init_dom0(d);
1743 // dom0 doesn't need build_physmap_table()
1744 // see arch_set_info_guest()
1745 // instead we allocate pages manually.
1746 for (i = 0; i < max_pages; i++) {
1747 assign_new_domain0_page(d, i << PAGE_SHIFT);
1748 }
1749 d->arch.physmap_built = 1;
1751 // FIXME: Hack for keyboard input
1752 //serial_input_init();
1754 return 0;
1755 }
1757 void machine_restart(char * __unused)
1758 {
1759 if (platform_is_hp_ski()) dummy();
1760 printf("machine_restart called: spinning....\n");
1761 while(1);
1762 }
1764 void machine_halt(void)
1765 {
1766 if (platform_is_hp_ski()) dummy();
1767 printf("machine_halt called: spinning....\n");
1768 while(1);
1769 }
1771 void dummy_called(char *function)
1772 {
1773 if (platform_is_hp_ski()) asm("break 0;;");
1774 printf("dummy called in %s: spinning....\n", function);
1775 while(1);
1776 }
1778 void domain_pend_keyboard_interrupt(int irq)
1779 {
1780 vcpu_pend_interrupt(dom0->vcpu[0],irq);
1781 }
1783 void sync_vcpu_execstate(struct vcpu *v)
1784 {
1785 // __ia64_save_fpu(v->arch._thread.fph);
1786 // if (VMX_DOMAIN(v))
1787 // vmx_save_state(v);
1788 // FIXME SMP: Anything else needed here for SMP?
1789 }
1791 // FIXME: It would be nice to print out a nice error message for bad
1792 // values of these boot-time parameters, but it seems we are too early
1793 // in the boot and attempts to print freeze the system?
1794 #define abort(x...) do {} while(0)
1795 #define warn(x...) do {} while(0)
1797 static void parse_dom0_mem(char *s)
1798 {
1799 unsigned long bytes = parse_size_and_unit(s);
1801 if (dom0_size < 4 * 1024 * 1024) {
1802 abort("parse_dom0_mem: too small, boot aborted"
1803 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1804 }
1805 if (dom0_size % dom0_align) {
1806 dom0_size = ((dom0_size / dom0_align) + 1) * dom0_align;
1807 warn("parse_dom0_mem: dom0_size rounded up from"
1808 " %lx to %lx bytes, due to dom0_align=%lx\n",
1809 bytes,dom0_size,dom0_align);
1810 }
1811 else dom0_size = bytes;
1812 }
1813 custom_param("dom0_mem", parse_dom0_mem);
1816 static void parse_dom0_align(char *s)
1817 {
1818 unsigned long bytes = parse_size_and_unit(s);
1820 if ((bytes - 1) ^ bytes) { /* not a power of two */
1821 abort("parse_dom0_align: dom0_align must be power of two, "
1822 "boot aborted"
1823 " (try e.g. dom0_align=256M or dom0_align=65536K)\n");
1824 }
1825 else if (bytes < PAGE_SIZE) {
1826 abort("parse_dom0_align: dom0_align must be >= %ld, "
1827 "boot aborted"
1828 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
1829 PAGE_SIZE);
1830 }
1831 else dom0_align = bytes;
1832 if (dom0_size % dom0_align) {
1833 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
1834 warn("parse_dom0_align: dom0_size rounded up from"
1835 " %ld to %ld bytes, due to dom0_align=%lx\n",
1836 bytes,dom0_size,dom0_align);
1837 }
1838 }
1839 custom_param("dom0_align", parse_dom0_align);