direct-io.hg: view of xen/arch/ia64/xen/mm.c @ changeset 11456:3e4fa8b5b245

summary:   merge with xen-unstable.hg
author:    awilliam@xenbuild.aw
date:      Tue Sep 12 11:43:22 2006 -0600
parents:   af50fb41612c bfd00b317815
children:  dc9fa4dcd19c
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by multiple CPUs concurrently.
17 * Here is the list of the shared structures and the operations on them which
18 * read/write those structures.
19 *
20 * - struct page_info
21 * This is a Xen-global resource. This structure may be accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of vcpus other than the current one.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is also read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert a domain pseudo-physical address
54 * to a machine address. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * Resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: when creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table, i.e. for a machine frame m
107 * and a pseudo-physical frame p, P2M(M2P(m)) = m and M2P(P2M(p)) = p.
108 * The M2P table and the P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here the memory barrier can be achieved with release semantics (see the illustrative sketch after the #includes below).
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing a vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Inserting a tlb entry (vcpu_itc_no_srlz()) and a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't update
126 * vcpu->arch.{d, i}tlb, the VHPT and the mTLB atomically, so there is a race here.
127 *
128 * To handle it, check the vcpu->arch.{d, i}tlb.p bit:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the result is always as if the purge happened after the insert.
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and the
142 * tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value is different
148 * from the value it used, it retries.
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading from or writing to a domain page, the page's reference count
159 * must be incremented.
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() follow this rule.
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <linux/efi.h>
176 #include <xen/guest_access.h>
177 #include <asm/page.h>
178 #include <public/memory.h>
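/*
 * Illustrative sketch (not part of the build) of the P2M/M2P update ordering
 * described in the NOTES above, for the xchg case, and of the seqlock-like
 * read/retry scheme.  The helper names p2m_update_sketch() and
 * p2m_read_sketch() are hypothetical; the real updaters are
 * assign_domain_page_replace() and assign_domain_page_cmpxchg_rel() below,
 * and the real readers live in vcpu.c and the vmx code.  The sketch also
 * references helpers defined later in this file, so it could not compile at
 * this point even if enabled.
 */
#if 0
static void
p2m_update_sketch(struct domain* d, unsigned long mpaddr,
                  unsigned long new_mfn, unsigned long flags)
{
    volatile pte_t* pte = lookup_alloc_domain_pte(d, mpaddr);
    pte_t new_pte = pfn_pte(new_mfn, __pgprot(flags_to_prot(flags)));
    pte_t old_pte;

    /* 1. publish the new m2p entry first */
    set_gpfn_from_mfn(new_mfn, mpaddr >> PAGE_SHIFT);
    /* 2. memory barrier: the m2p update must be visible before the p2m xchg
     *    (xchg has acquire, not release, semantics) */
    smp_mb();
    /* 3. atomically install the new p2m entry, obtaining the old one */
    old_pte = ptep_xchg(&d->arch.mm, mpaddr, pte, new_pte);
    /* 4. invalidate the m2p entry of the old page, if any */
    if (pte_mem(old_pte))
        set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY);
}

/*
 * Reader side: p2m_entry_retry() is assumed to be the check used by the
 * readers to detect a concurrent p2m update (name assumed here).
 */
static unsigned long
p2m_read_sketch(struct domain* d, unsigned long mpaddr)
{
    struct p2m_entry entry;
    unsigned long pteval;

 again:
    pteval = lookup_domain_mpa(d, mpaddr, &entry);
    /* ... use pteval, e.g. insert a tlb entry derived from it ... */
    if (p2m_entry_retry(&entry))   /* did the p2m entry change under us? */
        goto again;
    return pteval;
}
#endif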
180 static void domain_page_flush(struct domain* d, unsigned long mpaddr,
181 unsigned long old_mfn, unsigned long new_mfn);
183 extern unsigned long ia64_iobase;
185 static struct domain *dom_xen, *dom_io;
187 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
188 void
189 alloc_dom_xen_and_dom_io(void)
190 {
191 /*
192 * Initialise our DOMID_XEN domain.
193 * Any Xen-heap pages that we will allow to be mapped will have
194 * their domain field set to dom_xen.
195 */
196 dom_xen = alloc_domain(DOMID_XEN);
197 BUG_ON(dom_xen == NULL);
199 /*
200 * Initialise our DOMID_IO domain.
201 * This domain owns I/O pages that are within the range of the page_info
202 * array. Mappings occur at the privilege level of the caller.
203 */
204 dom_io = alloc_domain(DOMID_IO);
205 BUG_ON(dom_io == NULL);
206 }
208 // heavily depends on the struct page_info layout.
209 // if (page_get_owner(page) == d &&
210 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
211 // put_page(page);
212 // }
213 static void
214 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
215 {
216 u32 _d, _nd;
217 u64 x, nx, y;
219 _d = pickle_domptr(d);
220 y = *((u64*)&page->count_info);
221 do {
222 x = y;
223 _nd = x >> 32;
224 nx = x - 1;
225 __clear_bit(_PGC_allocated, &nx);
227 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
228 struct domain* nd = unpickle_domptr(_nd);
229 if (nd == NULL) {
230 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
231 "sd=%p 0x%x,"
232 " caf=%016lx, taf=%" PRtype_info "\n",
233 (void *) page_to_mfn(page),
234 d, d->domain_id, _d,
235 nd, _nd,
236 x,
237 page->u.inuse.type_info);
238 }
239 break;
240 }
242 BUG_ON((nx & PGC_count_mask) < 1);
243 y = cmpxchg((u64*)&page->count_info, x, nx);
244 } while (unlikely(y != x));
245 }
247 static void
248 relinquish_pte(struct domain* d, pte_t* pte)
249 {
250 unsigned long mfn = pte_pfn(*pte);
251 struct page_info* page;
253 // vmx domains use bits [58:56] to distinguish an io region from memory.
254 // see vmx_build_physmap_table() in vmx_init.c
255 if (!pte_mem(*pte))
256 return;
258 // domain might map IO space or acpi table pages. check it.
259 if (!mfn_valid(mfn))
260 return;
261 page = mfn_to_page(mfn);
262 // The struct page_info corresponding to mfn may or may not exist depending
263 // on CONFIG_VIRTUAL_FRAME_TABLE.
264 // This check is too simplistic.
265 // The right way is to check whether this page belongs to an io area or to acpi pages.
266 if (page_get_owner(page) == NULL) {
267 BUG_ON(page->count_info != 0);
268 return;
269 }
271 if (page_get_owner(page) == d) {
272 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
273 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
274 }
276 try_to_clear_PGC_allocate(d, page);
277 put_page(page);
278 }
280 static void
281 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
282 {
283 unsigned long i;
284 pte_t* pte = pte_offset_map(pmd, offset);
286 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
287 if (!pte_present(*pte))
288 continue;
290 relinquish_pte(d, pte);
291 }
292 pte_free_kernel(pte_offset_map(pmd, offset));
293 }
295 static void
296 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
297 {
298 unsigned long i;
299 pmd_t *pmd = pmd_offset(pud, offset);
301 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
302 if (!pmd_present(*pmd))
303 continue;
305 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
306 }
307 pmd_free(pmd_offset(pud, offset));
308 }
310 static void
311 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
312 {
313 unsigned long i;
314 pud_t *pud = pud_offset(pgd, offset);
316 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
317 if (!pud_present(*pud))
318 continue;
320 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
321 }
322 pud_free(pud_offset(pgd, offset));
323 }
325 void
326 relinquish_mm(struct domain* d)
327 {
328 struct mm_struct* mm = &d->arch.mm;
329 unsigned long i;
330 pgd_t* pgd;
332 if (mm->pgd == NULL)
333 return;
335 pgd = pgd_offset(mm, 0);
336 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
337 if (!pgd_present(*pgd))
338 continue;
340 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
341 }
342 pgd_free(mm->pgd);
343 mm->pgd = NULL;
344 }
346 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
347 void
348 share_xen_page_with_guest(struct page_info *page,
349 struct domain *d, int readonly)
350 {
351 if ( page_get_owner(page) == d )
352 return;
354 #if 1
355 if (readonly) {
356 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
357 }
358 #endif
360 // alloc_xenheap_pages() doesn't initialize page owner.
361 //BUG_ON(page_get_owner(page) != NULL);
363 spin_lock(&d->page_alloc_lock);
365 #ifndef __ia64__
366 /* The incremented type count pins as writable or read-only. */
367 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
368 page->u.inuse.type_info |= PGT_validated | 1;
369 #endif
371 page_set_owner(page, d);
372 wmb(); /* install valid domain ptr before updating refcnt. */
373 ASSERT(page->count_info == 0);
374 page->count_info |= PGC_allocated | 1;
376 if ( unlikely(d->xenheap_pages++ == 0) )
377 get_knownalive_domain(d);
378 list_add_tail(&page->list, &d->xenpage_list);
380 // grant_table_destroy() releases these pages,
381 // but it doesn't clear their m2p entries, so stale
382 // entries might remain. Such a stale entry is cleared here.
383 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
385 spin_unlock(&d->page_alloc_lock);
386 }
388 void
389 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
390 {
391 share_xen_page_with_guest(page, dom_xen, readonly);
392 }
394 unsigned long
395 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
396 {
397 unsigned long pte;
399 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
400 if (!pte) {
401 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
402 }
403 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
404 }
406 // given a domain virtual address, pte and pagesize, extract the metaphysical
407 // address, convert the pte for a physical address for (possibly different)
408 // Xen PAGE_SIZE and return modified pte. (NOTE: TLB insert should use
409 // PAGE_SIZE!)
410 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
411 struct p2m_entry* entry)
412 {
413 struct domain *d = current->domain;
414 ia64_itir_t itir = {.itir = itir__};
415 u64 mask, mpaddr, pteval2;
416 u64 arflags;
417 u64 arflags2;
418 u64 maflags2;
420 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
422 // FIXME address had better be pre-validated on insert
423 mask = ~itir_mask(itir.itir);
424 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
426 if (itir.ps > PAGE_SHIFT)
427 itir.ps = PAGE_SHIFT;
429 *logps = itir.ps;
431 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
433 /* Check access rights. */
434 arflags = pteval & _PAGE_AR_MASK;
435 arflags2 = pteval2 & _PAGE_AR_MASK;
436 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
437 #if 0
438 DPRINTK("%s:%d "
439 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
440 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
441 __func__, __LINE__,
442 pteval, arflags, address, itir__,
443 pteval2, arflags2, mpaddr);
444 #endif
445 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
446 }
448 /* Check memory attribute. The switch is on the *requested* memory
449 attribute. */
450 maflags2 = pteval2 & _PAGE_MA_MASK;
451 switch (pteval & _PAGE_MA_MASK) {
452 case _PAGE_MA_NAT:
453 /* NaT pages are always accepted! */
454 break;
455 case _PAGE_MA_UC:
456 case _PAGE_MA_UCE:
457 case _PAGE_MA_WC:
458 if (maflags2 == _PAGE_MA_WB) {
459 /* Don't let domains WB-map uncached addresses.
460 This can happen when domU tries to touch i/o
461 port space. Also prevents possible address
462 aliasing issues. */
463 printf("Warning: UC to WB for mpaddr=%lx\n", mpaddr);
464 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
465 }
466 break;
467 case _PAGE_MA_WB:
468 if (maflags2 != _PAGE_MA_WB) {
469 /* Forbid non-coherent access to coherent memory. */
470 panic_domain(NULL, "try to use WB mem attr on "
471 "UC page, mpaddr=%lx\n", mpaddr);
472 }
473 break;
474 default:
475 panic_domain(NULL, "try to use unknown mem attribute\n");
476 }
478 /* If shadow mode is enabled, virtualize dirty bit. */
479 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
480 u64 mp_page = mpaddr >> PAGE_SHIFT;
481 pteval |= _PAGE_VIRT_D;
483 /* If the page is not already dirty, don't set the dirty bit! */
484 if (mp_page < d->arch.shadow_bitmap_size * 8
485 && !test_bit(mp_page, d->arch.shadow_bitmap))
486 pteval &= ~_PAGE_D;
487 }
489 /* Ignore non-addr bits of pteval2 and force PL0->2
490 (PL3 is unaffected) */
491 return (pteval & ~_PAGE_PPN_MASK) |
492 (pteval2 & _PAGE_PPN_MASK) | _PAGE_PL_2;
493 }
495 // given a current domain metaphysical address, return the physical address
496 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
497 struct p2m_entry* entry)
498 {
499 unsigned long pteval;
501 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
502 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
503 }
505 //XXX !xxx_present() should be used instead of !xxx_none()?
506 // __assign_new_domain_page(), assign_new_domain_page() and
507 // assign_new_domain0_page() are used only during domain creation.
508 // Their accesses aren't racy, so the returned pte_t doesn't need the
509 // volatile qualifier.
510 static pte_t*
511 __lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
512 {
513 struct mm_struct *mm = &d->arch.mm;
514 pgd_t *pgd;
515 pud_t *pud;
516 pmd_t *pmd;
518 BUG_ON(mm->pgd == NULL);
519 pgd = pgd_offset(mm, mpaddr);
520 if (pgd_none(*pgd)) {
521 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
522 }
524 pud = pud_offset(pgd, mpaddr);
525 if (pud_none(*pud)) {
526 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
527 }
529 pmd = pmd_offset(pud, mpaddr);
530 if (pmd_none(*pmd)) {
531 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
532 }
534 return pte_offset_map(pmd, mpaddr);
535 }
537 //XXX !xxx_present() should be used instead of !xxx_none()?
538 // pud, pmd and pte pages are zero-cleared when they are allocated.
539 // Their contents must be visible before they are linked in, so the
540 // cmpxchg must have release semantics.
541 static volatile pte_t*
542 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
543 {
544 struct mm_struct *mm = &d->arch.mm;
545 pgd_t *pgd;
546 pud_t *pud;
547 pmd_t *pmd;
549 BUG_ON(mm->pgd == NULL);
551 pgd = pgd_offset(mm, mpaddr);
552 again_pgd:
553 if (unlikely(pgd_none(*pgd))) {
554 pud_t *old_pud = NULL;
555 pud = pud_alloc_one(mm, mpaddr);
556 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
557 pud_free(pud);
558 goto again_pgd;
559 }
560 }
562 pud = pud_offset(pgd, mpaddr);
563 again_pud:
564 if (unlikely(pud_none(*pud))) {
565 pmd_t* old_pmd = NULL;
566 pmd = pmd_alloc_one(mm, mpaddr);
567 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
568 pmd_free(pmd);
569 goto again_pud;
570 }
571 }
573 pmd = pmd_offset(pud, mpaddr);
574 again_pmd:
575 if (unlikely(pmd_none(*pmd))) {
576 pte_t* old_pte = NULL;
577 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
578 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
579 pte_free_kernel(pte);
580 goto again_pmd;
581 }
582 }
584 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
585 }
587 //XXX xxx_none() should be used instead of !xxx_present()?
588 volatile pte_t*
589 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
590 {
591 struct mm_struct *mm = &d->arch.mm;
592 pgd_t *pgd;
593 pud_t *pud;
594 pmd_t *pmd;
596 BUG_ON(mm->pgd == NULL);
597 pgd = pgd_offset(mm, mpaddr);
598 if (unlikely(!pgd_present(*pgd)))
599 return NULL;
601 pud = pud_offset(pgd, mpaddr);
602 if (unlikely(!pud_present(*pud)))
603 return NULL;
605 pmd = pmd_offset(pud, mpaddr);
606 if (unlikely(!pmd_present(*pmd)))
607 return NULL;
609 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
610 }
612 static volatile pte_t*
613 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
614 {
615 struct mm_struct *mm = &d->arch.mm;
616 pgd_t *pgd;
617 pud_t *pud;
618 pmd_t *pmd;
620 BUG_ON(mm->pgd == NULL);
621 pgd = pgd_offset(mm, mpaddr);
622 if (unlikely(pgd_none(*pgd)))
623 return NULL;
625 pud = pud_offset(pgd, mpaddr);
626 if (unlikely(pud_none(*pud)))
627 return NULL;
629 pmd = pmd_offset(pud, mpaddr);
630 if (unlikely(pmd_none(*pmd)))
631 return NULL;
633 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
634 }
636 unsigned long
637 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
638 {
639 volatile pte_t *pte;
641 pte = lookup_noalloc_domain_pte(d, mpaddr);
642 if (pte == NULL)
643 return INVALID_MFN;
645 if (pte_present(*pte))
646 return (pte->pte & _PFN_MASK);
647 else if (VMX_DOMAIN(d->vcpu[0]))
648 return GPFN_INV_MASK;
649 return INVALID_MFN;
650 }
652 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
653 struct p2m_entry* entry)
654 {
655 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
657 if (pte != NULL) {
658 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
659 if (pte_present(tmp_pte)) {
660 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
661 if (entry != NULL)
662 p2m_entry_set(entry, pte, tmp_pte);
663 return pte_val(tmp_pte);
664 } else if (VMX_DOMAIN(d->vcpu[0]))
665 return GPFN_INV_MASK;
666 }
668 printk("%s: d 0x%p id %d current 0x%p id %d\n",
669 __func__, d, d->domain_id, current, current->vcpu_id);
670 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
671 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
672 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
673 else
674 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
675 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
677 if (entry != NULL)
678 p2m_entry_set(entry, NULL, __pte(0));
679 //XXX This is a workaround until the emulation of memory accesses to a region
680 // where memory or a device is attached is implemented.
681 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
682 }
684 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
685 #if 1
686 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
687 {
688 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
689 unsigned long imva;
691 pte &= _PAGE_PPN_MASK;
692 imva = (unsigned long) __va(pte);
693 imva |= mpaddr & ~PAGE_MASK;
694 return (void*)imva;
695 }
696 #else
697 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
698 {
699 unsigned long imva = __gpa_to_mpa(d, mpaddr);
701 return (void *)__va(imva);
702 }
703 #endif
705 /* Allocate a new page for domain and map it to the specified metaphysical
706 address. */
707 static struct page_info *
708 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
709 {
710 struct page_info *p;
711 unsigned long maddr;
712 int ret;
714 BUG_ON(!pte_none(*pte));
716 p = alloc_domheap_page(d);
717 if (unlikely(!p)) {
718 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
719 return(p);
720 }
722 // zero out pages for security reasons
723 clear_page(page_to_virt(p));
724 maddr = page_to_maddr (p);
725 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
726 && maddr < __get_cpu_var(vhpt_pend))) {
727 /* FIXME: how can this happen ?
728 vhpt is allocated by alloc_domheap_page. */
729 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
730 maddr);
731 }
733 ret = get_page(p, d);
734 BUG_ON(ret == 0);
735 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
736 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
737 // because set_pte_rel() has release semantics
738 set_pte_rel(pte,
739 pfn_pte(maddr >> PAGE_SHIFT,
740 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
742 smp_mb();
743 return p;
744 }
746 struct page_info *
747 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
748 {
749 pte_t *pte = __lookup_alloc_domain_pte(d, mpaddr);
751 if (!pte_none(*pte))
752 return NULL;
754 return __assign_new_domain_page(d, mpaddr, pte);
755 }
757 void
758 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
759 {
760 pte_t *pte;
762 BUG_ON(d != dom0);
763 pte = __lookup_alloc_domain_pte(d, mpaddr);
764 if (pte_none(*pte)) {
765 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
766 if (p == NULL) {
767 panic("%s: can't allocate page for dom0", __func__);
768 }
769 }
770 }
772 static unsigned long
773 flags_to_prot (unsigned long flags)
774 {
775 unsigned long res = _PAGE_PL_2 | __DIRTY_BITS;
777 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
778 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
780 return res;
781 }
783 /* map a physical address to the specified metaphysical addr */
784 // flags: currently only ASSIGN_readonly, ASSIGN_nocache
785 // This is called by assign_domain_mmio_page().
786 // So accesses to the pte are racy.
787 void
788 __assign_domain_page(struct domain *d,
789 unsigned long mpaddr, unsigned long physaddr,
790 unsigned long flags)
791 {
792 volatile pte_t *pte;
793 pte_t old_pte;
794 pte_t new_pte;
795 pte_t ret_pte;
796 unsigned long prot = flags_to_prot(flags);
798 pte = lookup_alloc_domain_pte(d, mpaddr);
800 old_pte = __pte(0);
801 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
802 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
803 if (pte_val(ret_pte) == pte_val(old_pte))
804 smp_mb();
805 }
807 /* get_page() and map a physical address to the specified metaphysical addr */
808 void
809 assign_domain_page(struct domain *d,
810 unsigned long mpaddr, unsigned long physaddr)
811 {
812 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
813 int ret;
815 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
816 ret = get_page(page, d);
817 BUG_ON(ret == 0);
818 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
819 // because __assign_domain_page() uses set_pte_rel() which has
820 // release semantics, smp_mb() isn't needed.
821 __assign_domain_page(d, mpaddr, physaddr, ASSIGN_writable);
822 }
824 int
825 ioports_permit_access(struct domain *d, unsigned long fp, unsigned long lp)
826 {
827 int ret;
828 unsigned long off;
829 unsigned long fp_offset;
830 unsigned long lp_offset;
832 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
833 if (ret != 0)
834 return ret;
836 /* Domain 0 doesn't virtualize the IO port space. */
837 if (d == dom0)
838 return 0;
840 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
841 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
843 for (off = fp_offset; off <= lp_offset; off += PAGE_SIZE)
844 __assign_domain_page(d, IO_PORTS_PADDR + off,
845 __pa(ia64_iobase) + off, ASSIGN_nocache);
847 return 0;
848 }
850 static int
851 ioports_has_allowed(struct domain *d, unsigned long fp, unsigned long lp)
852 {
853 unsigned long i;
854 for (i = fp; i < lp; i++)
855 if (rangeset_contains_singleton(d->arch.ioport_caps, i))
856 return 1;
857 return 0;
858 }
860 int
861 ioports_deny_access(struct domain *d, unsigned long fp, unsigned long lp)
862 {
863 int ret;
864 struct mm_struct *mm = &d->arch.mm;
865 unsigned long off;
866 unsigned long io_ports_base;
867 unsigned long fp_offset;
868 unsigned long lp_offset;
870 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
871 if (ret != 0)
872 return ret;
873 if (d == dom0)
874 io_ports_base = __pa(ia64_iobase);
875 else
876 io_ports_base = IO_PORTS_PADDR;
878 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
879 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
881 for (off = fp_offset; off < lp_offset; off += PAGE_SIZE) {
882 unsigned long mpaddr = io_ports_base + off;
883 unsigned long port;
884 volatile pte_t *pte;
885 pte_t old_pte;
887 port = IO_SPACE_SPARSE_DECODING (off);
888 if (port < fp || port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1 > lp) {
889 /* Maybe this covers an allowed port. */
890 if (ioports_has_allowed(d, port,
891 port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1))
892 continue;
893 }
895 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
896 BUG_ON(pte == NULL);
897 BUG_ON(pte_none(*pte));
899 // clear pte
900 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
901 }
902 domain_flush_vtlb_all();
903 return 0;
904 }
906 static void
907 assign_domain_same_page(struct domain *d,
908 unsigned long mpaddr, unsigned long size,
909 unsigned long flags)
910 {
911 //XXX optimization
912 unsigned long end = PAGE_ALIGN(mpaddr + size);
913 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
914 __assign_domain_page(d, mpaddr, mpaddr, flags);
915 }
916 }
918 int
919 efi_mmio(unsigned long physaddr, unsigned long size)
920 {
921 void *efi_map_start, *efi_map_end;
922 u64 efi_desc_size;
923 void* p;
925 efi_map_start = __va(ia64_boot_param->efi_memmap);
926 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
927 efi_desc_size = ia64_boot_param->efi_memdesc_size;
929 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
930 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
931 unsigned long start = md->phys_addr;
932 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
934 if (start <= physaddr && physaddr < end) {
935 if ((physaddr + size) > end) {
936 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
937 __func__, __LINE__, physaddr, size);
938 return 0;
939 }
941 // for io space
942 if (md->type == EFI_MEMORY_MAPPED_IO ||
943 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
944 return 1;
945 }
947 // for runtime
948 // see efi_enter_virtual_mode(void)
949 // in linux/arch/ia64/kernel/efi.c
950 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
951 !(md->attribute & EFI_MEMORY_WB)) {
952 return 1;
953 }
955 return 0;
956 }
958 if (physaddr < start) {
959 break;
960 }
961 }
963 return 1;
964 }
966 unsigned long
967 assign_domain_mmio_page(struct domain *d,
968 unsigned long mpaddr, unsigned long size)
969 {
970 if (size == 0) {
971 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
972 __func__, d, mpaddr, size);
973 }
974 if (!efi_mmio(mpaddr, size)) {
975 #ifndef NDEBUG
976 DPRINTK("%s:%d domain %p mpaddr 0x%lx size = 0x%lx\n",
977 __func__, __LINE__, d, mpaddr, size);
978 #endif
979 return -EINVAL;
980 }
981 assign_domain_same_page(d, mpaddr, size, ASSIGN_writable | ASSIGN_nocache);
982 return mpaddr;
983 }
985 unsigned long
986 assign_domain_mach_page(struct domain *d,
987 unsigned long mpaddr, unsigned long size,
988 unsigned long flags)
989 {
990 assign_domain_same_page(d, mpaddr, size, flags);
991 return mpaddr;
992 }
994 // The caller must get_page(mfn_to_page(mfn)) before calling.
995 // The caller must call set_gpfn_from_mfn() beforehand if necessary;
996 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
997 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
998 // flags: currently only ASSIGN_readonly
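/*
 * Illustrative caller sequence (a sketch only; cf. guest_physmap_add_page()
 * below, which follows exactly this pattern):
 *
 *     ret = get_page(mfn_to_page(mfn), d);
 *     BUG_ON(ret == 0);
 *     set_gpfn_from_mfn(mfn, gpfn);
 *     smp_mb();    // the m2p update must be visible before the pte xchg
 *     assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
 */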
999 static void
1000 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1001 unsigned long mfn, unsigned long flags)
1003 struct mm_struct *mm = &d->arch.mm;
1004 volatile pte_t* pte;
1005 pte_t old_pte;
1006 pte_t npte;
1007 unsigned long prot = flags_to_prot(flags);
1009 pte = lookup_alloc_domain_pte(d, mpaddr);
1011 // update pte
1012 npte = pfn_pte(mfn, __pgprot(prot));
1013 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1014 if (pte_mem(old_pte)) {
1015 unsigned long old_mfn = pte_pfn(old_pte);
1017 // The mfn == old_mfn case can happen when a domain maps a granted page
1018 // twice with the same pseudo-physical address.
1019 // It makes no sense, but it is allowed.
1020 // __gnttab_map_grant_ref()
1021 // => create_host_mapping()
1022 // => assign_domain_page_replace()
1023 if (mfn != old_mfn) {
1024 struct page_info* old_page = mfn_to_page(old_mfn);
1026 if (page_get_owner(old_page) == d ||
1027 page_get_owner(old_page) == NULL) {
1028 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1029 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1032 domain_page_flush(d, mpaddr, old_mfn, mfn);
1034 try_to_clear_PGC_allocate(d, old_page);
1035 put_page(old_page);
1040 // caller must get_page(new_page) before
1041 // Only steal_page() calls this function.
1042 static int
1043 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1044 struct page_info* old_page,
1045 struct page_info* new_page,
1046 unsigned long flags)
1048 struct mm_struct *mm = &d->arch.mm;
1049 volatile pte_t* pte;
1050 unsigned long old_mfn;
1051 unsigned long old_arflags;
1052 pte_t old_pte;
1053 unsigned long new_mfn;
1054 unsigned long new_prot;
1055 pte_t new_pte;
1056 pte_t ret_pte;
1058 pte = lookup_alloc_domain_pte(d, mpaddr);
1060 again:
1061 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1062 old_mfn = page_to_mfn(old_page);
1063 old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
1064 if (!pte_present(old_pte)) {
1065 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
1066 __func__, pte_val(old_pte), old_arflags, old_mfn);
1067 return -EINVAL;
1070 new_prot = flags_to_prot(flags);
1071 new_mfn = page_to_mfn(new_page);
1072 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1074 // update pte
1075 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1076 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1077 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1078 goto again;
1081 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
1082 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1083 __func__,
1084 pte_val(old_pte), old_arflags, old_mfn,
1085 pte_val(ret_pte), pte_pfn(ret_pte));
1086 return -EINVAL;
1089 BUG_ON(!pte_mem(old_pte));
1090 BUG_ON(page_get_owner(old_page) != d);
1091 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1092 BUG_ON(old_mfn == new_mfn);
1094 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1096 domain_page_flush(d, mpaddr, old_mfn, new_mfn);
1097 put_page(old_page);
1098 return 0;
1101 static void
1102 zap_domain_page_one(struct domain *d, unsigned long mpaddr, unsigned long mfn)
1104 struct mm_struct *mm = &d->arch.mm;
1105 volatile pte_t *pte;
1106 pte_t old_pte;
1107 struct page_info *page;
1109 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1110 if (pte == NULL)
1111 return;
1112 if (pte_none(*pte))
1113 return;
1115 if (mfn == INVALID_MFN) {
1116 // clear pte
1117 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1118 mfn = pte_pfn(old_pte);
1119 } else {
1120 unsigned long old_arflags;
1121 pte_t new_pte;
1122 pte_t ret_pte;
1124 again:
1125 // memory_exchange() calls guest_physmap_remove_page() with
1126 // a stolen page, i.e. page owner == NULL.
1127 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1128 page_get_owner(mfn_to_page(mfn)) != NULL);
1129 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1130 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1131 new_pte = __pte(0);
1133 // update pte
1134 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1135 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1136 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1137 goto again;
1140 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1141 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1142 __func__,
1143 pte_val(old_pte), old_arflags, mfn,
1144 pte_val(ret_pte), pte_pfn(ret_pte));
1145 return;
1147 BUG_ON(mfn != pte_pfn(ret_pte));
1150 page = mfn_to_page(mfn);
1151 BUG_ON((page->count_info & PGC_count_mask) == 0);
1153 if (page_get_owner(page) == d ||
1154 page_get_owner(page) == NULL) {
1155 // exchange_memory() calls
1156 // steal_page()
1157 // page owner is set to NULL
1158 // guest_physmap_remove_page()
1159 // zap_domain_page_one()
1160 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1161 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1164 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
1166 if (page_get_owner(page) != NULL) {
1167 try_to_clear_PGC_allocate(d, page);
1169 put_page(page);
1172 unsigned long
1173 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1174 unsigned int extent_order)
1176 if (extent_order != 0) {
1177 //XXX
1178 return -ENOSYS;
1181 zap_domain_page_one(d, gpfn << PAGE_SHIFT, INVALID_MFN);
1182 return 0;
1185 unsigned long
1186 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1187 unsigned long flags, domid_t domid)
1189 int error = 0;
1190 struct domain* rd;
1192 /* Not allowed by a domain. */
1193 if (flags & ASSIGN_nocache)
1194 return -EINVAL;
1196 rd = find_domain_by_id(domid);
1197 if (unlikely(rd == NULL)) {
1198 switch (domid) {
1199 case DOMID_XEN:
1200 rd = dom_xen;
1201 break;
1202 case DOMID_IO:
1203 rd = dom_io;
1204 break;
1205 default:
1206 DPRINTK("d 0x%p domid %d "
1207 "pgfn 0x%lx mfn 0x%lx flags 0x%lx domid %d\n",
1208 d, d->domain_id, gpfn, mfn, flags, domid);
1209 return -ESRCH;
1211 BUG_ON(rd == NULL);
1212 get_knownalive_domain(rd);
1215 if (unlikely(rd == d || !mfn_valid(mfn))) {
1216 error = -EINVAL;
1217 goto out1;
1219 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1220 error = -EINVAL;
1221 goto out1;
1223 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1224 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1225 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1226 //don't update p2m table because this page belongs to rd, not d.
1227 out1:
1228 put_domain(rd);
1229 return error;
1232 // grant table host mapping
1233 // mpaddr: host_addr: pseudo physical address
1234 // mfn: frame: machine page frame
1235 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1236 int
1237 create_grant_host_mapping(unsigned long gpaddr,
1238 unsigned long mfn, unsigned int flags)
1240 struct domain* d = current->domain;
1241 struct page_info* page;
1242 int ret;
1244 if (flags & (GNTMAP_device_map |
1245 GNTMAP_application_map | GNTMAP_contains_pte)) {
1246 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1247 return GNTST_general_error;
1250 BUG_ON(!mfn_valid(mfn));
1251 page = mfn_to_page(mfn);
1252 ret = get_page(page, page_get_owner(page));
1253 BUG_ON(ret == 0);
1254 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1255 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1256 assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
1257 ASSIGN_readonly: ASSIGN_writable);
1258 return GNTST_okay;
1261 // grant table host unmapping
1262 int
1263 destroy_grant_host_mapping(unsigned long gpaddr,
1264 unsigned long mfn, unsigned int flags)
1266 struct domain* d = current->domain;
1267 volatile pte_t* pte;
1268 unsigned long cur_arflags;
1269 pte_t cur_pte;
1270 pte_t new_pte;
1271 pte_t old_pte;
1272 struct page_info* page;
1274 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1275 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1276 return GNTST_general_error;
1279 pte = lookup_noalloc_domain_pte(d, gpaddr);
1280 if (pte == NULL) {
1281 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx\n", __func__, gpaddr, mfn);
1282 return GNTST_general_error;
1285 again:
1286 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1287 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1288 if (!pte_present(cur_pte)) {
1289 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1290 __func__, gpaddr, mfn, pte_val(cur_pte));
1291 return GNTST_general_error;
1293 new_pte = __pte(0);
1295 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1296 if (unlikely(!pte_present(old_pte))) {
1297 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1298 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1299 return GNTST_general_error;
1301 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1302 if (pte_pfn(old_pte) == mfn) {
1303 goto again;
1305 DPRINTK("%s gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1306 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1307 return GNTST_general_error;
1309 BUG_ON(pte_pfn(old_pte) != mfn);
1311 domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
1313 page = mfn_to_page(mfn);
1314 BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1315 put_page(page);
1317 return GNTST_okay;
1320 // This heavily depends on the struct page layout.
1321 // gnttab_transfer() calls steal_page() with memflags = 0.
1322 // For a grant table transfer, we must fill the page.
1323 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount.
1324 // For a memory exchange, we don't have to fill the page because
1325 // memory_exchange() does it.
1326 int
1327 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1329 #if 0 /* if big endian */
1330 # error "implement big endian version of steal_page()"
1331 #endif
1332 u32 _d, _nd;
1333 u64 x, nx, y;
1335 if (page_get_owner(page) != d) {
1336 DPRINTK("%s d 0x%p owner 0x%p\n", __func__, d, page_get_owner(page));
1337 return -1;
1340 if (!(memflags & MEMF_no_refcount)) {
1341 unsigned long gpfn;
1342 struct page_info *new;
1343 unsigned long new_mfn;
1344 int ret;
1346 new = alloc_domheap_page(d);
1347 if (new == NULL) {
1348 DPRINTK("alloc_domheap_page() failed\n");
1349 return -1;
1351 // zero out pages for security reasons
1352 clear_page(page_to_virt(new));
1353 // assign_domain_page_cmpxchg_rel() has release semantics
1354 // so smp_mb() isn't needed.
1356 ret = get_page(new, d);
1357 BUG_ON(ret == 0);
1359 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1360 if (gpfn == INVALID_M2P_ENTRY) {
1361 free_domheap_page(new);
1362 return -1;
1364 new_mfn = page_to_mfn(new);
1365 set_gpfn_from_mfn(new_mfn, gpfn);
1366 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1367 // has release semantics.
1369 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1370 ASSIGN_writable);
1371 if (ret < 0) {
1372 DPRINTK("assign_domain_page_cmpxchg_rel failed %d\n", ret);
1373 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1374 free_domheap_page(new);
1375 return -1;
1379 spin_lock(&d->page_alloc_lock);
1381 /*
1382 * The tricky bit: atomically release ownership while there is just one
1383 * benign reference to the page (PGC_allocated). If that reference
1384 * disappears then the deallocation routine will safely spin.
1385 */
1386 _d = pickle_domptr(d);
1387 y = *((u64*)&page->count_info);
1388 do {
1389 x = y;
1390 nx = x & 0xffffffff;
1391 // page->count_info: untouched
1392 // page->u.inuse._domain = 0;
1393 _nd = x >> 32;
1395 if (unlikely(!(memflags & MEMF_no_refcount) &&
1396 ((x & (PGC_count_mask | PGC_allocated)) !=
1397 (1 | PGC_allocated))) ||
1399 // when MEMF_no_refcount, page isn't de-assigned from
1400 // this domain yet. So count_info = 2
1401 unlikely((memflags & MEMF_no_refcount) &&
1402 ((x & (PGC_count_mask | PGC_allocated)) !=
1403 (2 | PGC_allocated))) ||
1405 unlikely(_nd != _d)) {
1406 struct domain* nd = unpickle_domptr(_nd);
1407 if (nd == NULL) {
1408 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1409 "sd=%p 0x%x,"
1410 " caf=%016lx, taf=%" PRtype_info
1411 " memflags 0x%x\n",
1412 (void *) page_to_mfn(page),
1413 d, d->domain_id, _d,
1414 nd, _nd,
1415 x,
1416 page->u.inuse.type_info,
1417 memflags);
1418 } else {
1419 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1420 "sd=%p(%u) 0x%x,"
1421 " caf=%016lx, taf=%" PRtype_info
1422 " memflags 0x%x\n",
1423 (void *) page_to_mfn(page),
1424 d, d->domain_id, _d,
1425 nd, nd->domain_id, _nd,
1426 x,
1427 page->u.inuse.type_info,
1428 memflags);
1430 spin_unlock(&d->page_alloc_lock);
1431 return -1;
1434 y = cmpxchg((u64*)&page->count_info, x, nx);
1435 } while (unlikely(y != x));
1437 /*
1438 * Unlink from 'd'. At least one reference remains (now anonymous), so
1439 * no one else is spinning to try to delete this page from 'd'.
1440 */
1441 if ( !(memflags & MEMF_no_refcount) )
1442 d->tot_pages--;
1443 list_del(&page->list);
1445 spin_unlock(&d->page_alloc_lock);
1446 return 0;
1449 void
1450 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1451 unsigned long mfn)
1453 int ret;
1455 BUG_ON(!mfn_valid(mfn));
1456 ret = get_page(mfn_to_page(mfn), d);
1457 BUG_ON(ret == 0);
1458 set_gpfn_from_mfn(mfn, gpfn);
1459 smp_mb();
1460 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
1462 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1465 void
1466 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1467 unsigned long mfn)
1469 BUG_ON(mfn == 0);//XXX
1470 zap_domain_page_one(d, gpfn << PAGE_SHIFT, mfn);
1473 //XXX sledgehammer.
1474 // A finer-grained range flush should be used instead.
1475 static void
1476 domain_page_flush(struct domain* d, unsigned long mpaddr,
1477 unsigned long old_mfn, unsigned long new_mfn)
1479 if (shadow_mode_enabled(d))
1480 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1482 domain_flush_vtlb_all();
1485 int
1486 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1488 volatile pte_t * pte;
1490 pte = lookup_noalloc_domain_pte(d, mpaddr);
1491 if(pte != NULL && !pte_none(*pte))
1492 return 1;
1493 return 0;
1496 /* Flush cache of domain d. */
1497 void domain_cache_flush (struct domain *d, int sync_only)
1499 struct mm_struct *mm = &d->arch.mm;
1500 pgd_t *pgd = mm->pgd;
1501 unsigned long maddr;
1502 int i,j,k, l;
1503 int nbr_page = 0;
1504 void (*flush_func)(unsigned long start, unsigned long end);
1505 extern void flush_dcache_range (unsigned long, unsigned long);
1507 if (sync_only)
1508 flush_func = &flush_icache_range;
1509 else
1510 flush_func = &flush_dcache_range;
1512 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1513 pud_t *pud;
1514 if (!pgd_present(*pgd))
1515 continue;
1516 pud = pud_offset(pgd, 0);
1517 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1518 pmd_t *pmd;
1519 if (!pud_present(*pud))
1520 continue;
1521 pmd = pmd_offset(pud, 0);
1522 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1523 pte_t *pte;
1524 if (!pmd_present(*pmd))
1525 continue;
1526 pte = pte_offset_map(pmd, 0);
1527 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1528 if (!pte_present(*pte))
1529 continue;
1530 /* Convert PTE to maddr. */
1531 maddr = __va_ul (pte_val(*pte)
1532 & _PAGE_PPN_MASK);
1533 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1534 nbr_page++;
1539 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1542 #ifdef VERBOSE
1543 #define MEM_LOG(_f, _a...) \
1544 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1545 current->domain->domain_id , __LINE__ , ## _a )
1546 #else
1547 #define MEM_LOG(_f, _a...) ((void)0)
1548 #endif
1550 static void free_page_type(struct page_info *page, u32 type)
1554 static int alloc_page_type(struct page_info *page, u32 type)
1556 return 1;
1559 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1561 void *p = alloc_xenheap_pages(order);
1563 memset(p,0,PAGE_SIZE<<order);
1564 return (unsigned long)p;
1567 void __free_pages(struct page_info *page, unsigned int order)
1569 if (order) BUG();
1570 free_xenheap_page(page);
1573 void *pgtable_quicklist_alloc(void)
1575 void *p;
1576 p = alloc_xenheap_pages(0);
1577 if (p)
1578 clear_page(p);
1579 return p;
1582 void pgtable_quicklist_free(void *pgtable_entry)
1584 free_xenheap_page(pgtable_entry);
1587 void put_page_type(struct page_info *page)
1589 u32 nx, x, y = page->u.inuse.type_info;
1591 again:
1592 do {
1593 x = y;
1594 nx = x - 1;
1596 ASSERT((x & PGT_count_mask) != 0);
1598 /*
1599 * The page should always be validated while a reference is held. The
1600 * exception is during domain destruction, when we forcibly invalidate
1601 * page-table pages if we detect a referential loop.
1602 * See domain.c:relinquish_list().
1603 */
1604 ASSERT((x & PGT_validated) ||
1605 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1607 if ( unlikely((nx & PGT_count_mask) == 0) )
1609 /* Record TLB information for flush later. Races are harmless. */
1610 page->tlbflush_timestamp = tlbflush_current_time();
1612 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1613 likely(nx & PGT_validated) )
1615 /*
1616 * Page-table pages must be unvalidated when count is zero. The
1617 * 'free' is safe because the refcnt is non-zero and validated
1618 * bit is clear => other ops will spin or fail.
1619 */
1620 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1621 x & ~PGT_validated)) != x) )
1622 goto again;
1623 /* We cleared the 'valid bit' so we do the clean up. */
1624 free_page_type(page, x);
1625 /* Carry on, but with the 'valid bit' now clear. */
1626 x &= ~PGT_validated;
1627 nx &= ~PGT_validated;
1630 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1631 (PGT_pinned | 1)) &&
1632 ((nx & PGT_type_mask) != PGT_writable_page)) )
1634 /* Page is now only pinned. Make the back pointer mutable again. */
1635 nx |= PGT_va_mutable;
1638 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1642 int get_page_type(struct page_info *page, u32 type)
1644 u32 nx, x, y = page->u.inuse.type_info;
1646 again:
1647 do {
1648 x = y;
1649 nx = x + 1;
1650 if ( unlikely((nx & PGT_count_mask) == 0) )
1652 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1653 return 0;
1655 else if ( unlikely((x & PGT_count_mask) == 0) )
1657 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1659 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1661 /*
1662 * On type change we check to flush stale TLB
1663 * entries. This may be unnecessary (e.g., page
1664 * was GDT/LDT) but those circumstances should be
1665 * very rare.
1666 */
1667 cpumask_t mask =
1668 page_get_owner(page)->domain_dirty_cpumask;
1669 tlbflush_filter(mask, page->tlbflush_timestamp);
1671 if ( unlikely(!cpus_empty(mask)) )
1673 perfc_incrc(need_flush_tlb_flush);
1674 flush_tlb_mask(mask);
1678 /* We lose existing type, back pointer, and validity. */
1679 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1680 nx |= type;
1682 /* No special validation needed for writable pages. */
1683 /* Page tables and GDT/LDT need to be scanned for validity. */
1684 if ( type == PGT_writable_page )
1685 nx |= PGT_validated;
1688 else
1690 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1692 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1694 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1695 ((type & PGT_type_mask) != PGT_l1_page_table) )
1696 MEM_LOG("Bad type (saw %08x != exp %08x) "
1697 "for mfn %016lx (pfn %016lx)",
1698 x, type, page_to_mfn(page),
1699 get_gpfn_from_mfn(page_to_mfn(page)));
1700 return 0;
1702 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1704 /* The va backpointer is mutable, hence we update it. */
1705 nx &= ~PGT_va_mask;
1706 nx |= type; /* we know the actual type is correct */
1708 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1709 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1711 #ifdef CONFIG_X86_PAE
1712 /* We use backptr as extra typing. Cannot be unknown. */
1713 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1714 return 0;
1715 #endif
1716 /* This table is possibly mapped at multiple locations. */
1717 nx &= ~PGT_va_mask;
1718 nx |= PGT_va_unknown;
1721 if ( unlikely(!(x & PGT_validated)) )
1723 /* Someone else is updating validation of this page. Wait... */
1724 while ( (y = page->u.inuse.type_info) == x )
1725 cpu_relax();
1726 goto again;
1730 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
1732 if ( unlikely(!(nx & PGT_validated)) )
1734 /* Try to validate page type; drop the new reference on failure. */
1735 if ( unlikely(!alloc_page_type(page, type)) )
1737 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
1738 ": caf=%08x taf=%" PRtype_info,
1739 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1740 type, page->count_info, page->u.inuse.type_info);
1741 /* No one else can get a reference. We hold the only ref. */
1742 page->u.inuse.type_info = 0;
1743 return 0;
1746 /* No one else is updating simultaneously. */
1747 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1750 return 1;
1753 int memory_is_conventional_ram(paddr_t p)
1755 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
1759 long
1760 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
1762 switch (op) {
1763 case XENMEM_add_to_physmap:
1765 struct xen_add_to_physmap xatp;
1766 unsigned long prev_mfn, mfn = 0, gpfn;
1767 struct domain *d;
1769 if (copy_from_guest(&xatp, arg, 1))
1770 return -EFAULT;
1772 if (xatp.domid == DOMID_SELF) {
1773 d = current->domain;
1774 get_knownalive_domain(d);
1776 else if (!IS_PRIV(current->domain))
1777 return -EPERM;
1778 else if ((d = find_domain_by_id(xatp.domid)) == NULL)
1779 return -ESRCH;
1781 /* This hypercall is used for VT-i domains only. */
1782 if (!VMX_DOMAIN(d->vcpu[0])) {
1783 put_domain(d);
1784 return -ENOSYS;
1787 switch (xatp.space) {
1788 case XENMAPSPACE_shared_info:
1789 if (xatp.idx == 0)
1790 mfn = virt_to_mfn(d->shared_info);
1791 break;
1792 case XENMAPSPACE_grant_table:
1793 if (xatp.idx < NR_GRANT_FRAMES)
1794 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
1795 break;
1796 default:
1797 break;
1800 LOCK_BIGLOCK(d);
1802 /* Remove previously mapped page if it was present. */
1803 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
1804 if (prev_mfn && mfn_valid(prev_mfn)) {
1805 if (IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)))
1806 /* Xen heap frames are simply unhooked from this phys slot. */
1807 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
1808 else
1809 /* Normal domain memory is freed, to avoid leaking memory. */
1810 guest_remove_page(d, xatp.gpfn);
1813 /* Unmap from old location, if any. */
1814 gpfn = get_gpfn_from_mfn(mfn);
1815 if (gpfn != INVALID_M2P_ENTRY)
1816 guest_physmap_remove_page(d, gpfn, mfn);
1818 /* Map at new location. */
1819 guest_physmap_add_page(d, xatp.gpfn, mfn);
1821 UNLOCK_BIGLOCK(d);
1823 put_domain(d);
1825 break;
1828 default:
1829 return -ENOSYS;
1832 return 0;
1835 /*
1836 * Local variables:
1837 * mode: C
1838 * c-set-style: "BSD"
1839 * c-basic-offset: 4
1840 * tab-width: 4
1841 * indent-tabs-mode: nil
1842 * End:
1843 */