ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 10934:2aaad9cbc926

[IA64] enable ioports_deny_access for dom0

Signed-off-by: Tristan Gingold <tristan.gingold@bull.net>
author awilliam@xenbuild.aw
date Tue Aug 08 14:36:21 2006 -0600 (2006-08-08)
parents 6c67ca1e1c1a
children 555eb7402bd8
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * Some structures are accessed concurrently by multiple CPUs.
17 * Here is the list of shared structures and the operations on them which
18 * read/write these structures.
19 *
20 * - struct page_info
21 * This is a Xen global resource. This structure may be accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of a vcpu which isn't current,
34 * so there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush()
40 * i.e. the callers of domain_flush_vtlb_range() and domain_flush_vtlb_all().
41 * These functions invalidate VHPT entries and vcpu->arch.{i, d}tlb.
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set the VHPT entry and vcpu->arch.{i, d}tlb;
49 * the actual work is done by vcpu_itc_no_srlz().
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert a domain pseudo physical address
54 * to a machine address. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * Resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has a weak memory ordering model, so attention must be paid
90 * when accessing shared structures. [SDM vol2 Part II chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: when creating/destroying a domain, we don't need to worry about
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. M2P(P2M(p)) = p and P2M(M2P(m)) = m (p: pseudo physical frame, m: machine frame).
108 * The M2P table and the P2M table must be updated consistently.
109 * Here is the update sequence:
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * the old_mfn value is obtained as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY)
118 *
119 * Here the memory barriers can be achieved with release semantics.
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing a vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Neither a tlb insert (vcpu_itc_no_srlz()) nor a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can update
126 * vcpu->arch.{d, i}tlb, the VHPT and the mTLB atomically, so there is a race here.
127 *
128 * To resolve it, the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert.
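*
* A minimal sketch of the insert-side retry (illustrative only; the field
* names below are schematic, the real logic lives in vcpu_itc_no_srlz() and
* the vTLB purge code):
*
*     do {
*         vtlb->page_flags = pte;   // fill the software tlb entry
*         vtlb->vadr = vaddr;
*         vtlb->p = 1;              // mark the entry present
*         smp_mb();
*     } while (!vtlb->p);           // a concurrent purge cleared p: insert again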
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, while the tlb cache still points to the old page.
145 * This can be detected, much like with a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value differs
148 * from the value it used, it retries.
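*
* A minimal reader-side sketch (illustrative only; p2m_entry_changed() is a
* hypothetical name for the recheck step, the real readers are vcpu_itc_i(),
* vcpu_itc_d(), ia64_do_page_fault() and vcpu_fc()):
*
*     struct p2m_entry entry;
* again:
*     pteval = lookup_domain_mpa(d, mpaddr, &entry);  // remembers the pte pointer/value
*     ... insert the tlb entry derived from pteval ...
*     if (p2m_entry_changed(&entry))                  // the p2m entry was replaced meanwhile
*         goto again;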
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu,
158 * so before reading from or writing to a domain page, the page's reference
159 * count must be incremented.
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do this.
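*
* Sketch of the expected access pattern (illustrative only; use_page() stands
* in for whatever the caller does with the page contents):
*
*     page = mfn_to_page(mfn);
*     if (get_page(page, d)) {      // pin the page against de-assignment
*         use_page(page);
*         put_page(page);
*     }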
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <asm/mm.h>
170 #include <asm/pgalloc.h>
171 #include <asm/vhpt.h>
172 #include <asm/vcpu.h>
173 #include <asm/shadow.h>
174 #include <linux/efi.h>
176 static void domain_page_flush(struct domain* d, unsigned long mpaddr,
177 unsigned long old_mfn, unsigned long new_mfn);
179 extern unsigned long ia64_iobase;
181 static struct domain *dom_xen, *dom_io;
183 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
184 void
185 alloc_dom_xen_and_dom_io(void)
186 {
187 /*
188 * Initialise our DOMID_XEN domain.
189 * Any Xen-heap pages that we will allow to be mapped will have
190 * their domain field set to dom_xen.
191 */
192 dom_xen = alloc_domain(DOMID_XEN);
193 BUG_ON(dom_xen == NULL);
195 /*
196 * Initialise our DOMID_IO domain.
197 * This domain owns I/O pages that are within the range of the page_info
198 * array. Mappings occur at the privilege level of the caller.
199 */
200 dom_io = alloc_domain(DOMID_IO);
201 BUG_ON(dom_io == NULL);
202 }
204 // heavily depends on the struct page_info layout.
205 // if (page_get_owner(page) == d &&
206 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
207 // put_page(page);
208 // }
209 static void
210 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
211 {
212 u32 _d, _nd;
213 u64 x, nx, y;
215 _d = pickle_domptr(d);
216 y = *((u64*)&page->count_info);
217 do {
218 x = y;
219 _nd = x >> 32;
220 nx = x - 1;
221 __clear_bit(_PGC_allocated, &nx);
223 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
224 struct domain* nd = unpickle_domptr(_nd);
225 if (nd == NULL) {
226 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
227 "sd=%p 0x%x,"
228 " caf=%016lx, taf=%" PRtype_info "\n",
229 (void *) page_to_mfn(page),
230 d, d->domain_id, _d,
231 nd, _nd,
232 x,
233 page->u.inuse.type_info);
234 }
235 break;
236 }
238 BUG_ON((nx & PGC_count_mask) < 1);
239 y = cmpxchg((u64*)&page->count_info, x, nx);
240 } while (unlikely(y != x));
241 }
243 static void
244 relinquish_pte(struct domain* d, pte_t* pte)
245 {
246 unsigned long mfn = pte_pfn(*pte);
247 struct page_info* page;
249 // a vmx domain uses bits [58:56] to distinguish an io region from memory.
250 // see vmx_build_physmap_table() in vmx_init.c
251 if (!pte_mem(*pte))
252 return;
254 // domain might map IO space or acpi table pages. check it.
255 if (!mfn_valid(mfn))
256 return;
257 page = mfn_to_page(mfn);
258 // The struct page_info corresponding to mfn may or may not exist depending
259 // on CONFIG_VIRTUAL_FRAME_TABLE.
260 // This check is too simplistic.
261 // The right way is to check whether this page belongs to an io area or to acpi pages.
262 if (page_get_owner(page) == NULL) {
263 BUG_ON(page->count_info != 0);
264 return;
265 }
267 if (page_get_owner(page) == d) {
268 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
269 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
270 }
272 try_to_clear_PGC_allocate(d, page);
273 put_page(page);
274 }
276 static void
277 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
278 {
279 unsigned long i;
280 pte_t* pte = pte_offset_map(pmd, offset);
282 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
283 if (!pte_present(*pte))
284 continue;
286 relinquish_pte(d, pte);
287 }
288 pte_free_kernel(pte_offset_map(pmd, offset));
289 }
291 static void
292 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
293 {
294 unsigned long i;
295 pmd_t *pmd = pmd_offset(pud, offset);
297 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
298 if (!pmd_present(*pmd))
299 continue;
301 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
302 }
303 pmd_free(pmd_offset(pud, offset));
304 }
306 static void
307 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
308 {
309 unsigned long i;
310 pud_t *pud = pud_offset(pgd, offset);
312 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
313 if (!pud_present(*pud))
314 continue;
316 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
317 }
318 pud_free(pud_offset(pgd, offset));
319 }
321 void
322 relinquish_mm(struct domain* d)
323 {
324 struct mm_struct* mm = &d->arch.mm;
325 unsigned long i;
326 pgd_t* pgd;
328 if (mm->pgd == NULL)
329 return;
331 pgd = pgd_offset(mm, 0);
332 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
333 if (!pgd_present(*pgd))
334 continue;
336 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
337 }
338 pgd_free(mm->pgd);
339 mm->pgd = NULL;
340 }
342 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
343 void
344 share_xen_page_with_guest(struct page_info *page,
345 struct domain *d, int readonly)
346 {
347 if ( page_get_owner(page) == d )
348 return;
350 #if 1
351 if (readonly) {
352 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
353 }
354 #endif
356 // alloc_xenheap_pages() doesn't initialize page owner.
357 //BUG_ON(page_get_owner(page) != NULL);
359 spin_lock(&d->page_alloc_lock);
361 #ifndef __ia64__
362 /* The incremented type count pins as writable or read-only. */
363 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
364 page->u.inuse.type_info |= PGT_validated | 1;
365 #endif
367 page_set_owner(page, d);
368 wmb(); /* install valid domain ptr before updating refcnt. */
369 ASSERT(page->count_info == 0);
370 page->count_info |= PGC_allocated | 1;
372 if ( unlikely(d->xenheap_pages++ == 0) )
373 get_knownalive_domain(d);
374 list_add_tail(&page->list, &d->xenpage_list);
376 // grant_table_destroy() releases these pages,
377 // but it doesn't clear their m2p entries, so stale entries might
378 // remain. Such a stale entry is cleared here.
379 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
381 spin_unlock(&d->page_alloc_lock);
382 }
384 void
385 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
386 {
387 share_xen_page_with_guest(page, dom_xen, readonly);
388 }
390 unsigned long
391 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
392 {
393 unsigned long pte;
395 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
396 if (!pte) {
397 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
398 }
399 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
400 }
402 // given a domain virtual address, pte and page size, extract the metaphysical
403 // address, convert the pte to a physical address for the (possibly different)
404 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
405 // PAGE_SIZE!)
406 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
407 struct p2m_entry* entry)
408 {
409 struct domain *d = current->domain;
410 ia64_itir_t itir = {.itir = itir__};
411 u64 mask, mpaddr, pteval2;
412 u64 arflags;
413 u64 arflags2;
414 u64 maflags2;
416 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
418 // FIXME address had better be pre-validated on insert
419 mask = ~itir_mask(itir.itir);
420 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
422 if (itir.ps > PAGE_SHIFT)
423 itir.ps = PAGE_SHIFT;
425 *logps = itir.ps;
427 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
429 /* Check access rights. */
430 arflags = pteval & _PAGE_AR_MASK;
431 arflags2 = pteval2 & _PAGE_AR_MASK;
432 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
433 #if 0
434 DPRINTK("%s:%d "
435 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
436 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
437 __func__, __LINE__,
438 pteval, arflags, address, itir__,
439 pteval2, arflags2, mpaddr);
440 #endif
441 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
442 }
444 /* Check memory attribute. The switch is on the *requested* memory
445 attribute. */
446 maflags2 = pteval2 & _PAGE_MA_MASK;
447 switch (pteval & _PAGE_MA_MASK) {
448 case _PAGE_MA_NAT:
449 /* NaT pages are always accepted! */
450 break;
451 case _PAGE_MA_UC:
452 case _PAGE_MA_UCE:
453 case _PAGE_MA_WC:
454 if (maflags2 == _PAGE_MA_WB) {
455 /* Don't let domains WB-map uncached addresses.
456 This can happen when domU tries to touch i/o
457 port space. Also prevents possible address
458 aliasing issues. */
459 printf("Warning: UC to WB for mpaddr=%lx\n", mpaddr);
460 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
461 }
462 break;
463 case _PAGE_MA_WB:
464 if (maflags2 != _PAGE_MA_WB) {
465 /* Forbid non-coherent access to coherent memory. */
466 panic_domain(NULL, "try to use WB mem attr on "
467 "UC page, mpaddr=%lx\n", mpaddr);
468 }
469 break;
470 default:
471 panic_domain(NULL, "try to use unknown mem attribute\n");
472 }
474 /* If shadow mode is enabled, virtualize dirty bit. */
475 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
476 u64 mp_page = mpaddr >> PAGE_SHIFT;
477 pteval |= _PAGE_VIRT_D;
479 /* If the page is not already dirty, don't set the dirty bit! */
480 if (mp_page < d->arch.shadow_bitmap_size * 8
481 && !test_bit(mp_page, d->arch.shadow_bitmap))
482 pteval &= ~_PAGE_D;
483 }
485 /* Ignore non-addr bits of pteval2 and force PL0->2
486 (PL3 is unaffected) */
487 return (pteval & ~_PAGE_PPN_MASK) |
488 (pteval2 & _PAGE_PPN_MASK) | _PAGE_PL_2;
489 }
491 // given a current domain metaphysical address, return the physical address
492 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
493 struct p2m_entry* entry)
494 {
495 unsigned long pteval;
497 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
498 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
499 }
501 //XXX !xxx_present() should be used instead of !xxx_none()?
502 // __assign_new_domain_page(), assign_new_domain_page() and
503 // assign_new_domain0_page() are used only during domain creation.
504 // Their accesses aren't racy, so the returned pte_t doesn't need a
505 // volatile qualifier.
506 static pte_t*
507 __lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
508 {
509 struct mm_struct *mm = &d->arch.mm;
510 pgd_t *pgd;
511 pud_t *pud;
512 pmd_t *pmd;
514 BUG_ON(mm->pgd == NULL);
515 pgd = pgd_offset(mm, mpaddr);
516 if (pgd_none(*pgd)) {
517 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
518 }
520 pud = pud_offset(pgd, mpaddr);
521 if (pud_none(*pud)) {
522 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
523 }
525 pmd = pmd_offset(pud, mpaddr);
526 if (pmd_none(*pmd)) {
527 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
528 }
530 return pte_offset_map(pmd, mpaddr);
531 }
533 //XXX !xxx_present() should be used instead of !xxx_none()?
534 // pud, pmd and pte pages are zero-cleared when they are allocated.
535 // Their contents must be visible before they are linked into the tree,
536 // so the cmpxchg must have release semantics.
537 static volatile pte_t*
538 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
539 {
540 struct mm_struct *mm = &d->arch.mm;
541 pgd_t *pgd;
542 pud_t *pud;
543 pmd_t *pmd;
545 BUG_ON(mm->pgd == NULL);
547 pgd = pgd_offset(mm, mpaddr);
548 again_pgd:
549 if (unlikely(pgd_none(*pgd))) {
550 pud_t *old_pud = NULL;
551 pud = pud_alloc_one(mm, mpaddr);
552 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
553 pud_free(pud);
554 goto again_pgd;
555 }
556 }
558 pud = pud_offset(pgd, mpaddr);
559 again_pud:
560 if (unlikely(pud_none(*pud))) {
561 pmd_t* old_pmd = NULL;
562 pmd = pmd_alloc_one(mm, mpaddr);
563 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
564 pmd_free(pmd);
565 goto again_pud;
566 }
567 }
569 pmd = pmd_offset(pud, mpaddr);
570 again_pmd:
571 if (unlikely(pmd_none(*pmd))) {
572 pte_t* old_pte = NULL;
573 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
574 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
575 pte_free_kernel(pte);
576 goto again_pmd;
577 }
578 }
580 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
581 }
583 //XXX xxx_none() should be used instead of !xxx_present()?
584 volatile pte_t*
585 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
586 {
587 struct mm_struct *mm = &d->arch.mm;
588 pgd_t *pgd;
589 pud_t *pud;
590 pmd_t *pmd;
592 BUG_ON(mm->pgd == NULL);
593 pgd = pgd_offset(mm, mpaddr);
594 if (unlikely(!pgd_present(*pgd)))
595 return NULL;
597 pud = pud_offset(pgd, mpaddr);
598 if (unlikely(!pud_present(*pud)))
599 return NULL;
601 pmd = pmd_offset(pud, mpaddr);
602 if (unlikely(!pmd_present(*pmd)))
603 return NULL;
605 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
606 }
608 static volatile pte_t*
609 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
610 {
611 struct mm_struct *mm = &d->arch.mm;
612 pgd_t *pgd;
613 pud_t *pud;
614 pmd_t *pmd;
616 BUG_ON(mm->pgd == NULL);
617 pgd = pgd_offset(mm, mpaddr);
618 if (unlikely(pgd_none(*pgd)))
619 return NULL;
621 pud = pud_offset(pgd, mpaddr);
622 if (unlikely(pud_none(*pud)))
623 return NULL;
625 pmd = pmd_offset(pud, mpaddr);
626 if (unlikely(pmd_none(*pmd)))
627 return NULL;
629 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
630 }
632 unsigned long
633 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
634 {
635 volatile pte_t *pte;
637 pte = lookup_noalloc_domain_pte(d, mpaddr);
638 if (pte == NULL)
639 return INVALID_MFN;
641 if (pte_present(*pte))
642 return (pte->pte & _PFN_MASK);
643 else if (VMX_DOMAIN(d->vcpu[0]))
644 return GPFN_INV_MASK;
645 return INVALID_MFN;
646 }
648 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
649 struct p2m_entry* entry)
650 {
651 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
653 if (pte != NULL) {
654 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
655 if (pte_present(tmp_pte)) {
656 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
657 if (entry != NULL)
658 p2m_entry_set(entry, pte, tmp_pte);
659 return pte_val(tmp_pte);
660 } else if (VMX_DOMAIN(d->vcpu[0]))
661 return GPFN_INV_MASK;
662 }
664 printk("%s: d 0x%p id %d current 0x%p id %d\n",
665 __func__, d, d->domain_id, current, current->vcpu_id);
666 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
667 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
668 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
669 else
670 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
671 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
673 if (entry != NULL)
674 p2m_entry_set(entry, NULL, __pte(0));
675 //XXX This is a workaround until emulation of memory accesses to regions
676 // where no memory or device is attached is implemented.
677 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
678 }
680 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
681 #if 1
682 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
683 {
684 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
685 unsigned long imva;
687 pte &= _PAGE_PPN_MASK;
688 imva = (unsigned long) __va(pte);
689 imva |= mpaddr & ~PAGE_MASK;
690 return (void*)imva;
691 }
692 #else
693 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
694 {
695 unsigned long imva = __gpa_to_mpa(d, mpaddr);
697 return (void *)__va(imva);
698 }
699 #endif
701 /* Allocate a new page for domain and map it to the specified metaphysical
702 address. */
703 static struct page_info *
704 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
705 {
706 struct page_info *p;
707 unsigned long maddr;
708 int ret;
710 BUG_ON(!pte_none(*pte));
712 p = alloc_domheap_page(d);
713 if (unlikely(!p)) {
714 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
715 return(p);
716 }
718 // zero out pages for security reasons
719 clear_page(page_to_virt(p));
720 maddr = page_to_maddr (p);
721 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
722 && maddr < __get_cpu_var(vhpt_pend))) {
723 /* FIXME: how can this happen ?
724 vhpt is allocated by alloc_domheap_page. */
725 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
726 maddr);
727 }
729 ret = get_page(p, d);
730 BUG_ON(ret == 0);
731 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
732 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
733 // because set_pte_rel() has release semantics
734 set_pte_rel(pte,
735 pfn_pte(maddr >> PAGE_SHIFT,
736 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
738 smp_mb();
739 return p;
740 }
742 struct page_info *
743 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
744 {
745 pte_t *pte = __lookup_alloc_domain_pte(d, mpaddr);
747 if (!pte_none(*pte))
748 return NULL;
750 return __assign_new_domain_page(d, mpaddr, pte);
751 }
753 void
754 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
755 {
756 pte_t *pte;
758 BUG_ON(d != dom0);
759 pte = __lookup_alloc_domain_pte(d, mpaddr);
760 if (pte_none(*pte)) {
761 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
762 if (p == NULL) {
763 panic("%s: can't allocate page for dom0", __func__);
764 }
765 }
766 }
768 static unsigned long
769 flags_to_prot (unsigned long flags)
770 {
771 unsigned long res = _PAGE_PL_2 | __DIRTY_BITS;
773 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
774 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
776 return res;
777 }
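/* Illustration of the mapping above (derived from flags_to_prot() itself):
   flags_to_prot(ASSIGN_readonly | ASSIGN_nocache) yields
   _PAGE_PL_2 | __DIRTY_BITS | _PAGE_AR_R | _PAGE_MA_UC,
   i.e. a privilege-level-2, read-only, uncacheable mapping with the
   accessed/dirty bits preset. */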
779 /* map a physical address to the specified metaphysical addr */
780 // flags: currently only ASSIGN_readonly, ASSIGN_nocache
781 // This is called by assign_domain_mmio_page(),
782 // so accesses to the pte are racy.
783 void
784 __assign_domain_page(struct domain *d,
785 unsigned long mpaddr, unsigned long physaddr,
786 unsigned long flags)
787 {
788 volatile pte_t *pte;
789 pte_t old_pte;
790 pte_t new_pte;
791 pte_t ret_pte;
792 unsigned long prot = flags_to_prot(flags);
794 pte = lookup_alloc_domain_pte(d, mpaddr);
796 old_pte = __pte(0);
797 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
798 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
799 if (pte_val(ret_pte) == pte_val(old_pte))
800 smp_mb();
801 }
803 /* get_page() and map a physical address to the specified metaphysical addr */
804 void
805 assign_domain_page(struct domain *d,
806 unsigned long mpaddr, unsigned long physaddr)
807 {
808 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
809 int ret;
811 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
812 ret = get_page(page, d);
813 BUG_ON(ret == 0);
814 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
815 // because __assign_domain_page() uses set_pte_rel() which has
816 // release semantics, smp_mb() isn't needed.
817 __assign_domain_page(d, mpaddr, physaddr, ASSIGN_writable);
818 }
820 int
821 ioports_permit_access(struct domain *d, unsigned long fp, unsigned long lp)
822 {
823 int ret;
824 unsigned long off;
825 unsigned long fp_offset;
826 unsigned long lp_offset;
828 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
829 if (ret != 0)
830 return ret;
832 /* Domain 0 doesn't virtualize the IO port space. */
833 if (d == dom0)
834 return 0;
836 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
837 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
839 for (off = fp_offset; off <= lp_offset; off += PAGE_SIZE)
840 __assign_domain_page(d, IO_PORTS_PADDR + off,
841 __pa(ia64_iobase) + off, ASSIGN_nocache);
843 return 0;
844 }
846 static int
847 ioports_has_allowed(struct domain *d, unsigned long fp, unsigned long lp)
848 {
849 unsigned long i;
850 for (i = fp; i < lp; i++)
851 if (rangeset_contains_singleton(d->arch.ioport_caps, i))
852 return 1;
853 return 0;
854 }
856 int
857 ioports_deny_access(struct domain *d, unsigned long fp, unsigned long lp)
858 {
859 int ret;
860 struct mm_struct *mm = &d->arch.mm;
861 unsigned long off;
862 unsigned long io_ports_base;
863 unsigned long fp_offset;
864 unsigned long lp_offset;
866 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
867 if (ret != 0)
868 return ret;
869 if (d == dom0)
870 io_ports_base = __pa(ia64_iobase);
871 else
872 io_ports_base = IO_PORTS_PADDR;
874 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
875 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
877 for (off = fp_offset; off < lp_offset; off += PAGE_SIZE) {
878 unsigned long mpaddr = io_ports_base + off;
879 unsigned long port;
880 volatile pte_t *pte;
881 pte_t old_pte;
883 port = IO_SPACE_SPARSE_DECODING (off);
884 if (port < fp || port + IO_SPACE_SPARSE_PORTS_PER_PAGE > lp) {
885 /* Maybe this covers an allowed port. */
886 if (ioports_has_allowed(d, port,
887 port + IO_SPACE_SPARSE_PORTS_PER_PAGE))
888 continue;
889 }
891 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
892 BUG_ON(pte == NULL);
893 BUG_ON(pte_none(*pte));
895 // clear pte
896 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
897 }
898 domain_flush_vtlb_all();
899 return 0;
900 }
902 static void
903 assign_domain_same_page(struct domain *d,
904 unsigned long mpaddr, unsigned long size,
905 unsigned long flags)
906 {
907 //XXX optimization
908 unsigned long end = PAGE_ALIGN(mpaddr + size);
909 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
910 __assign_domain_page(d, mpaddr, mpaddr, flags);
911 }
912 }
914 int
915 efi_mmio(unsigned long physaddr, unsigned long size)
916 {
917 void *efi_map_start, *efi_map_end;
918 u64 efi_desc_size;
919 void* p;
921 efi_map_start = __va(ia64_boot_param->efi_memmap);
922 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
923 efi_desc_size = ia64_boot_param->efi_memdesc_size;
925 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
926 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
927 unsigned long start = md->phys_addr;
928 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
930 if (start <= physaddr && physaddr < end) {
931 if ((physaddr + size) > end) {
932 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
933 __func__, __LINE__, physaddr, size);
934 return 0;
935 }
937 // for io space
938 if (md->type == EFI_MEMORY_MAPPED_IO ||
939 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
940 return 1;
941 }
943 // for runtime
944 // see efi_enter_virtual_mode(void)
945 // in linux/arch/ia64/kernel/efi.c
946 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
947 !(md->attribute & EFI_MEMORY_WB)) {
948 return 1;
949 }
951 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
952 __func__, __LINE__, physaddr, size);
953 return 0;
954 }
956 if (physaddr < start) {
957 break;
958 }
959 }
961 return 1;
962 }
964 unsigned long
965 assign_domain_mmio_page(struct domain *d,
966 unsigned long mpaddr, unsigned long size)
967 {
968 if (size == 0) {
969 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
970 __func__, d, mpaddr, size);
971 }
972 if (!efi_mmio(mpaddr, size)) {
973 DPRINTK("%s:%d domain %p mpaddr 0x%lx size = 0x%lx\n",
974 __func__, __LINE__, d, mpaddr, size);
975 return -EINVAL;
976 }
977 assign_domain_same_page(d, mpaddr, size, ASSIGN_writable | ASSIGN_nocache);
978 return mpaddr;
979 }
981 unsigned long
982 assign_domain_mach_page(struct domain *d,
983 unsigned long mpaddr, unsigned long size,
984 unsigned long flags)
985 {
986 assign_domain_same_page(d, mpaddr, size, flags);
987 return mpaddr;
988 }
990 // The caller must get_page(mfn_to_page(mfn)) before calling.
991 // The caller must call set_gpfn_from_mfn() before calling if necessary;
992 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
993 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
994 // flags: currently only ASSIGN_readonly
995 static void
996 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
997 unsigned long mfn, unsigned long flags)
998 {
999 struct mm_struct *mm = &d->arch.mm;
1000 volatile pte_t* pte;
1001 pte_t old_pte;
1002 pte_t npte;
1003 unsigned long prot = flags_to_prot(flags);
1005 pte = lookup_alloc_domain_pte(d, mpaddr);
1007 // update pte
1008 npte = pfn_pte(mfn, __pgprot(prot));
1009 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1010 if (pte_mem(old_pte)) {
1011 unsigned long old_mfn = pte_pfn(old_pte);
1013 // The mfn == old_mfn case can happen when a domain maps a granted page
1014 // twice with the same pseudo physical address.
1015 // It's nonsense, but allowed.
1016 // __gnttab_map_grant_ref()
1017 // => create_host_mapping()
1018 // => assign_domain_page_replace()
1019 if (mfn != old_mfn) {
1020 struct page_info* old_page = mfn_to_page(old_mfn);
1022 if (page_get_owner(old_page) == d ||
1023 page_get_owner(old_page) == NULL) {
1024 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1025 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1028 domain_page_flush(d, mpaddr, old_mfn, mfn);
1030 try_to_clear_PGC_allocate(d, old_page);
1031 put_page(old_page);
1036 // The caller must get_page(new_page) before calling.
1037 // Only steal_page() calls this function.
1038 static int
1039 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1040 struct page_info* old_page,
1041 struct page_info* new_page,
1042 unsigned long flags)
1044 struct mm_struct *mm = &d->arch.mm;
1045 volatile pte_t* pte;
1046 unsigned long old_mfn;
1047 unsigned long old_arflags;
1048 pte_t old_pte;
1049 unsigned long new_mfn;
1050 unsigned long new_prot;
1051 pte_t new_pte;
1052 pte_t ret_pte;
1054 pte = lookup_alloc_domain_pte(d, mpaddr);
1056 again:
1057 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1058 old_mfn = page_to_mfn(old_page);
1059 old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
1060 if (!pte_present(old_pte)) {
1061 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
1062 __func__, pte_val(old_pte), old_arflags, old_mfn);
1063 return -EINVAL;
1066 new_prot = flags_to_prot(flags);
1067 new_mfn = page_to_mfn(new_page);
1068 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1070 // update pte
1071 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1072 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1073 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1074 goto again;
1077 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
1078 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1079 __func__,
1080 pte_val(old_pte), old_arflags, old_mfn,
1081 pte_val(ret_pte), pte_pfn(ret_pte));
1082 return -EINVAL;
1085 BUG_ON(!pte_mem(old_pte));
1086 BUG_ON(page_get_owner(old_page) != d);
1087 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1088 BUG_ON(old_mfn == new_mfn);
1090 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1092 domain_page_flush(d, mpaddr, old_mfn, new_mfn);
1093 put_page(old_page);
1094 return 0;
1097 static void
1098 zap_domain_page_one(struct domain *d, unsigned long mpaddr, unsigned long mfn)
1100 struct mm_struct *mm = &d->arch.mm;
1101 volatile pte_t *pte;
1102 pte_t old_pte;
1103 struct page_info *page;
1105 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1106 if (pte == NULL)
1107 return;
1108 if (pte_none(*pte))
1109 return;
1111 if (mfn == INVALID_MFN) {
1112 // clear pte
1113 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1114 mfn = pte_pfn(old_pte);
1115 } else {
1116 unsigned long old_arflags;
1117 pte_t new_pte;
1118 pte_t ret_pte;
1120 again:
1121 // memory_exchange() calls guest_physmap_remove_page() with
1122 // a stolen page, i.e. page owner == NULL.
1123 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1124 page_get_owner(mfn_to_page(mfn)) != NULL);
1125 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1126 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1127 new_pte = __pte(0);
1129 // update pte
1130 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1131 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1132 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1133 goto again;
1136 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1137 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1138 __func__,
1139 pte_val(old_pte), old_arflags, mfn,
1140 pte_val(ret_pte), pte_pfn(ret_pte));
1141 return;
1143 BUG_ON(mfn != pte_pfn(ret_pte));
1146 page = mfn_to_page(mfn);
1147 BUG_ON((page->count_info & PGC_count_mask) == 0);
1149 if (page_get_owner(page) == d ||
1150 page_get_owner(page) == NULL) {
1151 // exchange_memory() calls
1152 // steal_page()
1153 // page owner is set to NULL
1154 // guest_physmap_remove_page()
1155 // zap_domain_page_one()
1156 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1157 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1160 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
1162 if (page_get_owner(page) != NULL) {
1163 try_to_clear_PGC_allocate(d, page);
1165 put_page(page);
1168 unsigned long
1169 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1170 unsigned int extent_order)
1172 if (extent_order != 0) {
1173 //XXX
1174 return -ENOSYS;
1177 zap_domain_page_one(d, gpfn << PAGE_SHIFT, INVALID_MFN);
1178 return 0;
1181 unsigned long
1182 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1183 unsigned long flags, domid_t domid)
1185 int error = 0;
1186 struct domain* rd;
1188 /* Not allowed by a domain. */
1189 if (flags & ASSIGN_nocache)
1190 return -EINVAL;
1192 rd = find_domain_by_id(domid);
1193 if (unlikely(rd == NULL)) {
1194 switch (domid) {
1195 case DOMID_XEN:
1196 rd = dom_xen;
1197 break;
1198 case DOMID_IO:
1199 rd = dom_io;
1200 break;
1201 default:
1202 DPRINTK("d 0x%p domid %d "
1203 "pgfn 0x%lx mfn 0x%lx flags 0x%lx domid %d\n",
1204 d, d->domain_id, gpfn, mfn, flags, domid);
1205 return -ESRCH;
1207 BUG_ON(rd == NULL);
1208 get_knownalive_domain(rd);
1211 if (unlikely(rd == d || !mfn_valid(mfn))) {
1212 error = -EINVAL;
1213 goto out1;
1215 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1216 error = -EINVAL;
1217 goto out1;
1219 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1220 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1221 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1222 // don't update the M2P table because this page belongs to rd, not d.
1223 out1:
1224 put_domain(rd);
1225 return error;
1228 // grant table host mapping
1229 // mpaddr: host_addr: pseudo physical address
1230 // mfn: frame: machine page frame
1231 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1232 int
1233 create_grant_host_mapping(unsigned long gpaddr,
1234 unsigned long mfn, unsigned int flags)
1236 struct domain* d = current->domain;
1237 struct page_info* page;
1238 int ret;
1240 if (flags & (GNTMAP_device_map |
1241 GNTMAP_application_map | GNTMAP_contains_pte)) {
1242 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1243 return GNTST_general_error;
1246 BUG_ON(!mfn_valid(mfn));
1247 page = mfn_to_page(mfn);
1248 ret = get_page(page, page_get_owner(page));
1249 BUG_ON(ret == 0);
1250 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1251 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1252 assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
1253 ASSIGN_readonly: ASSIGN_writable);
1254 return GNTST_okay;
1257 // grant table host unmapping
1258 int
1259 destroy_grant_host_mapping(unsigned long gpaddr,
1260 unsigned long mfn, unsigned int flags)
1262 struct domain* d = current->domain;
1263 volatile pte_t* pte;
1264 unsigned long cur_arflags;
1265 pte_t cur_pte;
1266 pte_t new_pte;
1267 pte_t old_pte;
1268 struct page_info* page;
1270 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1271 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1272 return GNTST_general_error;
1275 pte = lookup_noalloc_domain_pte(d, gpaddr);
1276 if (pte == NULL) {
1277 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx\n", __func__, gpaddr, mfn);
1278 return GNTST_general_error;
1281 again:
1282 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1283 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1284 if (!pte_present(cur_pte)) {
1285 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1286 __func__, gpaddr, mfn, pte_val(cur_pte));
1287 return GNTST_general_error;
1289 new_pte = __pte(0);
1291 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1292 if (unlikely(!pte_present(old_pte))) {
1293 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1294 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1295 return GNTST_general_error;
1297 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1298 if (pte_pfn(old_pte) == mfn) {
1299 goto again;
1301 DPRINTK("%s gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1302 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1303 return GNTST_general_error;
1305 BUG_ON(pte_pfn(old_pte) != mfn);
1307 domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
1309 page = mfn_to_page(mfn);
1310 BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1311 put_page(page);
1313 return GNTST_okay;
1316 // heavily depends on the struct page layout.
1317 // gnttab_transfer() calls steal_page() with memflags = 0.
1318 // For a grant table transfer, we must fill the resulting physmap hole with a new page.
1319 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount.
1320 // For memory exchange, we don't have to fill the hole because
1321 // memory_exchange() does it.
1322 int
1323 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1325 #if 0 /* if big endian */
1326 # error "implement big endian version of steal_page()"
1327 #endif
1328 u32 _d, _nd;
1329 u64 x, nx, y;
1331 if (page_get_owner(page) != d) {
1332 DPRINTK("%s d 0x%p owner 0x%p\n", __func__, d, page_get_owner(page));
1333 return -1;
1336 if (!(memflags & MEMF_no_refcount)) {
1337 unsigned long gpfn;
1338 struct page_info *new;
1339 unsigned long new_mfn;
1340 int ret;
1342 new = alloc_domheap_page(d);
1343 if (new == NULL) {
1344 DPRINTK("alloc_domheap_page() failed\n");
1345 return -1;
1347 // zero out pages for security reasons
1348 clear_page(page_to_virt(new));
1349 // assign_domain_page_cmpxchg_rel() has release semantics
1350 // so smp_mb() isn't needed.
1352 ret = get_page(new, d);
1353 BUG_ON(ret == 0);
1355 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1356 if (gpfn == INVALID_M2P_ENTRY) {
1357 free_domheap_page(new);
1358 return -1;
1360 new_mfn = page_to_mfn(new);
1361 set_gpfn_from_mfn(new_mfn, gpfn);
1362 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1363 // has release semantics.
1365 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1366 ASSIGN_writable);
1367 if (ret < 0) {
1368 DPRINTK("assign_domain_page_cmpxchg_rel failed %d\n", ret);
1369 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1370 free_domheap_page(new);
1371 return -1;
1375 spin_lock(&d->page_alloc_lock);
1377 /*
1378 * The tricky bit: atomically release ownership while there is just one
1379 * benign reference to the page (PGC_allocated). If that reference
1380 * disappears then the deallocation routine will safely spin.
1381 */
1382 _d = pickle_domptr(d);
1383 y = *((u64*)&page->count_info);
1384 do {
1385 x = y;
1386 nx = x & 0xffffffff;
1387 // page->count_info: untouched
1388 // page->u.inuse._domain = 0;
1389 _nd = x >> 32;
1391 if (unlikely(!(memflags & MEMF_no_refcount) &&
1392 ((x & (PGC_count_mask | PGC_allocated)) !=
1393 (1 | PGC_allocated))) ||
1395 // when MEMF_no_refcount, page isn't de-assigned from
1396 // this domain yet. So count_info = 2
1397 unlikely((memflags & MEMF_no_refcount) &&
1398 ((x & (PGC_count_mask | PGC_allocated)) !=
1399 (2 | PGC_allocated))) ||
1401 unlikely(_nd != _d)) {
1402 struct domain* nd = unpickle_domptr(_nd);
1403 if (nd == NULL) {
1404 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1405 "sd=%p 0x%x,"
1406 " caf=%016lx, taf=%" PRtype_info
1407 " memflags 0x%x\n",
1408 (void *) page_to_mfn(page),
1409 d, d->domain_id, _d,
1410 nd, _nd,
1411 x,
1412 page->u.inuse.type_info,
1413 memflags);
1414 } else {
1415 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1416 "sd=%p(%u) 0x%x,"
1417 " caf=%016lx, taf=%" PRtype_info
1418 " memflags 0x%x\n",
1419 (void *) page_to_mfn(page),
1420 d, d->domain_id, _d,
1421 nd, nd->domain_id, _nd,
1422 x,
1423 page->u.inuse.type_info,
1424 memflags);
1426 spin_unlock(&d->page_alloc_lock);
1427 return -1;
1430 y = cmpxchg((u64*)&page->count_info, x, nx);
1431 } while (unlikely(y != x));
1433 /*
1434 * Unlink from 'd'. At least one reference remains (now anonymous), so
1435 * no one else is spinning to try to delete this page from 'd'.
1436 */
1437 if ( !(memflags & MEMF_no_refcount) )
1438 d->tot_pages--;
1439 list_del(&page->list);
1441 spin_unlock(&d->page_alloc_lock);
1442 return 0;
1445 void
1446 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1447 unsigned long mfn)
1449 int ret;
1451 BUG_ON(!mfn_valid(mfn));
1452 ret = get_page(mfn_to_page(mfn), d);
1453 BUG_ON(ret == 0);
1454 set_gpfn_from_mfn(mfn, gpfn);
1455 smp_mb();
1456 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
1458 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1461 void
1462 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1463 unsigned long mfn)
1465 BUG_ON(mfn == 0);//XXX
1466 zap_domain_page_one(d, gpfn << PAGE_SHIFT, mfn);
1469 //XXX sledgehammer approach.
1470 // Should flush a finer range instead.
1471 static void
1472 domain_page_flush(struct domain* d, unsigned long mpaddr,
1473 unsigned long old_mfn, unsigned long new_mfn)
1475 if (shadow_mode_enabled(d))
1476 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1478 domain_flush_vtlb_all();
1481 int
1482 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1484 volatile pte_t * pte;
1486 pte = lookup_noalloc_domain_pte(d, mpaddr);
1487 if(pte != NULL && !pte_none(*pte))
1488 return 1;
1489 return 0;
1492 /* Flush cache of domain d. */
1493 void domain_cache_flush (struct domain *d, int sync_only)
1495 struct mm_struct *mm = &d->arch.mm;
1496 pgd_t *pgd = mm->pgd;
1497 unsigned long maddr;
1498 int i,j,k, l;
1499 int nbr_page = 0;
1500 void (*flush_func)(unsigned long start, unsigned long end);
1501 extern void flush_dcache_range (unsigned long, unsigned long);
1503 if (sync_only)
1504 flush_func = &flush_icache_range;
1505 else
1506 flush_func = &flush_dcache_range;
1508 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1509 pud_t *pud;
1510 if (!pgd_present(*pgd))
1511 continue;
1512 pud = pud_offset(pgd, 0);
1513 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1514 pmd_t *pmd;
1515 if (!pud_present(*pud))
1516 continue;
1517 pmd = pmd_offset(pud, 0);
1518 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1519 pte_t *pte;
1520 if (!pmd_present(*pmd))
1521 continue;
1522 pte = pte_offset_map(pmd, 0);
1523 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1524 if (!pte_present(*pte))
1525 continue;
1526 /* Convert PTE to maddr. */
1527 maddr = __va_ul (pte_val(*pte)
1528 & _PAGE_PPN_MASK);
1529 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1530 nbr_page++;
1535 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1538 #ifdef VERBOSE
1539 #define MEM_LOG(_f, _a...) \
1540 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1541 current->domain->domain_id , __LINE__ , ## _a )
1542 #else
1543 #define MEM_LOG(_f, _a...) ((void)0)
1544 #endif
1546 static void free_page_type(struct page_info *page, u32 type)
1550 static int alloc_page_type(struct page_info *page, u32 type)
1552 return 1;
1555 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1557 void *p = alloc_xenheap_pages(order);
1559 memset(p,0,PAGE_SIZE<<order);
1560 return (unsigned long)p;
1563 void __free_pages(struct page_info *page, unsigned int order)
1565 if (order) BUG();
1566 free_xenheap_page(page);
1569 void *pgtable_quicklist_alloc(void)
1571 void *p;
1572 p = alloc_xenheap_pages(0);
1573 if (p)
1574 clear_page(p);
1575 return p;
1578 void pgtable_quicklist_free(void *pgtable_entry)
1580 free_xenheap_page(pgtable_entry);
1583 void cleanup_writable_pagetable(struct domain *d)
1585 return;
1588 void put_page_type(struct page_info *page)
1590 u32 nx, x, y = page->u.inuse.type_info;
1592 again:
1593 do {
1594 x = y;
1595 nx = x - 1;
1597 ASSERT((x & PGT_count_mask) != 0);
1599 /*
1600 * The page should always be validated while a reference is held. The
1601 * exception is during domain destruction, when we forcibly invalidate
1602 * page-table pages if we detect a referential loop.
1603 * See domain.c:relinquish_list().
1604 */
1605 ASSERT((x & PGT_validated) ||
1606 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1608 if ( unlikely((nx & PGT_count_mask) == 0) )
1610 /* Record TLB information for flush later. Races are harmless. */
1611 page->tlbflush_timestamp = tlbflush_current_time();
1613 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1614 likely(nx & PGT_validated) )
1616 /*
1617 * Page-table pages must be unvalidated when count is zero. The
1618 * 'free' is safe because the refcnt is non-zero and validated
1619 * bit is clear => other ops will spin or fail.
1620 */
1621 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1622 x & ~PGT_validated)) != x) )
1623 goto again;
1624 /* We cleared the 'valid bit' so we do the clean up. */
1625 free_page_type(page, x);
1626 /* Carry on, but with the 'valid bit' now clear. */
1627 x &= ~PGT_validated;
1628 nx &= ~PGT_validated;
1631 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1632 (PGT_pinned | 1)) &&
1633 ((nx & PGT_type_mask) != PGT_writable_page)) )
1635 /* Page is now only pinned. Make the back pointer mutable again. */
1636 nx |= PGT_va_mutable;
1639 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1643 int get_page_type(struct page_info *page, u32 type)
1645 u32 nx, x, y = page->u.inuse.type_info;
1647 again:
1648 do {
1649 x = y;
1650 nx = x + 1;
1651 if ( unlikely((nx & PGT_count_mask) == 0) )
1653 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1654 return 0;
1656 else if ( unlikely((x & PGT_count_mask) == 0) )
1658 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1660 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1662 /*
1663 * On type change we check to flush stale TLB
1664 * entries. This may be unnecessary (e.g., page
1665 * was GDT/LDT) but those circumstances should be
1666 * very rare.
1667 */
1668 cpumask_t mask =
1669 page_get_owner(page)->domain_dirty_cpumask;
1670 tlbflush_filter(mask, page->tlbflush_timestamp);
1672 if ( unlikely(!cpus_empty(mask)) )
1674 perfc_incrc(need_flush_tlb_flush);
1675 flush_tlb_mask(mask);
1679 /* We lose existing type, back pointer, and validity. */
1680 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1681 nx |= type;
1683 /* No special validation needed for writable pages. */
1684 /* Page tables and GDT/LDT need to be scanned for validity. */
1685 if ( type == PGT_writable_page )
1686 nx |= PGT_validated;
1689 else
1691 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1693 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1695 if ( current->domain == page_get_owner(page) )
1697 /*
1698 * This ensures functions like set_gdt() see up-to-date
1699 * type info without needing to clean up writable p.t.
1700 * state on the fast path.
1701 */
1702 LOCK_BIGLOCK(current->domain);
1703 cleanup_writable_pagetable(current->domain);
1704 y = page->u.inuse.type_info;
1705 UNLOCK_BIGLOCK(current->domain);
1706 /* Can we make progress now? */
1707 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1708 ((y & PGT_count_mask) == 0) )
1709 goto again;
1711 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1712 ((type & PGT_type_mask) != PGT_l1_page_table) )
1713 MEM_LOG("Bad type (saw %08x != exp %08x) "
1714 "for mfn %016lx (pfn %016lx)",
1715 x, type, page_to_mfn(page),
1716 get_gpfn_from_mfn(page_to_mfn(page)));
1717 return 0;
1719 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1721 /* The va backpointer is mutable, hence we update it. */
1722 nx &= ~PGT_va_mask;
1723 nx |= type; /* we know the actual type is correct */
1725 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1726 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1728 #ifdef CONFIG_X86_PAE
1729 /* We use backptr as extra typing. Cannot be unknown. */
1730 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1731 return 0;
1732 #endif
1733 /* This table is possibly mapped at multiple locations. */
1734 nx &= ~PGT_va_mask;
1735 nx |= PGT_va_unknown;
1738 if ( unlikely(!(x & PGT_validated)) )
1740 /* Someone else is updating validation of this page. Wait... */
1741 while ( (y = page->u.inuse.type_info) == x )
1742 cpu_relax();
1743 goto again;
1747 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
1749 if ( unlikely(!(nx & PGT_validated)) )
1751 /* Try to validate page type; drop the new reference on failure. */
1752 if ( unlikely(!alloc_page_type(page, type)) )
1754 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
1755 ": caf=%08x taf=%" PRtype_info,
1756 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1757 type, page->count_info, page->u.inuse.type_info);
1758 /* No one else can get a reference. We hold the only ref. */
1759 page->u.inuse.type_info = 0;
1760 return 0;
1763 /* No one else is updating simultaneously. */
1764 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1767 return 1;
1770 /*
1771 * Local variables:
1772 * mode: C
1773 * c-set-style: "BSD"
1774 * c-basic-offset: 4
1775 * tab-width: 4
1776 * indent-tabs-mode: nil
1777 * End:
1778 */