ia64/xen-unstable

xen/arch/ia64/xen/mm.c @ 10816:7be1cfe8345b

[IA64] iomem support for driver domains.

First steps in the hypervisor to support driver domains.

IO port capabilities added (not yet used).
IO memory capabilities checked.
ASSIGN_nocache flag added.
Memory attributes checked.

Signed-off-by: Tristan Gingold <tristan.gingold@bull.net>
author awilliam@xenbuild.aw
date Thu Jul 27 09:47:10 2006 -0600 (2006-07-27)
parents 86e5d8458c08
children 199d53efd029
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * Some structures are accessed by several CPUs concurrently.
17 * Here is the list of shared structures and of the operations
18 * which read/write them.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure may be accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of a vcpu which isn't current,
34 * so there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert a domain pseudo physical address
54 * to a machine address. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * The P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. P2M(M2P(m)) = m and M2P(P2M(p)) = p.
108 * The M2P table and the P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here memory barrier can be achieved by release semantics.
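* As an illustration only (a sketch using helpers that already exist in
* this file; steal_page() together with assign_domain_page_cmpxchg_rel()
* below follows this pattern), the sequence looks like:
*
*     set_gpfn_from_mfn(new_mfn, gpfn);                 // new m2p entry
*     ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
*                       // release semantics supply the memory barrier
*     old_mfn = pte_pfn(ret_pte);                       // old p2m value
*     set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);    // zap stale m2p entry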
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing a vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. A tlb insert (vcpu_itc_no_srlz()) or a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically, so there is a race here.
127 *
128 * The vcpu->arch.{d, i}tlb.p bit is used to handle this:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert.
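* Roughly, the insert side behaves like this sketch (not a quote of
* vcpu_itc_no_srlz(), just the shape of the retry described above):
*
*     do {
*         install the entry in vcpu->arch.{d,i}tlb and the VHPT;
*     } while (the entry's p bit was cleared by a concurrent purge);
*
* so a purge that races with an insert effectively happens after it.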
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, while the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read, and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value is different
148 * from the remembered p2m entry value, it retries.
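* Sketch of that check (illustrative only; the real readers use
* struct p2m_entry filled in by lookup_domain_mpa()):
*
*     again:
*         pteval = lookup_domain_mpa(d, mpaddr, &entry); // remember value
*         ... insert the tlb entry based on pteval ...
*         if (the p2m entry re-read via &entry differs from the remembered value)
*             goto again;
*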
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented.
160 * This is done by vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time().
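*
* I.e. those readers follow this shape (a sketch only):
*
*     page = mfn_to_page(mfn);
*     if (get_page(page, d) == 0)   // page was de-assigned in the meantime
*         fail or retry;
*     ... access the page contents ...
*     put_page(page);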
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <asm/mm.h>
170 #include <asm/pgalloc.h>
171 #include <asm/vhpt.h>
172 #include <asm/vcpu.h>
173 #include <asm/shadow.h>
174 #include <linux/efi.h>
176 #ifndef CONFIG_XEN_IA64_DOM0_VP
177 #define CONFIG_DOMAIN0_CONTIGUOUS
178 #else
179 static void domain_page_flush(struct domain* d, unsigned long mpaddr,
180 unsigned long old_mfn, unsigned long new_mfn);
181 #endif
183 static struct domain *dom_xen, *dom_io;
185 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
186 void
187 alloc_dom_xen_and_dom_io(void)
188 {
189 /*
190 * Initialise our DOMID_XEN domain.
191 * Any Xen-heap pages that we will allow to be mapped will have
192 * their domain field set to dom_xen.
193 */
194 dom_xen = alloc_domain(DOMID_XEN);
195 BUG_ON(dom_xen == NULL);
197 /*
198 * Initialise our DOMID_IO domain.
199 * This domain owns I/O pages that are within the range of the page_info
200 * array. Mappings occur at the priv of the caller.
201 */
202 dom_io = alloc_domain(DOMID_IO);
203 BUG_ON(dom_io == NULL);
204 }
206 // heavily depends on the struct page_info layout.
207 // if (page_get_owner(page) == d &&
208 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
209 // put_page(page);
210 // }
211 static void
212 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
213 {
214 u32 _d, _nd;
215 u64 x, nx, y;
217 _d = pickle_domptr(d);
218 y = *((u64*)&page->count_info);
219 do {
220 x = y;
221 _nd = x >> 32;
222 nx = x - 1;
223 __clear_bit(_PGC_allocated, &nx);
225 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
226 struct domain* nd = unpickle_domptr(_nd);
227 if (nd == NULL) {
228 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
229 "sd=%p 0x%x,"
230 " caf=%016lx, taf=%" PRtype_info "\n",
231 (void *) page_to_mfn(page),
232 d, d->domain_id, _d,
233 nd, _nd,
234 x,
235 page->u.inuse.type_info);
236 }
237 break;
238 }
240 BUG_ON((nx & PGC_count_mask) < 1);
241 y = cmpxchg((u64*)&page->count_info, x, nx);
242 } while (unlikely(y != x));
243 }
245 static void
246 relinquish_pte(struct domain* d, pte_t* pte)
247 {
248 unsigned long mfn = pte_pfn(*pte);
249 struct page_info* page;
251 // vmx domains use bits [58:56] to distinguish io regions from memory.
252 // see vmx_build_physmap_table() in vmx_init.c
253 if (!pte_mem(*pte))
254 return;
256 // the domain might map IO space or acpi table pages. check for that.
257 if (!mfn_valid(mfn))
258 return;
259 page = mfn_to_page(mfn);
260 // the struct page_info corresponding to mfn may or may not exist depending
261 // on CONFIG_VIRTUAL_FRAME_TABLE.
262 // This check is too simplistic.
263 // The right way is to check whether this page belongs to the io area or to acpi pages.
264 if (page_get_owner(page) == NULL) {
265 BUG_ON(page->count_info != 0);
266 return;
267 }
269 #ifdef CONFIG_XEN_IA64_DOM0_VP
270 if (page_get_owner(page) == d) {
271 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
272 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
273 }
274 #endif
275 try_to_clear_PGC_allocate(d, page);
276 put_page(page);
277 }
279 static void
280 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
281 {
282 unsigned long i;
283 pte_t* pte = pte_offset_map(pmd, offset);
285 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
286 if (!pte_present(*pte))
287 continue;
289 relinquish_pte(d, pte);
290 }
291 pte_free_kernel(pte_offset_map(pmd, offset));
292 }
294 static void
295 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
296 {
297 unsigned long i;
298 pmd_t *pmd = pmd_offset(pud, offset);
300 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
301 if (!pmd_present(*pmd))
302 continue;
304 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
305 }
306 pmd_free(pmd_offset(pud, offset));
307 }
309 static void
310 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
311 {
312 unsigned long i;
313 pud_t *pud = pud_offset(pgd, offset);
315 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
316 if (!pud_present(*pud))
317 continue;
319 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
320 }
321 pud_free(pud_offset(pgd, offset));
322 }
324 void
325 relinquish_mm(struct domain* d)
326 {
327 struct mm_struct* mm = &d->arch.mm;
328 unsigned long i;
329 pgd_t* pgd;
331 if (mm->pgd == NULL)
332 return;
334 pgd = pgd_offset(mm, 0);
335 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
336 if (!pgd_present(*pgd))
337 continue;
339 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
340 }
341 pgd_free(mm->pgd);
342 mm->pgd = NULL;
343 }
345 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
346 void
347 share_xen_page_with_guest(struct page_info *page,
348 struct domain *d, int readonly)
349 {
350 if ( page_get_owner(page) == d )
351 return;
353 #if 1
354 if (readonly) {
355 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
356 }
357 #endif
359 // alloc_xenheap_pages() doesn't initialize page owner.
360 //BUG_ON(page_get_owner(page) != NULL);
362 spin_lock(&d->page_alloc_lock);
364 #ifndef __ia64__
365 /* The incremented type count pins as writable or read-only. */
366 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
367 page->u.inuse.type_info |= PGT_validated | 1;
368 #endif
370 page_set_owner(page, d);
371 wmb(); /* install valid domain ptr before updating refcnt. */
372 ASSERT(page->count_info == 0);
373 page->count_info |= PGC_allocated | 1;
375 if ( unlikely(d->xenheap_pages++ == 0) )
376 get_knownalive_domain(d);
377 list_add_tail(&page->list, &d->xenpage_list);
379 // grant_table_destroy() releases these pages,
380 // but it doesn't clear their m2p entries, so stale entries might
381 // remain. Such a stale entry is cleared here.
382 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
384 spin_unlock(&d->page_alloc_lock);
385 }
387 void
388 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
389 {
390 share_xen_page_with_guest(page, dom_xen, readonly);
391 }
393 unsigned long
394 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
395 {
396 unsigned long pte;
398 #ifndef CONFIG_XEN_IA64_DOM0_VP
399 if (d == dom0)
400 return(gpfn);
401 #endif
402 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
403 if (!pte) {
404 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
405 }
406 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
407 }
409 // given a domain virtual address, pte and pagesize, extract the metaphysical
410 // address, convert the pte for a physical address for (possibly different)
411 // Xen PAGE_SIZE and return modified pte. (NOTE: TLB insert should use
412 // PAGE_SIZE!)
413 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
414 struct p2m_entry* entry)
415 {
416 struct domain *d = current->domain;
417 ia64_itir_t itir = {.itir = itir__};
418 u64 mask, mpaddr, pteval2;
419 u64 arflags;
420 u64 arflags2;
421 u64 maflags2;
423 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
425 // FIXME address had better be pre-validated on insert
426 mask = ~itir_mask(itir.itir);
427 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
428 #ifdef CONFIG_XEN_IA64_DOM0_VP
429 if (itir.ps > PAGE_SHIFT) {
430 itir.ps = PAGE_SHIFT;
431 }
432 #endif
433 *logps = itir.ps;
434 #ifndef CONFIG_XEN_IA64_DOM0_VP
435 if (d == dom0) {
436 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
437 /*
438 printk("translate_domain_pte: out-of-bounds dom0 mpaddr 0x%lx! itc=%lx...\n",
439 mpaddr, ia64_get_itc());
440 */
441 }
442 }
443 else if ((mpaddr >> PAGE_SHIFT) > d->max_pages) {
444 /* Address beyond the limit. However the grant table is
445 also beyond the limit. Display a message if not in the
446 grant table. */
447 if (mpaddr >= IA64_GRANT_TABLE_PADDR
448 && mpaddr < (IA64_GRANT_TABLE_PADDR
449 + (ORDER_GRANT_FRAMES << PAGE_SHIFT)))
450 printf("translate_domain_pte: bad mpa=0x%lx (> 0x%lx),"
451 "vadr=0x%lx,pteval=0x%lx,itir=0x%lx\n",
452 mpaddr, (unsigned long)d->max_pages<<PAGE_SHIFT,
453 address, pteval, itir.itir);
454 }
455 #endif
456 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
458 /* Check access rights. */
459 arflags = pteval & _PAGE_AR_MASK;
460 arflags2 = pteval2 & _PAGE_AR_MASK;
461 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
462 #if 0
463 DPRINTK("%s:%d "
464 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
465 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
466 __func__, __LINE__,
467 pteval, arflags, address, itir__,
468 pteval2, arflags2, mpaddr);
469 #endif
470 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
471 }
473 /* Check memory attribute. The switch is on the *requested* memory
474 attribute. */
475 maflags2 = pteval2 & _PAGE_MA_MASK;
476 switch (pteval & _PAGE_MA_MASK) {
477 case _PAGE_MA_NAT:
478 /* NaT pages are always accepted! */
479 break;
480 case _PAGE_MA_UC:
481 case _PAGE_MA_UCE:
482 case _PAGE_MA_WC:
483 if (maflags2 == _PAGE_MA_WB) {
484 /* Don't let domains WB-map uncached addresses.
485 This can happen when domU tries to touch i/o
486 port space. Also prevents possible address
487 aliasing issues. */
488 printf("Warning: UC to WB for mpaddr=%lx\n", mpaddr);
489 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
490 }
491 break;
492 case _PAGE_MA_WB:
493 if (maflags2 != _PAGE_MA_WB) {
494 /* Forbid non-coherent access to coherent memory. */
495 panic_domain(NULL, "try to use WB mem attr on "
496 "UC page, mpaddr=%lx\n", mpaddr);
497 }
498 break;
499 default:
500 panic_domain(NULL, "try to use unknown mem attribute\n");
501 }
503 /* If shadow mode is enabled, virtualize dirty bit. */
504 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
505 u64 mp_page = mpaddr >> PAGE_SHIFT;
506 pteval |= _PAGE_VIRT_D;
508 /* If the page is not already dirty, don't set the dirty bit! */
509 if (mp_page < d->arch.shadow_bitmap_size * 8
510 && !test_bit(mp_page, d->arch.shadow_bitmap))
511 pteval &= ~_PAGE_D;
512 }
514 /* Ignore non-addr bits of pteval2 and force PL0->2
515 (PL3 is unaffected) */
516 return (pteval & ~_PAGE_PPN_MASK) |
517 (pteval2 & _PAGE_PPN_MASK) | _PAGE_PL_2;
518 }
520 // given a current domain metaphysical address, return the physical address
521 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
522 struct p2m_entry* entry)
523 {
524 unsigned long pteval;
526 #ifndef CONFIG_XEN_IA64_DOM0_VP
527 if (current->domain == dom0) {
528 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
529 printk("translate_domain_mpaddr: out-of-bounds dom0 mpaddr 0x%lx! continuing...\n",
530 mpaddr);
531 }
532 }
533 #endif
534 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
535 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
536 }
538 //XXX !xxx_present() should be used instead of !xxx_none()?
539 // __assign_new_domain_page(), assign_new_domain_page() and
540 // assign_new_domain0_page() are used only during domain creation.
541 // Their accesses aren't racy, so the returned pte_t doesn't need the
542 // volatile qualifier.
543 static pte_t*
544 __lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
545 {
546 struct mm_struct *mm = &d->arch.mm;
547 pgd_t *pgd;
548 pud_t *pud;
549 pmd_t *pmd;
551 BUG_ON(mm->pgd == NULL);
552 pgd = pgd_offset(mm, mpaddr);
553 if (pgd_none(*pgd)) {
554 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
555 }
557 pud = pud_offset(pgd, mpaddr);
558 if (pud_none(*pud)) {
559 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
560 }
562 pmd = pmd_offset(pud, mpaddr);
563 if (pmd_none(*pmd)) {
564 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
565 }
567 return pte_offset_map(pmd, mpaddr);
568 }
570 //XXX !xxx_present() should be used instead of !xxx_none()?
571 // pud, pmd and pte pages are zero-cleared when they are allocated.
572 // Their contents must be visible before they are populated, so the
573 // cmpxchg must have release semantics.
574 static volatile pte_t*
575 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
576 {
577 struct mm_struct *mm = &d->arch.mm;
578 pgd_t *pgd;
579 pud_t *pud;
580 pmd_t *pmd;
582 BUG_ON(mm->pgd == NULL);
584 pgd = pgd_offset(mm, mpaddr);
585 again_pgd:
586 if (unlikely(pgd_none(*pgd))) {
587 pud_t *old_pud = NULL;
588 pud = pud_alloc_one(mm, mpaddr);
589 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
590 pud_free(pud);
591 goto again_pgd;
592 }
593 }
595 pud = pud_offset(pgd, mpaddr);
596 again_pud:
597 if (unlikely(pud_none(*pud))) {
598 pmd_t* old_pmd = NULL;
599 pmd = pmd_alloc_one(mm, mpaddr);
600 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
601 pmd_free(pmd);
602 goto again_pud;
603 }
604 }
606 pmd = pmd_offset(pud, mpaddr);
607 again_pmd:
608 if (unlikely(pmd_none(*pmd))) {
609 pte_t* old_pte = NULL;
610 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
611 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
612 pte_free_kernel(pte);
613 goto again_pmd;
614 }
615 }
617 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
618 }
620 //XXX xxx_none() should be used instead of !xxx_present()?
621 volatile pte_t*
622 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
623 {
624 struct mm_struct *mm = &d->arch.mm;
625 pgd_t *pgd;
626 pud_t *pud;
627 pmd_t *pmd;
629 BUG_ON(mm->pgd == NULL);
630 pgd = pgd_offset(mm, mpaddr);
631 if (unlikely(!pgd_present(*pgd)))
632 return NULL;
634 pud = pud_offset(pgd, mpaddr);
635 if (unlikely(!pud_present(*pud)))
636 return NULL;
638 pmd = pmd_offset(pud, mpaddr);
639 if (unlikely(!pmd_present(*pmd)))
640 return NULL;
642 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
643 }
645 #ifdef CONFIG_XEN_IA64_DOM0_VP
646 static volatile pte_t*
647 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
648 {
649 struct mm_struct *mm = &d->arch.mm;
650 pgd_t *pgd;
651 pud_t *pud;
652 pmd_t *pmd;
654 BUG_ON(mm->pgd == NULL);
655 pgd = pgd_offset(mm, mpaddr);
656 if (unlikely(pgd_none(*pgd)))
657 return NULL;
659 pud = pud_offset(pgd, mpaddr);
660 if (unlikely(pud_none(*pud)))
661 return NULL;
663 pmd = pmd_offset(pud, mpaddr);
664 if (unlikely(pmd_none(*pmd)))
665 return NULL;
667 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
668 }
670 unsigned long
671 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
672 {
673 volatile pte_t *pte;
675 pte = lookup_noalloc_domain_pte(d, mpaddr);
676 if (pte == NULL)
677 return INVALID_MFN;
679 if (pte_present(*pte))
680 return (pte->pte & _PFN_MASK);
681 else if (VMX_DOMAIN(d->vcpu[0]))
682 return GPFN_INV_MASK;
683 return INVALID_MFN;
684 }
685 #endif
687 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
688 struct p2m_entry* entry)
689 {
690 volatile pte_t *pte;
692 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
693 if (d == dom0) {
694 pte_t pteval;
695 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
696 //printk("lookup_domain_mpa: bad dom0 mpaddr 0x%lx!\n",mpaddr);
697 //printk("lookup_domain_mpa: start=0x%lx,end=0x%lx!\n",dom0_start,dom0_start+dom0_size);
698 }
699 pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
700 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
701 return pte_val(pteval);
702 }
703 #endif
704 pte = lookup_noalloc_domain_pte(d, mpaddr);
705 if (pte != NULL) {
706 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
707 if (pte_present(tmp_pte)) {
708 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
709 if (entry != NULL)
710 p2m_entry_set(entry, pte, tmp_pte);
711 return pte_val(tmp_pte);
712 } else if (VMX_DOMAIN(d->vcpu[0]))
713 return GPFN_INV_MASK;
714 }
716 printk("%s: d 0x%p id %d current 0x%p id %d\n",
717 __func__, d, d->domain_id, current, current->vcpu_id);
718 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
719 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
720 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
721 else
722 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
723 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
725 if (entry != NULL)
726 p2m_entry_set(entry, NULL, __pte(0));
727 //XXX This is a workaround until emulation of memory accesses to a region
728 // where memory or devices are attached is implemented.
729 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
730 }
732 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
733 #if 1
734 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
735 {
736 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
737 unsigned long imva;
739 pte &= _PAGE_PPN_MASK;
740 imva = (unsigned long) __va(pte);
741 imva |= mpaddr & ~PAGE_MASK;
742 return (void*)imva;
743 }
744 #else
745 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
746 {
747 unsigned long imva = __gpa_to_mpa(d, mpaddr);
749 return (void *)__va(imva);
750 }
751 #endif
753 /* Allocate a new page for domain and map it to the specified metaphysical
754 address. */
755 static struct page_info *
756 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
757 {
758 struct page_info *p = NULL;
759 unsigned long maddr;
760 int ret;
762 BUG_ON(!pte_none(*pte));
764 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
765 if (d == dom0) {
766 #if 0
767 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
768 /* FIXME: is it true ?
769 dom0 memory is not contiguous! */
770 panic("assign_new_domain_page: bad domain0 "
771 "mpaddr=%lx, start=%lx, end=%lx!\n",
772 mpaddr, dom0_start, dom0_start+dom0_size);
773 }
774 #endif
775 p = mfn_to_page((mpaddr >> PAGE_SHIFT));
776 return p;
777 }
778 #endif
780 p = alloc_domheap_page(d);
781 if (unlikely(!p)) {
782 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
783 return(p);
784 }
786 // zero out pages for security reasons
787 clear_page(page_to_virt(p));
788 maddr = page_to_maddr (p);
789 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
790 && maddr < __get_cpu_var(vhpt_pend))) {
791 /* FIXME: how can this happen ?
792 vhpt is allocated by alloc_domheap_page. */
793 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
794 maddr);
795 }
797 ret = get_page(p, d);
798 BUG_ON(ret == 0);
799 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
800 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
801 // because set_pte_rel() has release semantics
802 set_pte_rel(pte,
803 pfn_pte(maddr >> PAGE_SHIFT,
804 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
806 smp_mb();
807 return p;
808 }
810 struct page_info *
811 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
812 {
813 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
814 pte_t dummy_pte = __pte(0);
815 return __assign_new_domain_page(d, mpaddr, &dummy_pte);
816 #else
817 struct page_info *p = NULL;
818 pte_t *pte;
820 pte = __lookup_alloc_domain_pte(d, mpaddr);
821 if (pte_none(*pte))
822 p = __assign_new_domain_page(d, mpaddr, pte);
824 return p;
825 #endif
826 }
828 void
829 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
830 {
831 #ifndef CONFIG_DOMAIN0_CONTIGUOUS
832 pte_t *pte;
834 BUG_ON(d != dom0);
835 pte = __lookup_alloc_domain_pte(d, mpaddr);
836 if (pte_none(*pte)) {
837 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
838 if (p == NULL) {
839 panic("%s: can't allocate page for dom0", __func__);
840 }
841 }
842 #endif
843 }
845 static unsigned long
846 flags_to_prot (unsigned long flags)
847 {
848 unsigned long res = _PAGE_PL_2 | __DIRTY_BITS;
850 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
851 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
853 return res;
854 }
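/*
 * For illustration (derived from the function above, not a new interface):
 * flags_to_prot(ASSIGN_readonly | ASSIGN_nocache) yields
 * _PAGE_PL_2 | __DIRTY_BITS | _PAGE_AR_R | _PAGE_MA_UC, i.e. a read-only,
 * uncacheable PL2 mapping, while a plain ASSIGN_writable request yields a
 * cacheable (_PAGE_MA_WB) _PAGE_AR_RWX mapping.
 */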
856 /* map a physical address to the specified metaphysical addr */
857 // flags: currently only ASSIGN_readonly, ASSIGN_nocache
858 // This is called by assign_domain_mmio_page(),
859 // so accesses to the pte are racy.
860 void
861 __assign_domain_page(struct domain *d,
862 unsigned long mpaddr, unsigned long physaddr,
863 unsigned long flags)
864 {
865 volatile pte_t *pte;
866 pte_t old_pte;
867 pte_t new_pte;
868 pte_t ret_pte;
869 unsigned long prot = flags_to_prot(flags);
871 pte = lookup_alloc_domain_pte(d, mpaddr);
873 old_pte = __pte(0);
874 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
875 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
876 if (pte_val(ret_pte) == pte_val(old_pte))
877 smp_mb();
878 }
880 /* get_page() and map a physical address to the specified metaphysical addr */
881 void
882 assign_domain_page(struct domain *d,
883 unsigned long mpaddr, unsigned long physaddr)
884 {
885 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
886 int ret;
888 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
889 ret = get_page(page, d);
890 BUG_ON(ret == 0);
891 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
892 // because __assign_domain_page() uses set_pte_rel() which has
893 // release semantics, smp_mb() isn't needed.
894 __assign_domain_page(d, mpaddr, physaddr, ASSIGN_writable);
895 }
897 #ifdef CONFIG_XEN_IA64_DOM0_VP
898 static void
899 assign_domain_same_page(struct domain *d,
900 unsigned long mpaddr, unsigned long size,
901 unsigned long flags)
902 {
903 //XXX optimization
904 unsigned long end = PAGE_ALIGN(mpaddr + size);
905 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
906 __assign_domain_page(d, mpaddr, mpaddr, flags);
907 }
908 }
910 int
911 efi_mmio(unsigned long physaddr, unsigned long size)
912 {
913 void *efi_map_start, *efi_map_end;
914 u64 efi_desc_size;
915 void* p;
917 efi_map_start = __va(ia64_boot_param->efi_memmap);
918 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
919 efi_desc_size = ia64_boot_param->efi_memdesc_size;
921 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
922 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
923 unsigned long start = md->phys_addr;
924 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
926 if (start <= physaddr && physaddr < end) {
927 if ((physaddr + size) > end) {
928 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
929 __func__, __LINE__, physaddr, size);
930 return 0;
931 }
933 // for io space
934 if (md->type == EFI_MEMORY_MAPPED_IO ||
935 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
936 return 1;
937 }
939 // for runtime
940 // see efi_enter_virtual_mode(void)
941 // in linux/arch/ia64/kernel/efi.c
942 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
943 !(md->attribute & EFI_MEMORY_WB)) {
944 return 1;
945 }
947 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
948 __func__, __LINE__, physaddr, size);
949 return 0;
950 }
952 if (physaddr < start) {
953 break;
954 }
955 }
957 return 1;
958 }
960 unsigned long
961 assign_domain_mmio_page(struct domain *d,
962 unsigned long mpaddr, unsigned long size)
963 {
964 if (size == 0) {
965 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
966 __func__, d, mpaddr, size);
967 }
968 if (!efi_mmio(mpaddr, size)) {
969 DPRINTK("%s:%d domain %p mpaddr 0x%lx size = 0x%lx\n",
970 __func__, __LINE__, d, mpaddr, size);
971 return -EINVAL;
972 }
973 assign_domain_same_page(d, mpaddr, size, ASSIGN_writable | ASSIGN_nocache);
974 return mpaddr;
975 }
977 unsigned long
978 assign_domain_mach_page(struct domain *d,
979 unsigned long mpaddr, unsigned long size,
980 unsigned long flags)
981 {
982 assign_domain_same_page(d, mpaddr, size, flags);
983 return mpaddr;
984 }
986 // The caller must get_page(mfn_to_page(mfn)) before calling.
987 // The caller must call set_gpfn_from_mfn() before calling if necessary,
988 // and because the set_gpfn_from_mfn() result must be visible before the pte
989 // xchg, the caller must use a memory barrier. NOTE: xchg has acquire semantics.
990 // flags: currently only ASSIGN_readonly
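/*
 * Illustrative caller pattern (this mirrors what guest_physmap_add_page()
 * below does; shown here only to make the required ordering concrete):
 *
 *     ret = get_page(mfn_to_page(mfn), d);
 *     BUG_ON(ret == 0);
 *     set_gpfn_from_mfn(mfn, gpfn);   // m2p entry first
 *     smp_mb();                       // must be visible before the pte xchg
 *     assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
 */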
991 static void
992 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
993 unsigned long mfn, unsigned long flags)
994 {
995 struct mm_struct *mm = &d->arch.mm;
996 volatile pte_t* pte;
997 pte_t old_pte;
998 pte_t npte;
999 unsigned long prot = flags_to_prot(flags);
1001 pte = lookup_alloc_domain_pte(d, mpaddr);
1003 // update pte
1004 npte = pfn_pte(mfn, __pgprot(prot));
1005 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1006 if (pte_mem(old_pte)) {
1007 unsigned long old_mfn = pte_pfn(old_pte);
1009 // The mfn == old_mfn case can happen when a domain maps a granted page
1010 // twice with the same pseudo physical address.
1011 // It's nonsensical, but allowed.
1012 // __gnttab_map_grant_ref()
1013 // => create_host_mapping()
1014 // => assign_domain_page_replace()
1015 if (mfn != old_mfn) {
1016 struct page_info* old_page = mfn_to_page(old_mfn);
1018 if (page_get_owner(old_page) == d ||
1019 page_get_owner(old_page) == NULL) {
1020 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1021 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1022 }
1024 domain_page_flush(d, mpaddr, old_mfn, mfn);
1026 try_to_clear_PGC_allocate(d, old_page);
1027 put_page(old_page);
1028 }
1029 }
1030 }
1032 // The caller must get_page(new_page) before calling.
1033 // Only steal_page() calls this function.
1034 static int
1035 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1036 struct page_info* old_page,
1037 struct page_info* new_page,
1038 unsigned long flags)
1040 struct mm_struct *mm = &d->arch.mm;
1041 volatile pte_t* pte;
1042 unsigned long old_mfn;
1043 unsigned long old_arflags;
1044 pte_t old_pte;
1045 unsigned long new_mfn;
1046 unsigned long new_prot;
1047 pte_t new_pte;
1048 pte_t ret_pte;
1050 pte = lookup_alloc_domain_pte(d, mpaddr);
1052 again:
1053 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1054 old_mfn = page_to_mfn(old_page);
1055 old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
1056 if (!pte_present(old_pte)) {
1057 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
1058 __func__, pte_val(old_pte), old_arflags, old_mfn);
1059 return -EINVAL;
1062 new_prot = flags_to_prot(flags);
1063 new_mfn = page_to_mfn(new_page);
1064 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1066 // update pte
1067 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1068 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1069 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1070 goto again;
1073 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
1074 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1075 __func__,
1076 pte_val(old_pte), old_arflags, old_mfn,
1077 pte_val(ret_pte), pte_pfn(ret_pte));
1078 return -EINVAL;
1081 BUG_ON(!pte_mem(old_pte));
1082 BUG_ON(page_get_owner(old_page) != d);
1083 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1084 BUG_ON(old_mfn == new_mfn);
1086 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1088 domain_page_flush(d, mpaddr, old_mfn, new_mfn);
1089 put_page(old_page);
1090 return 0;
1093 static void
1094 zap_domain_page_one(struct domain *d, unsigned long mpaddr, unsigned long mfn)
1096 struct mm_struct *mm = &d->arch.mm;
1097 volatile pte_t *pte;
1098 pte_t old_pte;
1099 struct page_info *page;
1101 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1102 if (pte == NULL)
1103 return;
1104 if (pte_none(*pte))
1105 return;
1107 if (mfn == INVALID_MFN) {
1108 // clear pte
1109 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1110 mfn = pte_pfn(old_pte);
1111 } else {
1112 unsigned long old_arflags;
1113 pte_t new_pte;
1114 pte_t ret_pte;
1116 again:
1117 // memory_exchange() calls guest_physmap_remove_page() with
1118 // a stolen page, i.e. page owner == NULL.
1119 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1120 page_get_owner(mfn_to_page(mfn)) != NULL);
1121 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1122 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1123 new_pte = __pte(0);
1125 // update pte
1126 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1127 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1128 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1129 goto again;
1132 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1133 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1134 __func__,
1135 pte_val(old_pte), old_arflags, mfn,
1136 pte_val(ret_pte), pte_pfn(ret_pte));
1137 return;
1139 BUG_ON(mfn != pte_pfn(ret_pte));
1142 page = mfn_to_page(mfn);
1143 BUG_ON((page->count_info & PGC_count_mask) == 0);
1145 if (page_get_owner(page) == d ||
1146 page_get_owner(page) == NULL) {
1147 // exchange_memory() calls
1148 // steal_page()
1149 // page owner is set to NULL
1150 // guest_physmap_remove_page()
1151 // zap_domain_page_one()
1152 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1153 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1156 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
1158 if (page_get_owner(page) != NULL) {
1159 try_to_clear_PGC_allocate(d, page);
1161 put_page(page);
1164 unsigned long
1165 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1166 unsigned int extent_order)
1167 {
1168 if (extent_order != 0) {
1169 //XXX
1170 return -ENOSYS;
1171 }
1173 zap_domain_page_one(d, gpfn << PAGE_SHIFT, INVALID_MFN);
1174 return 0;
1175 }
1177 unsigned long
1178 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1179 unsigned long flags, domid_t domid)
1181 int error = 0;
1182 struct domain* rd;
1184 /* Not allowed by a domain. */
1185 if (flags & ASSIGN_nocache)
1186 return -EINVAL;
1188 rd = find_domain_by_id(domid);
1189 if (unlikely(rd == NULL)) {
1190 switch (domid) {
1191 case DOMID_XEN:
1192 rd = dom_xen;
1193 break;
1194 case DOMID_IO:
1195 rd = dom_io;
1196 break;
1197 default:
1198 DPRINTK("d 0x%p domid %d "
1199 "pgfn 0x%lx mfn 0x%lx flags 0x%lx domid %d\n",
1200 d, d->domain_id, gpfn, mfn, flags, domid);
1201 return -ESRCH;
1203 BUG_ON(rd == NULL);
1204 get_knownalive_domain(rd);
1207 if (unlikely(rd == d || !mfn_valid(mfn))) {
1208 error = -EINVAL;
1209 goto out1;
1211 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1212 error = -EINVAL;
1213 goto out1;
1215 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1216 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1217 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1218 //don't update p2m table because this page belongs to rd, not d.
1219 out1:
1220 put_domain(rd);
1221 return error;
1224 // grant table host mapping
1225 // mpaddr: host_addr: pseudo physical address
1226 // mfn: frame: machine page frame
1227 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1228 int
1229 create_grant_host_mapping(unsigned long gpaddr,
1230 unsigned long mfn, unsigned int flags)
1232 struct domain* d = current->domain;
1233 struct page_info* page;
1234 int ret;
1236 if (flags & (GNTMAP_device_map |
1237 GNTMAP_application_map | GNTMAP_contains_pte)) {
1238 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1239 return GNTST_general_error;
1242 BUG_ON(!mfn_valid(mfn));
1243 page = mfn_to_page(mfn);
1244 ret = get_page(page, page_get_owner(page));
1245 BUG_ON(ret == 0);
1246 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1247 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1248 assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
1249 ASSIGN_readonly: ASSIGN_writable);
1250 return GNTST_okay;
1253 // grant table host unmapping
1254 int
1255 destroy_grant_host_mapping(unsigned long gpaddr,
1256 unsigned long mfn, unsigned int flags)
1258 struct domain* d = current->domain;
1259 volatile pte_t* pte;
1260 unsigned long cur_arflags;
1261 pte_t cur_pte;
1262 pte_t new_pte;
1263 pte_t old_pte;
1264 struct page_info* page;
1266 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1267 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1268 return GNTST_general_error;
1271 pte = lookup_noalloc_domain_pte(d, gpaddr);
1272 if (pte == NULL) {
1273 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx\n", __func__, gpaddr, mfn);
1274 return GNTST_general_error;
1277 again:
1278 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1279 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1280 if (!pte_present(cur_pte)) {
1281 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1282 __func__, gpaddr, mfn, pte_val(cur_pte));
1283 return GNTST_general_error;
1285 new_pte = __pte(0);
1287 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1288 if (unlikely(!pte_present(old_pte))) {
1289 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1290 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1291 return GNTST_general_error;
1293 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1294 if (pte_pfn(old_pte) == mfn) {
1295 goto again;
1297 DPRINTK("%s gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1298 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1299 return GNTST_general_error;
1301 BUG_ON(pte_pfn(old_pte) != mfn);
1303 domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
1305 page = mfn_to_page(mfn);
1306 BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1307 put_page(page);
1309 return GNTST_okay;
1312 // heavily depends on the struct page layout.
1313 // gnttab_transfer() calls steal_page() with memflags = 0
1314 // For grant table transfer, we must fill the page.
1315 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount
1316 // For memory exchange, we don't have to fill the page because
1317 // memory_exchange() does it.
1318 int
1319 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1321 #if 0 /* if big endian */
1322 # error "implement big endian version of steal_page()"
1323 #endif
1324 u32 _d, _nd;
1325 u64 x, nx, y;
1327 if (page_get_owner(page) != d) {
1328 DPRINTK("%s d 0x%p owner 0x%p\n", __func__, d, page_get_owner(page));
1329 return -1;
1332 if (!(memflags & MEMF_no_refcount)) {
1333 unsigned long gpfn;
1334 struct page_info *new;
1335 unsigned long new_mfn;
1336 int ret;
1338 new = alloc_domheap_page(d);
1339 if (new == NULL) {
1340 DPRINTK("alloc_domheap_page() failed\n");
1341 return -1;
1343 // zero out pages for security reasons
1344 clear_page(page_to_virt(new));
1345 // assign_domain_page_cmpxchg_rel() has release semantics
1346 // so smp_mb() isn't needed.
1348 ret = get_page(new, d);
1349 BUG_ON(ret == 0);
1351 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1352 if (gpfn == INVALID_M2P_ENTRY) {
1353 free_domheap_page(new);
1354 return -1;
1356 new_mfn = page_to_mfn(new);
1357 set_gpfn_from_mfn(new_mfn, gpfn);
1358 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1359 // has release semantics.
1361 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1362 ASSIGN_writable);
1363 if (ret < 0) {
1364 DPRINTK("assign_domain_page_cmpxchg_rel failed %d\n", ret);
1365 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1366 free_domheap_page(new);
1367 return -1;
1371 spin_lock(&d->page_alloc_lock);
1373 /*
1374 * The tricky bit: atomically release ownership while there is just one
1375 * benign reference to the page (PGC_allocated). If that reference
1376 * disappears then the deallocation routine will safely spin.
1377 */
1378 _d = pickle_domptr(d);
1379 y = *((u64*)&page->count_info);
1380 do {
1381 x = y;
1382 nx = x & 0xffffffff;
1383 // page->count_info: untouched
1384 // page->u.inuse._domain = 0;
1385 _nd = x >> 32;
1387 if (unlikely(!(memflags & MEMF_no_refcount) &&
1388 ((x & (PGC_count_mask | PGC_allocated)) !=
1389 (1 | PGC_allocated))) ||
1391 // when MEMF_no_refcount, page isn't de-assigned from
1392 // this domain yet. So count_info = 2
1393 unlikely((memflags & MEMF_no_refcount) &&
1394 ((x & (PGC_count_mask | PGC_allocated)) !=
1395 (2 | PGC_allocated))) ||
1397 unlikely(_nd != _d)) {
1398 struct domain* nd = unpickle_domptr(_nd);
1399 if (nd == NULL) {
1400 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1401 "sd=%p 0x%x,"
1402 " caf=%016lx, taf=%" PRtype_info
1403 " memflags 0x%x\n",
1404 (void *) page_to_mfn(page),
1405 d, d->domain_id, _d,
1406 nd, _nd,
1407 x,
1408 page->u.inuse.type_info,
1409 memflags);
1410 } else {
1411 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1412 "sd=%p(%u) 0x%x,"
1413 " caf=%016lx, taf=%" PRtype_info
1414 " memflags 0x%x\n",
1415 (void *) page_to_mfn(page),
1416 d, d->domain_id, _d,
1417 nd, nd->domain_id, _nd,
1418 x,
1419 page->u.inuse.type_info,
1420 memflags);
1422 spin_unlock(&d->page_alloc_lock);
1423 return -1;
1426 y = cmpxchg((u64*)&page->count_info, x, nx);
1427 } while (unlikely(y != x));
1429 /*
1430 * Unlink from 'd'. At least one reference remains (now anonymous), so
1431 * noone else is spinning to try to delete this page from 'd'.
1432 */
1433 if ( !(memflags & MEMF_no_refcount) )
1434 d->tot_pages--;
1435 list_del(&page->list);
1437 spin_unlock(&d->page_alloc_lock);
1438 return 0;
1441 void
1442 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1443 unsigned long mfn)
1444 {
1445 int ret;
1447 BUG_ON(!mfn_valid(mfn));
1448 ret = get_page(mfn_to_page(mfn), d);
1449 BUG_ON(ret == 0);
1450 set_gpfn_from_mfn(mfn, gpfn);
1451 smp_mb();
1452 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
1454 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1455 }
1457 void
1458 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1459 unsigned long mfn)
1460 {
1461 BUG_ON(mfn == 0);//XXX
1462 zap_domain_page_one(d, gpfn << PAGE_SHIFT, mfn);
1463 }
1465 //XXX sledgehammer.
1466 // flush finer range.
1467 static void
1468 domain_page_flush(struct domain* d, unsigned long mpaddr,
1469 unsigned long old_mfn, unsigned long new_mfn)
1470 {
1471 if (shadow_mode_enabled(d))
1472 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1474 domain_flush_vtlb_all();
1475 }
1477 int
1478 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1479 {
1480 volatile pte_t * pte;
1482 pte = lookup_noalloc_domain_pte(d, mpaddr);
1483 if(pte != NULL && !pte_none(*pte))
1484 return 1;
1485 return 0;
1486 }
1487 #endif
1489 /* Flush cache of domain d. */
1490 void domain_cache_flush (struct domain *d, int sync_only)
1492 struct mm_struct *mm = &d->arch.mm;
1493 pgd_t *pgd = mm->pgd;
1494 unsigned long maddr;
1495 int i,j,k, l;
1496 int nbr_page = 0;
1497 void (*flush_func)(unsigned long start, unsigned long end);
1498 extern void flush_dcache_range (unsigned long, unsigned long);
1500 if (sync_only)
1501 flush_func = &flush_icache_range;
1502 else
1503 flush_func = &flush_dcache_range;
1505 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1506 if (d == dom0) {
1507 /* This is not fully correct (because of hole), but it should
1508 be enough for now. */
1509 (*flush_func)(__va_ul (dom0_start),
1510 __va_ul (dom0_start + dom0_size));
1511 return;
1513 #endif
1514 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1515 pud_t *pud;
1516 if (!pgd_present(*pgd))
1517 continue;
1518 pud = pud_offset(pgd, 0);
1519 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1520 pmd_t *pmd;
1521 if (!pud_present(*pud))
1522 continue;
1523 pmd = pmd_offset(pud, 0);
1524 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1525 pte_t *pte;
1526 if (!pmd_present(*pmd))
1527 continue;
1528 pte = pte_offset_map(pmd, 0);
1529 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1530 if (!pte_present(*pte))
1531 continue;
1532 /* Convert PTE to maddr. */
1533 maddr = __va_ul (pte_val(*pte)
1534 & _PAGE_PPN_MASK);
1535 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1536 nbr_page++;
1541 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1544 #ifdef VERBOSE
1545 #define MEM_LOG(_f, _a...) \
1546 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1547 current->domain->domain_id , __LINE__ , ## _a )
1548 #else
1549 #define MEM_LOG(_f, _a...) ((void)0)
1550 #endif
1552 static void free_page_type(struct page_info *page, u32 type)
1553 {
1554 }
1556 static int alloc_page_type(struct page_info *page, u32 type)
1557 {
1558 return 1;
1559 }
1561 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1562 {
1563 void *p = alloc_xenheap_pages(order);
1565 memset(p,0,PAGE_SIZE<<order);
1566 return (unsigned long)p;
1567 }
1569 void __free_pages(struct page_info *page, unsigned int order)
1570 {
1571 if (order) BUG();
1572 free_xenheap_page(page);
1573 }
1575 void *pgtable_quicklist_alloc(void)
1576 {
1577 void *p;
1578 p = alloc_xenheap_pages(0);
1579 if (p)
1580 clear_page(p);
1581 return p;
1582 }
1584 void pgtable_quicklist_free(void *pgtable_entry)
1585 {
1586 free_xenheap_page(pgtable_entry);
1587 }
1589 void cleanup_writable_pagetable(struct domain *d)
1590 {
1591 return;
1592 }
1594 void put_page_type(struct page_info *page)
1596 u32 nx, x, y = page->u.inuse.type_info;
1598 again:
1599 do {
1600 x = y;
1601 nx = x - 1;
1603 ASSERT((x & PGT_count_mask) != 0);
1605 /*
1606 * The page should always be validated while a reference is held. The
1607 * exception is during domain destruction, when we forcibly invalidate
1608 * page-table pages if we detect a referential loop.
1609 * See domain.c:relinquish_list().
1610 */
1611 ASSERT((x & PGT_validated) ||
1612 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1614 if ( unlikely((nx & PGT_count_mask) == 0) )
1616 /* Record TLB information for flush later. Races are harmless. */
1617 page->tlbflush_timestamp = tlbflush_current_time();
1619 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1620 likely(nx & PGT_validated) )
1622 /*
1623 * Page-table pages must be unvalidated when count is zero. The
1624 * 'free' is safe because the refcnt is non-zero and validated
1625 * bit is clear => other ops will spin or fail.
1626 */
1627 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1628 x & ~PGT_validated)) != x) )
1629 goto again;
1630 /* We cleared the 'valid bit' so we do the clean up. */
1631 free_page_type(page, x);
1632 /* Carry on, but with the 'valid bit' now clear. */
1633 x &= ~PGT_validated;
1634 nx &= ~PGT_validated;
1637 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1638 (PGT_pinned | 1)) &&
1639 ((nx & PGT_type_mask) != PGT_writable_page)) )
1641 /* Page is now only pinned. Make the back pointer mutable again. */
1642 nx |= PGT_va_mutable;
1645 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1649 int get_page_type(struct page_info *page, u32 type)
1651 u32 nx, x, y = page->u.inuse.type_info;
1653 again:
1654 do {
1655 x = y;
1656 nx = x + 1;
1657 if ( unlikely((nx & PGT_count_mask) == 0) )
1659 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1660 return 0;
1662 else if ( unlikely((x & PGT_count_mask) == 0) )
1664 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1666 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1668 /*
1669 * On type change we check to flush stale TLB
1670 * entries. This may be unnecessary (e.g., page
1671 * was GDT/LDT) but those circumstances should be
1672 * very rare.
1673 */
1674 cpumask_t mask =
1675 page_get_owner(page)->domain_dirty_cpumask;
1676 tlbflush_filter(mask, page->tlbflush_timestamp);
1678 if ( unlikely(!cpus_empty(mask)) )
1680 perfc_incrc(need_flush_tlb_flush);
1681 flush_tlb_mask(mask);
1685 /* We lose existing type, back pointer, and validity. */
1686 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1687 nx |= type;
1689 /* No special validation needed for writable pages. */
1690 /* Page tables and GDT/LDT need to be scanned for validity. */
1691 if ( type == PGT_writable_page )
1692 nx |= PGT_validated;
1695 else
1697 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1699 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1701 if ( current->domain == page_get_owner(page) )
1703 /*
1704 * This ensures functions like set_gdt() see up-to-date
1705 * type info without needing to clean up writable p.t.
1706 * state on the fast path.
1707 */
1708 LOCK_BIGLOCK(current->domain);
1709 cleanup_writable_pagetable(current->domain);
1710 y = page->u.inuse.type_info;
1711 UNLOCK_BIGLOCK(current->domain);
1712 /* Can we make progress now? */
1713 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1714 ((y & PGT_count_mask) == 0) )
1715 goto again;
1717 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1718 ((type & PGT_type_mask) != PGT_l1_page_table) )
1719 MEM_LOG("Bad type (saw %08x != exp %08x) "
1720 "for mfn %016lx (pfn %016lx)",
1721 x, type, page_to_mfn(page),
1722 get_gpfn_from_mfn(page_to_mfn(page)));
1723 return 0;
1725 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1727 /* The va backpointer is mutable, hence we update it. */
1728 nx &= ~PGT_va_mask;
1729 nx |= type; /* we know the actual type is correct */
1731 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1732 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1734 #ifdef CONFIG_X86_PAE
1735 /* We use backptr as extra typing. Cannot be unknown. */
1736 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1737 return 0;
1738 #endif
1739 /* This table is possibly mapped at multiple locations. */
1740 nx &= ~PGT_va_mask;
1741 nx |= PGT_va_unknown;
1744 if ( unlikely(!(x & PGT_validated)) )
1746 /* Someone else is updating validation of this page. Wait... */
1747 while ( (y = page->u.inuse.type_info) == x )
1748 cpu_relax();
1749 goto again;
1753 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
1755 if ( unlikely(!(nx & PGT_validated)) )
1757 /* Try to validate page type; drop the new reference on failure. */
1758 if ( unlikely(!alloc_page_type(page, type)) )
1760 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
1761 ": caf=%08x taf=%" PRtype_info,
1762 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1763 type, page->count_info, page->u.inuse.type_info);
1764 /* Noone else can get a reference. We hold the only ref. */
1765 page->u.inuse.type_info = 0;
1766 return 0;
1769 /* Noone else is updating simultaneously. */
1770 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1773 return 1;
1776 /*
1777 * Local variables:
1778 * mode: C
1779 * c-set-style: "BSD"
1780 * c-basic-offset: 4
1781 * tab-width: 4
1782 * indent-tabs-mode: nil
1783 * End:
1784 */