ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 11636:3470d9cd27e5

[IA64] Modify p2m converter to avoid hypervisor crash

The crash occurs during destruction of a VT-i domain with PV-on-HVM.

Signed-off-by: Tsunehisa Doi <Doi.Tsunehisa@jp.fujitsu.com>
Signed-off-by: Tomonari Horikoshi <t.horikoshi@jp.fujitsu.com>
author awilliam@xenbuild.aw
date Sun Oct 01 10:48:40 2006 -0600 (2006-10-01)
parents dc9fa4dcd19c
children 5c97ef4c7147
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * Some structures are accessed concurrently by multiple CPUs.
17 * Here is the list of shared structures and the operations which
18 * read/write them.
19 *
20 * - struct page_info
21 * This is a Xen-global resource; it can be accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: software tlb caches. These are per-VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of vcpus other than the current one,
34 * so there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is also read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set the VHPT entry and vcpu->arch.{i, d}tlb;
49 * the actual update is done by vcpu_itc_no_srlz().
50 *
51 * - the P2M table
52 * domain->mm and the pgd, pud, pmd and pte table pages.
53 * This structure is used to convert a domain pseudo-physical address
54 * into a machine address. It is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This table converts a machine address into a pseudo-physical
75 * address. It is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_M2P_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * The P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: when creating/destroying a domain, we don't need to care about
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. P2M(M2P(p)) = p and M2P(P2M(m)) = m
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * (the old mfn is obtained as a result)
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY)
118 *
119 * Here the memory barrier can be provided by release semantics.
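*
*    A minimal sketch of the xchg flavour of this sequence, using names from
*    this file (the fragment is illustrative, not an actual caller; 'prot'
*    stands for whatever protection bits are wanted):
*
*        set_gpfn_from_mfn(new_mfn, gpfn);                 // new m2p entry
*        smp_mb();                      // m2p update must be visible first
*        old_pte = ptep_xchg(mm, mpaddr, pte, pfn_pte(new_mfn, __pgprot(prot)));
*        old_mfn = pte_pfn(old_pte);
*        set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);    // zap stale m2p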
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race on reading/writing vcpu->arch.{d, i}tlb or a VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Neither tlb insert (vcpu_itc_no_srlz()) nor global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically. So there is a race here.
127 *
128 * To handle it, the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always happens after the tlb insert.
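*
*    Sketched insert side (illustrative pseudo-code only; the real logic
*    lives in vcpu_itc_no_srlz() and its callers):
*
*        again:
*            write the VHPT entry and the vcpu->arch.{d, i}tlb entry;
*            if (the entry's p bit was cleared by a concurrent purge)
*                goto again;              // retry the insert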
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert tlb by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, much like with a sequence lock, using the p2m entry
146 * itself: the reader remembers the p2m entry value it read and inserts the
147 * tlb entry. Then it reads the p2m entry again. If the new value differs
148 * from the value it used, it retries.
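*
*    Sketched reader side (illustrative only; the real code carries the
*    remembered pte pointer/value in a struct p2m_entry, and
*    p2m_entry_changed() below is a hypothetical name for the re-check):
*
*        struct p2m_entry entry;
*    again:
*        pteval = lookup_domain_mpa(d, mpaddr, &entry);
*        // ... insert the tlb entry derived from pteval ...
*        if (p2m_entry_changed(&entry))
*            goto again;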
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu,
158 * so before reading from or writing to a domain page, the page's reference
159 * count must be incremented;
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do this.
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <linux/efi.h>
176 #include <xen/guest_access.h>
177 #include <asm/page.h>
178 #include <public/memory.h>
180 static void domain_page_flush(struct domain* d, unsigned long mpaddr,
181 unsigned long old_mfn, unsigned long new_mfn);
183 extern unsigned long ia64_iobase;
185 static struct domain *dom_xen, *dom_io;
187 // the following is taken from arch_init_memory() @ xen/arch/x86/mm.c
188 void
189 alloc_dom_xen_and_dom_io(void)
190 {
191 /*
192 * Initialise our DOMID_XEN domain.
193 * Any Xen-heap pages that we will allow to be mapped will have
194 * their domain field set to dom_xen.
195 */
196 dom_xen = alloc_domain(DOMID_XEN);
197 BUG_ON(dom_xen == NULL);
199 /*
200 * Initialise our DOMID_IO domain.
201 * This domain owns I/O pages that are within the range of the page_info
202 * array. Mappings occur at the privilege level of the caller.
203 */
204 dom_io = alloc_domain(DOMID_IO);
205 BUG_ON(dom_io == NULL);
206 }
208 // atomic equivalent of the following; heavily depends on the struct page_info layout.
209 // if (page_get_owner(page) == d &&
210 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
211 // put_page(page);
212 // }
213 static void
214 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
215 {
216 u32 _d, _nd;
217 u64 x, nx, y;
219 _d = pickle_domptr(d);
220 y = *((u64*)&page->count_info);
221 do {
222 x = y;
223 _nd = x >> 32;
224 nx = x - 1;
225 __clear_bit(_PGC_allocated, &nx);
227 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
228 struct domain* nd = unpickle_domptr(_nd);
229 if (nd == NULL) {
230 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
231 "sd=%p 0x%x,"
232 " caf=%016lx, taf=%" PRtype_info "\n",
233 (void *) page_to_mfn(page),
234 d, d->domain_id, _d,
235 nd, _nd,
236 x,
237 page->u.inuse.type_info);
238 }
239 break;
240 }
242 BUG_ON((nx & PGC_count_mask) < 1);
243 y = cmpxchg((u64*)&page->count_info, x, nx);
244 } while (unlikely(y != x));
245 }
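/*
 * The loop above is an instance of the usual lock-free read-modify-write
 * pattern on the 64-bit word that packs (count_info, _domain); steal_page()
 * below uses the same pattern.  Schematically (sketch only):
 *
 *     y = *(u64*)&page->count_info;
 *     do {
 *         x = y;                           // snapshot
 *         nx = <new value derived from x>;
 *     } while ((y = cmpxchg((u64*)&page->count_info, x, nx)) != x);
 */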
247 static void
248 relinquish_pte(struct domain* d, pte_t* pte)
249 {
250 unsigned long mfn = pte_pfn(*pte);
251 struct page_info* page;
253 // vmx domains use bits [58:56] to distinguish an io region from memory.
254 // see vmx_build_physmap_table() in vmx_init.c
255 if (!pte_mem(*pte))
256 return;
258 // domain might map IO space or acpi table pages. check it.
259 if (!mfn_valid(mfn))
260 return;
261 page = mfn_to_page(mfn);
262 // The struct page_info corresponding to mfn may or may not exist depending
263 // on CONFIG_VIRTUAL_FRAME_TABLE.
264 // This check is too simplistic; the right way is to check whether this
265 // page belongs to an io area or to acpi pages.
266 if (page_get_owner(page) == NULL) {
267 BUG_ON(page->count_info != 0);
268 return;
269 }
271 if (page_get_owner(page) == d) {
272 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
273 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
274 }
276 try_to_clear_PGC_allocate(d, page);
277 put_page(page);
278 }
280 static void
281 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
282 {
283 unsigned long i;
284 pte_t* pte = pte_offset_map(pmd, offset);
286 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
287 if (!pte_present(*pte))
288 continue;
290 relinquish_pte(d, pte);
291 }
292 pte_free_kernel(pte_offset_map(pmd, offset));
293 }
295 static void
296 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
297 {
298 unsigned long i;
299 pmd_t *pmd = pmd_offset(pud, offset);
301 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
302 if (!pmd_present(*pmd))
303 continue;
305 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
306 }
307 pmd_free(pmd_offset(pud, offset));
308 }
310 static void
311 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
312 {
313 unsigned long i;
314 pud_t *pud = pud_offset(pgd, offset);
316 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
317 if (!pud_present(*pud))
318 continue;
320 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
321 }
322 pud_free(pud_offset(pgd, offset));
323 }
325 void
326 relinquish_mm(struct domain* d)
327 {
328 struct mm_struct* mm = &d->arch.mm;
329 unsigned long i;
330 pgd_t* pgd;
332 if (mm->pgd == NULL)
333 return;
335 pgd = pgd_offset(mm, 0);
336 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
337 if (!pgd_present(*pgd))
338 continue;
340 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
341 }
342 pgd_free(mm->pgd);
343 mm->pgd = NULL;
344 }
346 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
347 void
348 share_xen_page_with_guest(struct page_info *page,
349 struct domain *d, int readonly)
350 {
351 if ( page_get_owner(page) == d )
352 return;
354 #if 1
355 if (readonly) {
356 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
357 }
358 #endif
360 // alloc_xenheap_pages() doesn't initialize page owner.
361 //BUG_ON(page_get_owner(page) != NULL);
363 spin_lock(&d->page_alloc_lock);
365 #ifndef __ia64__
366 /* The incremented type count pins as writable or read-only. */
367 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
368 page->u.inuse.type_info |= PGT_validated | 1;
369 #endif
371 page_set_owner(page, d);
372 wmb(); /* install valid domain ptr before updating refcnt. */
373 ASSERT(page->count_info == 0);
374 page->count_info |= PGC_allocated | 1;
376 if ( unlikely(d->xenheap_pages++ == 0) )
377 get_knownalive_domain(d);
378 list_add_tail(&page->list, &d->xenpage_list);
380 // grant_table_destroy() releases these pages,
381 // but it doesn't clear their m2p entries, so stale entries
382 // might remain. Such stale entries are cleared here.
383 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
385 spin_unlock(&d->page_alloc_lock);
386 }
388 void
389 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
390 {
391 share_xen_page_with_guest(page, dom_xen, readonly);
392 }
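/*
 * Illustrative (hypothetical) usage: expose a xenheap page, for example the
 * domain's shared_info page, writable to the guest:
 *
 *     share_xen_page_with_guest(virt_to_page(d->shared_info), d, 0);
 */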
394 unsigned long
395 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
396 {
397 unsigned long pte;
399 // This function may be called from __gnttab_copy()
400 // during destruction of a VT-i domain with a PV-on-HVM driver.
401 if (unlikely(d->arch.mm.pgd == NULL)) {
402 if (VMX_DOMAIN(d->vcpu[0]))
403 return INVALID_MFN;
404 }
405 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
406 if (!pte) {
407 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
408 }
409 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
410 }
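/*
 * Sketch of a typical use: translating a foreign domain's pseudo-physical
 * frame into a machine address (essentially what callers such as
 * gmfn_to_mfn() do; gpaddr below is the full pseudo-physical address whose
 * frame number is gpfn):
 *
 *     unsigned long mfn = gmfn_to_mfn_foreign(d, gpfn);
 *     if (mfn != INVALID_MFN)
 *         maddr = (mfn << PAGE_SHIFT) | (gpaddr & ~PAGE_MASK);
 */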
412 // given a domain virtual address, pte and page size, extract the metaphysical
413 // address, convert the pte to a physical address for the (possibly different)
414 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
415 // PAGE_SIZE!)
416 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
417 struct p2m_entry* entry)
418 {
419 struct domain *d = current->domain;
420 ia64_itir_t itir = {.itir = itir__};
421 u64 mask, mpaddr, pteval2;
422 u64 arflags;
423 u64 arflags2;
424 u64 maflags2;
426 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
428 // FIXME address had better be pre-validated on insert
429 mask = ~itir_mask(itir.itir);
430 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
432 if (itir.ps > PAGE_SHIFT)
433 itir.ps = PAGE_SHIFT;
435 *logps = itir.ps;
437 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
439 /* Check access rights. */
440 arflags = pteval & _PAGE_AR_MASK;
441 arflags2 = pteval2 & _PAGE_AR_MASK;
442 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
443 #if 0
444 DPRINTK("%s:%d "
445 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
446 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
447 __func__, __LINE__,
448 pteval, arflags, address, itir__,
449 pteval2, arflags2, mpaddr);
450 #endif
451 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
452 }
454 /* Check memory attribute. The switch is on the *requested* memory
455 attribute. */
456 maflags2 = pteval2 & _PAGE_MA_MASK;
457 switch (pteval & _PAGE_MA_MASK) {
458 case _PAGE_MA_NAT:
459 /* NaT pages are always accepted! */
460 break;
461 case _PAGE_MA_UC:
462 case _PAGE_MA_UCE:
463 case _PAGE_MA_WC:
464 if (maflags2 == _PAGE_MA_WB) {
465 /* Don't let domains WB-map uncached addresses.
466 This can happen when domU tries to touch i/o
467 port space. Also prevents possible address
468 aliasing issues. */
469 printf("Warning: UC to WB for mpaddr=%lx\n", mpaddr);
470 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
471 }
472 break;
473 case _PAGE_MA_WB:
474 if (maflags2 != _PAGE_MA_WB) {
475 /* Forbid non-coherent access to coherent memory. */
476 panic_domain(NULL, "try to use WB mem attr on "
477 "UC page, mpaddr=%lx\n", mpaddr);
478 }
479 break;
480 default:
481 panic_domain(NULL, "try to use unknown mem attribute\n");
482 }
484 /* If shadow mode is enabled, virtualize dirty bit. */
485 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
486 u64 mp_page = mpaddr >> PAGE_SHIFT;
487 pteval |= _PAGE_VIRT_D;
489 /* If the page is not already dirty, don't set the dirty bit! */
490 if (mp_page < d->arch.shadow_bitmap_size * 8
491 && !test_bit(mp_page, d->arch.shadow_bitmap))
492 pteval &= ~_PAGE_D;
493 }
495 /* Ignore non-addr bits of pteval2 and force PL0->2
496 (PL3 is unaffected) */
497 return (pteval & ~_PAGE_PPN_MASK) |
498 (pteval2 & _PAGE_PPN_MASK) | _PAGE_PL_2;
499 }
501 // given a current domain metaphysical address, return the physical address
502 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
503 struct p2m_entry* entry)
504 {
505 unsigned long pteval;
507 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
508 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
509 }
511 //XXX should !xxx_present() be used instead of !xxx_none()?
512 // __assign_new_domain_page(), assign_new_domain_page() and
513 // assign_new_domain0_page() are used only during domain creation.
514 // Their accesses aren't racy, so the returned pte_t doesn't need the
515 // volatile qualifier.
516 static pte_t*
517 __lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
518 {
519 struct mm_struct *mm = &d->arch.mm;
520 pgd_t *pgd;
521 pud_t *pud;
522 pmd_t *pmd;
524 BUG_ON(mm->pgd == NULL);
525 pgd = pgd_offset(mm, mpaddr);
526 if (pgd_none(*pgd)) {
527 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
528 }
530 pud = pud_offset(pgd, mpaddr);
531 if (pud_none(*pud)) {
532 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
533 }
535 pmd = pmd_offset(pud, mpaddr);
536 if (pmd_none(*pmd)) {
537 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
538 }
540 return pte_offset_map(pmd, mpaddr);
541 }
543 //XXX should !xxx_present() be used instead of !xxx_none()?
544 // pud, pmd and pte pages are zero-cleared when they are allocated.
545 // Their contents must be visible before they are linked into the tree,
546 // so the cmpxchg must have release semantics.
547 static volatile pte_t*
548 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
549 {
550 struct mm_struct *mm = &d->arch.mm;
551 pgd_t *pgd;
552 pud_t *pud;
553 pmd_t *pmd;
555 BUG_ON(mm->pgd == NULL);
557 pgd = pgd_offset(mm, mpaddr);
558 again_pgd:
559 if (unlikely(pgd_none(*pgd))) {
560 pud_t *old_pud = NULL;
561 pud = pud_alloc_one(mm, mpaddr);
562 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
563 pud_free(pud);
564 goto again_pgd;
565 }
566 }
568 pud = pud_offset(pgd, mpaddr);
569 again_pud:
570 if (unlikely(pud_none(*pud))) {
571 pmd_t* old_pmd = NULL;
572 pmd = pmd_alloc_one(mm, mpaddr);
573 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
574 pmd_free(pmd);
575 goto again_pud;
576 }
577 }
579 pmd = pmd_offset(pud, mpaddr);
580 again_pmd:
581 if (unlikely(pmd_none(*pmd))) {
582 pte_t* old_pte = NULL;
583 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
584 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
585 pte_free_kernel(pte);
586 goto again_pmd;
587 }
588 }
590 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
591 }
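/*
 * Sketch of the race the cmpxchg above resolves: two vcpus populating the
 * same pgd entry at once (the same scheme is used at the pud and pmd levels):
 *
 *     vcpu0: pud0 = pud_alloc_one(mm, mpaddr);
 *     vcpu1: pud1 = pud_alloc_one(mm, mpaddr);
 *     vcpu0: pgd_cmpxchg_rel(mm, pgd, NULL, pud0);   // succeeds, links pud0
 *     vcpu1: pgd_cmpxchg_rel(mm, pgd, NULL, pud1);   // fails
 *     vcpu1: pud_free(pud1); goto again_pgd;         // retries, sees pud0
 */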
593 //XXX should xxx_none() be used instead of !xxx_present()?
594 volatile pte_t*
595 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
596 {
597 struct mm_struct *mm = &d->arch.mm;
598 pgd_t *pgd;
599 pud_t *pud;
600 pmd_t *pmd;
602 BUG_ON(mm->pgd == NULL);
603 pgd = pgd_offset(mm, mpaddr);
604 if (unlikely(!pgd_present(*pgd)))
605 return NULL;
607 pud = pud_offset(pgd, mpaddr);
608 if (unlikely(!pud_present(*pud)))
609 return NULL;
611 pmd = pmd_offset(pud, mpaddr);
612 if (unlikely(!pmd_present(*pmd)))
613 return NULL;
615 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
616 }
618 static volatile pte_t*
619 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
620 {
621 struct mm_struct *mm = &d->arch.mm;
622 pgd_t *pgd;
623 pud_t *pud;
624 pmd_t *pmd;
626 BUG_ON(mm->pgd == NULL);
627 pgd = pgd_offset(mm, mpaddr);
628 if (unlikely(pgd_none(*pgd)))
629 return NULL;
631 pud = pud_offset(pgd, mpaddr);
632 if (unlikely(pud_none(*pud)))
633 return NULL;
635 pmd = pmd_offset(pud, mpaddr);
636 if (unlikely(pmd_none(*pmd)))
637 return NULL;
639 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
640 }
642 unsigned long
643 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
644 {
645 volatile pte_t *pte;
647 pte = lookup_noalloc_domain_pte(d, mpaddr);
648 if (pte == NULL)
649 return INVALID_MFN;
651 if (pte_present(*pte))
652 return (pte->pte & _PFN_MASK);
653 else if (VMX_DOMAIN(d->vcpu[0]))
654 return GPFN_INV_MASK;
655 return INVALID_MFN;
656 }
658 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
659 struct p2m_entry* entry)
660 {
661 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
663 if (pte != NULL) {
664 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
665 if (pte_present(tmp_pte)) {
666 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
667 if (entry != NULL)
668 p2m_entry_set(entry, pte, tmp_pte);
669 return pte_val(tmp_pte);
670 } else if (VMX_DOMAIN(d->vcpu[0]))
671 return GPFN_INV_MASK;
672 }
674 printk("%s: d 0x%p id %d current 0x%p id %d\n",
675 __func__, d, d->domain_id, current, current->vcpu_id);
676 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
677 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
678 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
679 else
680 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
681 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
683 if (entry != NULL)
684 p2m_entry_set(entry, NULL, __pte(0));
685 //XXX This is a workaround until emulation of memory accesses to regions
686 // where no memory or device is attached is implemented.
687 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
688 }
690 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
691 #if 1
692 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
693 {
694 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
695 unsigned long imva;
697 pte &= _PAGE_PPN_MASK;
698 imva = (unsigned long) __va(pte);
699 imva |= mpaddr & ~PAGE_MASK;
700 return (void*)imva;
701 }
702 #else
703 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
704 {
705 unsigned long imva = __gpa_to_mpa(d, mpaddr);
707 return (void *)__va(imva);
708 }
709 #endif
711 /* Allocate a new page for domain and map it to the specified metaphysical
712 address. */
713 static struct page_info *
714 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
715 {
716 struct page_info *p;
717 unsigned long maddr;
718 int ret;
720 BUG_ON(!pte_none(*pte));
722 p = alloc_domheap_page(d);
723 if (unlikely(!p)) {
724 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
725 return(p);
726 }
728 // zero out pages for security reasons
729 clear_page(page_to_virt(p));
730 maddr = page_to_maddr (p);
731 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
732 && maddr < __get_cpu_var(vhpt_pend))) {
733 /* FIXME: how can this happen ?
734 vhpt is allocated by alloc_domheap_page. */
735 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
736 maddr);
737 }
739 ret = get_page(p, d);
740 BUG_ON(ret == 0);
741 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
742 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
743 // because set_pte_rel() has release semantics
744 set_pte_rel(pte,
745 pfn_pte(maddr >> PAGE_SHIFT,
746 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
748 smp_mb();
749 return p;
750 }
752 struct page_info *
753 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
754 {
755 pte_t *pte = __lookup_alloc_domain_pte(d, mpaddr);
757 if (!pte_none(*pte))
758 return NULL;
760 return __assign_new_domain_page(d, mpaddr, pte);
761 }
763 void
764 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
765 {
766 pte_t *pte;
768 BUG_ON(d != dom0);
769 pte = __lookup_alloc_domain_pte(d, mpaddr);
770 if (pte_none(*pte)) {
771 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
772 if (p == NULL) {
773 panic("%s: can't allocate page for dom0", __func__);
774 }
775 }
776 }
778 static unsigned long
779 flags_to_prot (unsigned long flags)
780 {
781 unsigned long res = _PAGE_PL_2 | __DIRTY_BITS;
783 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
784 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
786 return res;
787 }
789 /* map a physical address to the specified metaphysical addr */
790 // flags: currently only ASSIGN_readonly, ASSIGN_nocache
791 // This is called by assign_domain_mmio_page().
792 // So accesses to the pte are racy.
793 void
794 __assign_domain_page(struct domain *d,
795 unsigned long mpaddr, unsigned long physaddr,
796 unsigned long flags)
797 {
798 volatile pte_t *pte;
799 pte_t old_pte;
800 pte_t new_pte;
801 pte_t ret_pte;
802 unsigned long prot = flags_to_prot(flags);
804 pte = lookup_alloc_domain_pte(d, mpaddr);
806 old_pte = __pte(0);
807 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
808 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
809 if (pte_val(ret_pte) == pte_val(old_pte))
810 smp_mb();
811 }
813 /* get_page() and map a physical address to the specified metaphysical addr */
814 void
815 assign_domain_page(struct domain *d,
816 unsigned long mpaddr, unsigned long physaddr)
817 {
818 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
819 int ret;
821 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
822 ret = get_page(page, d);
823 BUG_ON(ret == 0);
824 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
825 // because __assign_domain_page() uses set_pte_rel() which has
826 // release semantics, smp_mb() isn't needed.
827 __assign_domain_page(d, mpaddr, physaddr, ASSIGN_writable);
828 }
830 int
831 ioports_permit_access(struct domain *d, unsigned long fp, unsigned long lp)
832 {
833 int ret;
834 unsigned long off;
835 unsigned long fp_offset;
836 unsigned long lp_offset;
838 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
839 if (ret != 0)
840 return ret;
842 /* Domain 0 doesn't virtualize the IO port space. */
843 if (d == dom0)
844 return 0;
846 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
847 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
849 for (off = fp_offset; off <= lp_offset; off += PAGE_SIZE)
850 __assign_domain_page(d, IO_PORTS_PADDR + off,
851 __pa(ia64_iobase) + off, ASSIGN_nocache);
853 return 0;
854 }
856 static int
857 ioports_has_allowed(struct domain *d, unsigned long fp, unsigned long lp)
858 {
859 unsigned long i;
860 for (i = fp; i < lp; i++)
861 if (rangeset_contains_singleton(d->arch.ioport_caps, i))
862 return 1;
863 return 0;
864 }
866 int
867 ioports_deny_access(struct domain *d, unsigned long fp, unsigned long lp)
868 {
869 int ret;
870 struct mm_struct *mm = &d->arch.mm;
871 unsigned long off;
872 unsigned long io_ports_base;
873 unsigned long fp_offset;
874 unsigned long lp_offset;
876 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
877 if (ret != 0)
878 return ret;
879 if (d == dom0)
880 io_ports_base = __pa(ia64_iobase);
881 else
882 io_ports_base = IO_PORTS_PADDR;
884 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
885 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
887 for (off = fp_offset; off < lp_offset; off += PAGE_SIZE) {
888 unsigned long mpaddr = io_ports_base + off;
889 unsigned long port;
890 volatile pte_t *pte;
891 pte_t old_pte;
893 port = IO_SPACE_SPARSE_DECODING (off);
894 if (port < fp || port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1 > lp) {
895 /* Maybe this covers an allowed port. */
896 if (ioports_has_allowed(d, port,
897 port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1))
898 continue;
899 }
901 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
902 BUG_ON(pte == NULL);
903 BUG_ON(pte_none(*pte));
905 // clear pte
906 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
907 }
908 domain_flush_vtlb_all();
909 return 0;
910 }
912 static void
913 assign_domain_same_page(struct domain *d,
914 unsigned long mpaddr, unsigned long size,
915 unsigned long flags)
916 {
917 //XXX optimization
918 unsigned long end = PAGE_ALIGN(mpaddr + size);
919 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
920 __assign_domain_page(d, mpaddr, mpaddr, flags);
921 }
922 }
924 int
925 efi_mmio(unsigned long physaddr, unsigned long size)
926 {
927 void *efi_map_start, *efi_map_end;
928 u64 efi_desc_size;
929 void* p;
931 efi_map_start = __va(ia64_boot_param->efi_memmap);
932 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
933 efi_desc_size = ia64_boot_param->efi_memdesc_size;
935 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
936 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
937 unsigned long start = md->phys_addr;
938 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
940 if (start <= physaddr && physaddr < end) {
941 if ((physaddr + size) > end) {
942 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
943 __func__, __LINE__, physaddr, size);
944 return 0;
945 }
947 // for io space
948 if (md->type == EFI_MEMORY_MAPPED_IO ||
949 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
950 return 1;
951 }
953 // for runtime
954 // see efi_enter_virtual_mode(void)
955 // in linux/arch/ia64/kernel/efi.c
956 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
957 !(md->attribute & EFI_MEMORY_WB)) {
958 return 1;
959 }
961 return 0;
962 }
964 if (physaddr < start) {
965 break;
966 }
967 }
969 return 1;
970 }
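/*
 * Example of the decision made above (hypothetical descriptor): for an
 * EFI_MEMORY_MAPPED_IO descriptor covering [0x80000000, 0x80100000),
 * efi_mmio(0x80001000, 0x1000) returns 1, while efi_mmio(0x800ff000, 0x2000)
 * returns 0 because the range crosses the end of the descriptor.
 */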
972 unsigned long
973 assign_domain_mmio_page(struct domain *d,
974 unsigned long mpaddr, unsigned long size)
975 {
976 if (size == 0) {
977 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
978 __func__, d, mpaddr, size);
979 }
980 if (!efi_mmio(mpaddr, size)) {
981 #ifndef NDEBUG
982 DPRINTK("%s:%d domain %p mpaddr 0x%lx size = 0x%lx\n",
983 __func__, __LINE__, d, mpaddr, size);
984 #endif
985 return -EINVAL;
986 }
987 assign_domain_same_page(d, mpaddr, size, ASSIGN_writable | ASSIGN_nocache);
988 return mpaddr;
989 }
991 unsigned long
992 assign_domain_mach_page(struct domain *d,
993 unsigned long mpaddr, unsigned long size,
994 unsigned long flags)
995 {
996 assign_domain_same_page(d, mpaddr, size, flags);
997 return mpaddr;
998 }
1000 // The caller must get_page(mfn_to_page(mfn)) before calling.
1001 // The caller must call set_gpfn_from_mfn() beforehand if necessary;
1002 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1003 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1004 // flags: currently only ASSIGN_readonly
1005 static void
1006 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1007 unsigned long mfn, unsigned long flags)
1009 struct mm_struct *mm = &d->arch.mm;
1010 volatile pte_t* pte;
1011 pte_t old_pte;
1012 pte_t npte;
1013 unsigned long prot = flags_to_prot(flags);
1015 pte = lookup_alloc_domain_pte(d, mpaddr);
1017 // update pte
1018 npte = pfn_pte(mfn, __pgprot(prot));
1019 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1020 if (pte_mem(old_pte)) {
1021 unsigned long old_mfn = pte_pfn(old_pte);
1023 // The mfn == old_mfn case can happen when a domain maps a granted page
1024 // twice with the same pseudo-physical address.
1025 // It makes no sense, but it is allowed.
1026 // __gnttab_map_grant_ref()
1027 // => create_host_mapping()
1028 // => assign_domain_page_replace()
1029 if (mfn != old_mfn) {
1030 struct page_info* old_page = mfn_to_page(old_mfn);
1032 if (page_get_owner(old_page) == d ||
1033 page_get_owner(old_page) == NULL) {
1034 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1035 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1038 domain_page_flush(d, mpaddr, old_mfn, mfn);
1040 try_to_clear_PGC_allocate(d, old_page);
1041 put_page(old_page);
1046 // The caller must get_page(new_page) beforehand.
1047 // Only steal_page() calls this function.
1048 static int
1049 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1050 struct page_info* old_page,
1051 struct page_info* new_page,
1052 unsigned long flags)
1054 struct mm_struct *mm = &d->arch.mm;
1055 volatile pte_t* pte;
1056 unsigned long old_mfn;
1057 unsigned long old_arflags;
1058 pte_t old_pte;
1059 unsigned long new_mfn;
1060 unsigned long new_prot;
1061 pte_t new_pte;
1062 pte_t ret_pte;
1064 pte = lookup_alloc_domain_pte(d, mpaddr);
1066 again:
1067 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1068 old_mfn = page_to_mfn(old_page);
1069 old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
1070 if (!pte_present(old_pte)) {
1071 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
1072 __func__, pte_val(old_pte), old_arflags, old_mfn);
1073 return -EINVAL;
1076 new_prot = flags_to_prot(flags);
1077 new_mfn = page_to_mfn(new_page);
1078 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1080 // update pte
1081 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1082 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1083 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1084 goto again;
1087 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
1088 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1089 __func__,
1090 pte_val(old_pte), old_arflags, old_mfn,
1091 pte_val(ret_pte), pte_pfn(ret_pte));
1092 return -EINVAL;
1095 BUG_ON(!pte_mem(old_pte));
1096 BUG_ON(page_get_owner(old_page) != d);
1097 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1098 BUG_ON(old_mfn == new_mfn);
1100 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1102 domain_page_flush(d, mpaddr, old_mfn, new_mfn);
1103 put_page(old_page);
1104 return 0;
1107 static void
1108 zap_domain_page_one(struct domain *d, unsigned long mpaddr, unsigned long mfn)
1110 struct mm_struct *mm = &d->arch.mm;
1111 volatile pte_t *pte;
1112 pte_t old_pte;
1113 struct page_info *page;
1115 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1116 if (pte == NULL)
1117 return;
1118 if (pte_none(*pte))
1119 return;
1121 if (mfn == INVALID_MFN) {
1122 // clear pte
1123 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1124 mfn = pte_pfn(old_pte);
1125 } else {
1126 unsigned long old_arflags;
1127 pte_t new_pte;
1128 pte_t ret_pte;
1130 again:
1131 // memory_exchange() calls guest_physmap_remove_page() with
1132 // a stolen page, i.e. page owner == NULL.
1133 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1134 page_get_owner(mfn_to_page(mfn)) != NULL);
1135 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1136 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1137 new_pte = __pte(0);
1139 // update pte
1140 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1141 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1142 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1143 goto again;
1146 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1147 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1148 __func__,
1149 pte_val(old_pte), old_arflags, mfn,
1150 pte_val(ret_pte), pte_pfn(ret_pte));
1151 return;
1153 BUG_ON(mfn != pte_pfn(ret_pte));
1156 page = mfn_to_page(mfn);
1157 BUG_ON((page->count_info & PGC_count_mask) == 0);
1159 if (page_get_owner(page) == d ||
1160 page_get_owner(page) == NULL) {
1161 // exchange_memory() calls
1162 // steal_page()
1163 // page owner is set to NULL
1164 // guest_physmap_remove_page()
1165 // zap_domain_page_one()
1166 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1167 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1170 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
1172 if (page_get_owner(page) != NULL) {
1173 try_to_clear_PGC_allocate(d, page);
1175 put_page(page);
1178 unsigned long
1179 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1180 unsigned int extent_order)
1182 if (extent_order != 0) {
1183 //XXX
1184 return -ENOSYS;
1187 zap_domain_page_one(d, gpfn << PAGE_SHIFT, INVALID_MFN);
1188 return 0;
1191 unsigned long
1192 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1193 unsigned long flags, domid_t domid)
1195 int error = 0;
1196 struct domain* rd;
1198 /* A domain is not allowed to request ASSIGN_nocache. */
1199 if (flags & ASSIGN_nocache)
1200 return -EINVAL;
1202 rd = find_domain_by_id(domid);
1203 if (unlikely(rd == NULL)) {
1204 switch (domid) {
1205 case DOMID_XEN:
1206 rd = dom_xen;
1207 break;
1208 case DOMID_IO:
1209 rd = dom_io;
1210 break;
1211 default:
1212 DPRINTK("d 0x%p domid %d "
1213 "gpfn 0x%lx mfn 0x%lx flags 0x%lx domid %d\n",
1214 d, d->domain_id, gpfn, mfn, flags, domid);
1215 return -ESRCH;
1217 BUG_ON(rd == NULL);
1218 get_knownalive_domain(rd);
1221 if (unlikely(rd == d || !mfn_valid(mfn))) {
1222 error = -EINVAL;
1223 goto out1;
1225 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1226 error = -EINVAL;
1227 goto out1;
1229 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1230 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1231 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1232 // don't update the m2p table because this page belongs to rd, not d.
1233 out1:
1234 put_domain(rd);
1235 return error;
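/*
 * Illustrative (hypothetical) usage from the dom0 side: map machine frame
 * 'mfn', owned by domain 'domid', at dom0's pseudo-physical frame 'gpfn',
 * and later tear the mapping down again:
 *
 *     err = dom0vp_add_physmap(d, gpfn, mfn, ASSIGN_writable, domid);
 *     ...
 *     dom0vp_zap_physmap(d, gpfn, 0);
 */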
1238 // grant table host mapping
1239 // mpaddr: host_addr: pseudo physical address
1240 // mfn: frame: machine page frame
1241 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1242 int
1243 create_grant_host_mapping(unsigned long gpaddr,
1244 unsigned long mfn, unsigned int flags)
1246 struct domain* d = current->domain;
1247 struct page_info* page;
1248 int ret;
1250 if (flags & (GNTMAP_device_map |
1251 GNTMAP_application_map | GNTMAP_contains_pte)) {
1252 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1253 return GNTST_general_error;
1256 BUG_ON(!mfn_valid(mfn));
1257 page = mfn_to_page(mfn);
1258 ret = get_page(page, page_get_owner(page));
1259 BUG_ON(ret == 0);
1260 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1261 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1262 assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
1263 ASSIGN_readonly: ASSIGN_writable);
1264 return GNTST_okay;
1267 // grant table host unmapping
1268 int
1269 destroy_grant_host_mapping(unsigned long gpaddr,
1270 unsigned long mfn, unsigned int flags)
1272 struct domain* d = current->domain;
1273 volatile pte_t* pte;
1274 unsigned long cur_arflags;
1275 pte_t cur_pte;
1276 pte_t new_pte;
1277 pte_t old_pte;
1278 struct page_info* page;
1280 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1281 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1282 return GNTST_general_error;
1285 pte = lookup_noalloc_domain_pte(d, gpaddr);
1286 if (pte == NULL) {
1287 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx\n", __func__, gpaddr, mfn);
1288 return GNTST_general_error;
1291 again:
1292 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1293 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1294 if (!pte_present(cur_pte)) {
1295 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1296 __func__, gpaddr, mfn, pte_val(cur_pte));
1297 return GNTST_general_error;
1299 new_pte = __pte(0);
1301 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1302 if (unlikely(!pte_present(old_pte))) {
1303 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1304 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1305 return GNTST_general_error;
1307 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1308 if (pte_pfn(old_pte) == mfn) {
1309 goto again;
1311 DPRINTK("%s gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1312 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1313 return GNTST_general_error;
1315 BUG_ON(pte_pfn(old_pte) != mfn);
1317 domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
1319 page = mfn_to_page(mfn);
1320 BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1321 put_page(page);
1323 return GNTST_okay;
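/*
 * Grant-mapping lifecycle as seen from this file (simplified sketch):
 *
 *     __gnttab_map_grant_ref()
 *       => create_grant_host_mapping(host_addr, frame, flags)
 *            => assign_domain_page_replace()                 // install pte
 *     __gnttab_unmap_grant_ref()
 *       => destroy_grant_host_mapping(host_addr, frame, flags)
 *            => ptep_cmpxchg_rel() + domain_page_flush() + put_page()
 */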
1326 // this heavily depends on the struct page layout.
1327 // gnttab_transfer() calls steal_page() with memflags = 0;
1328 // for a grant table transfer, we must fill the gpfn slot with a replacement page.
1329 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount;
1330 // for memory exchange, we don't have to fill the slot because
1331 // memory_exchange() does it.
1332 int
1333 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1335 #if 0 /* if big endian */
1336 # error "implement big endian version of steal_page()"
1337 #endif
1338 u32 _d, _nd;
1339 u64 x, nx, y;
1341 if (page_get_owner(page) != d) {
1342 DPRINTK("%s d 0x%p owner 0x%p\n", __func__, d, page_get_owner(page));
1343 return -1;
1346 if (!(memflags & MEMF_no_refcount)) {
1347 unsigned long gpfn;
1348 struct page_info *new;
1349 unsigned long new_mfn;
1350 int ret;
1352 new = alloc_domheap_page(d);
1353 if (new == NULL) {
1354 DPRINTK("alloc_domheap_page() failed\n");
1355 return -1;
1357 // zero out pages for security reasons
1358 clear_page(page_to_virt(new));
1359 // assign_domain_page_cmpxchg_rel() has release semantics
1360 // so smp_mb() isn't needed.
1362 ret = get_page(new, d);
1363 BUG_ON(ret == 0);
1365 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1366 if (gpfn == INVALID_M2P_ENTRY) {
1367 free_domheap_page(new);
1368 return -1;
1370 new_mfn = page_to_mfn(new);
1371 set_gpfn_from_mfn(new_mfn, gpfn);
1372 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1373 // has release semantics.
1375 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1376 ASSIGN_writable);
1377 if (ret < 0) {
1378 DPRINTK("assign_domain_page_cmpxchg_rel failed %d\n", ret);
1379 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1380 free_domheap_page(new);
1381 return -1;
1385 spin_lock(&d->page_alloc_lock);
1387 /*
1388 * The tricky bit: atomically release ownership while there is just one
1389 * benign reference to the page (PGC_allocated). If that reference
1390 * disappears then the deallocation routine will safely spin.
1391 */
1392 _d = pickle_domptr(d);
1393 y = *((u64*)&page->count_info);
1394 do {
1395 x = y;
1396 nx = x & 0xffffffff;
1397 // page->count_info: untouched
1398 // page->u.inuse._domain = 0;
1399 _nd = x >> 32;
1401 if (unlikely(!(memflags & MEMF_no_refcount) &&
1402 ((x & (PGC_count_mask | PGC_allocated)) !=
1403 (1 | PGC_allocated))) ||
1405 // when MEMF_no_refcount, page isn't de-assigned from
1406 // this domain yet. So count_info = 2
1407 unlikely((memflags & MEMF_no_refcount) &&
1408 ((x & (PGC_count_mask | PGC_allocated)) !=
1409 (2 | PGC_allocated))) ||
1411 unlikely(_nd != _d)) {
1412 struct domain* nd = unpickle_domptr(_nd);
1413 if (nd == NULL) {
1414 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1415 "sd=%p 0x%x,"
1416 " caf=%016lx, taf=%" PRtype_info
1417 " memflags 0x%x\n",
1418 (void *) page_to_mfn(page),
1419 d, d->domain_id, _d,
1420 nd, _nd,
1421 x,
1422 page->u.inuse.type_info,
1423 memflags);
1424 } else {
1425 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1426 "sd=%p(%u) 0x%x,"
1427 " caf=%016lx, taf=%" PRtype_info
1428 " memflags 0x%x\n",
1429 (void *) page_to_mfn(page),
1430 d, d->domain_id, _d,
1431 nd, nd->domain_id, _nd,
1432 x,
1433 page->u.inuse.type_info,
1434 memflags);
1436 spin_unlock(&d->page_alloc_lock);
1437 return -1;
1440 y = cmpxchg((u64*)&page->count_info, x, nx);
1441 } while (unlikely(y != x));
1443 /*
1444 * Unlink from 'd'. At least one reference remains (now anonymous), so
1445 * no one else is spinning to try to delete this page from 'd'.
1446 */
1447 if ( !(memflags & MEMF_no_refcount) )
1448 d->tot_pages--;
1449 list_del(&page->list);
1451 spin_unlock(&d->page_alloc_lock);
1452 return 0;
1455 void
1456 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1457 unsigned long mfn)
1459 int ret;
1461 BUG_ON(!mfn_valid(mfn));
1462 ret = get_page(mfn_to_page(mfn), d);
1463 BUG_ON(ret == 0);
1464 set_gpfn_from_mfn(mfn, gpfn);
1465 smp_mb();
1466 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
1468 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1471 void
1472 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1473 unsigned long mfn)
1475 BUG_ON(mfn == 0);//XXX
1476 zap_domain_page_one(d, gpfn << PAGE_SHIFT, mfn);
1479 //XXX sledgehammer approach;
1480 // a finer range should be flushed.
1481 static void
1482 domain_page_flush(struct domain* d, unsigned long mpaddr,
1483 unsigned long old_mfn, unsigned long new_mfn)
1485 if (shadow_mode_enabled(d))
1486 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1488 domain_flush_vtlb_all();
1491 int
1492 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1494 volatile pte_t * pte;
1496 pte = lookup_noalloc_domain_pte(d, mpaddr);
1497 if(pte != NULL && !pte_none(*pte))
1498 return 1;
1499 return 0;
1502 /* Flush cache of domain d. */
1503 void domain_cache_flush (struct domain *d, int sync_only)
1505 struct mm_struct *mm = &d->arch.mm;
1506 pgd_t *pgd = mm->pgd;
1507 unsigned long maddr;
1508 int i,j,k, l;
1509 int nbr_page = 0;
1510 void (*flush_func)(unsigned long start, unsigned long end);
1511 extern void flush_dcache_range (unsigned long, unsigned long);
1513 if (sync_only)
1514 flush_func = &flush_icache_range;
1515 else
1516 flush_func = &flush_dcache_range;
1518 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1519 pud_t *pud;
1520 if (!pgd_present(*pgd))
1521 continue;
1522 pud = pud_offset(pgd, 0);
1523 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1524 pmd_t *pmd;
1525 if (!pud_present(*pud))
1526 continue;
1527 pmd = pmd_offset(pud, 0);
1528 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1529 pte_t *pte;
1530 if (!pmd_present(*pmd))
1531 continue;
1532 pte = pte_offset_map(pmd, 0);
1533 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1534 if (!pte_present(*pte))
1535 continue;
1536 /* Convert PTE to maddr. */
1537 maddr = __va_ul (pte_val(*pte)
1538 & _PAGE_PPN_MASK);
1539 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1540 nbr_page++;
1545 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1548 #ifdef VERBOSE
1549 #define MEM_LOG(_f, _a...) \
1550 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1551 current->domain->domain_id , __LINE__ , ## _a )
1552 #else
1553 #define MEM_LOG(_f, _a...) ((void)0)
1554 #endif
1556 static void free_page_type(struct page_info *page, u32 type)
1560 static int alloc_page_type(struct page_info *page, u32 type)
1562 return 1;
1565 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1567 void *p = alloc_xenheap_pages(order);
1569 memset(p,0,PAGE_SIZE<<order);
1570 return (unsigned long)p;
1573 void __free_pages(struct page_info *page, unsigned int order)
1575 if (order) BUG();
1576 free_xenheap_page(page);
1579 void *pgtable_quicklist_alloc(void)
1581 void *p;
1582 p = alloc_xenheap_pages(0);
1583 if (p)
1584 clear_page(p);
1585 return p;
1588 void pgtable_quicklist_free(void *pgtable_entry)
1590 free_xenheap_page(pgtable_entry);
1593 void put_page_type(struct page_info *page)
1595 u32 nx, x, y = page->u.inuse.type_info;
1597 again:
1598 do {
1599 x = y;
1600 nx = x - 1;
1602 ASSERT((x & PGT_count_mask) != 0);
1604 /*
1605 * The page should always be validated while a reference is held. The
1606 * exception is during domain destruction, when we forcibly invalidate
1607 * page-table pages if we detect a referential loop.
1608 * See domain.c:relinquish_list().
1609 */
1610 ASSERT((x & PGT_validated) ||
1611 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1613 if ( unlikely((nx & PGT_count_mask) == 0) )
1615 /* Record TLB information for flush later. Races are harmless. */
1616 page->tlbflush_timestamp = tlbflush_current_time();
1618 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1619 likely(nx & PGT_validated) )
1621 /*
1622 * Page-table pages must be unvalidated when count is zero. The
1623 * 'free' is safe because the refcnt is non-zero and validated
1624 * bit is clear => other ops will spin or fail.
1625 */
1626 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1627 x & ~PGT_validated)) != x) )
1628 goto again;
1629 /* We cleared the 'valid bit' so we do the clean up. */
1630 free_page_type(page, x);
1631 /* Carry on, but with the 'valid bit' now clear. */
1632 x &= ~PGT_validated;
1633 nx &= ~PGT_validated;
1637 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1641 int get_page_type(struct page_info *page, u32 type)
1643 u32 nx, x, y = page->u.inuse.type_info;
1645 ASSERT(!(type & ~PGT_type_mask));
1647 again:
1648 do {
1649 x = y;
1650 nx = x + 1;
1651 if ( unlikely((nx & PGT_count_mask) == 0) )
1653 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1654 return 0;
1656 else if ( unlikely((x & PGT_count_mask) == 0) )
1658 if ( (x & PGT_type_mask) != type )
1660 /*
1661 * On type change we check to flush stale TLB entries. This
1662 * may be unnecessary (e.g., page was GDT/LDT) but those
1663 * circumstances should be very rare.
1664 */
1665 cpumask_t mask =
1666 page_get_owner(page)->domain_dirty_cpumask;
1667 tlbflush_filter(mask, page->tlbflush_timestamp);
1669 if ( unlikely(!cpus_empty(mask)) )
1671 perfc_incrc(need_flush_tlb_flush);
1672 flush_tlb_mask(mask);
1675 /* We lose existing type, back pointer, and validity. */
1676 nx &= ~(PGT_type_mask | PGT_validated);
1677 nx |= type;
1679 /* No special validation needed for writable pages. */
1680 /* Page tables and GDT/LDT need to be scanned for validity. */
1681 if ( type == PGT_writable_page )
1682 nx |= PGT_validated;
1685 else if ( unlikely((x & PGT_type_mask) != type) )
1687 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1688 (type != PGT_l1_page_table) )
1689 MEM_LOG("Bad type (saw %08x != exp %08x) "
1690 "for mfn %016lx (pfn %016lx)",
1691 x, type, page_to_mfn(page),
1692 get_gpfn_from_mfn(page_to_mfn(page)));
1693 return 0;
1695 else if ( unlikely(!(x & PGT_validated)) )
1697 /* Someone else is updating validation of this page. Wait... */
1698 while ( (y = page->u.inuse.type_info) == x )
1699 cpu_relax();
1700 goto again;
1703 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
1705 if ( unlikely(!(nx & PGT_validated)) )
1707 /* Try to validate page type; drop the new reference on failure. */
1708 if ( unlikely(!alloc_page_type(page, type)) )
1710 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
1711 ": caf=%08x taf=%" PRtype_info,
1712 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1713 type, page->count_info, page->u.inuse.type_info);
1714 /* No one else can get a reference. We hold the only ref. */
1715 page->u.inuse.type_info = 0;
1716 return 0;
1719 /* No one else is updating simultaneously. */
1720 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1723 return 1;
1726 int memory_is_conventional_ram(paddr_t p)
1728 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
1732 long
1733 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
1735 switch (op) {
1736 case XENMEM_add_to_physmap:
1738 struct xen_add_to_physmap xatp;
1739 unsigned long prev_mfn, mfn = 0, gpfn;
1740 struct domain *d;
1742 if (copy_from_guest(&xatp, arg, 1))
1743 return -EFAULT;
1745 if (xatp.domid == DOMID_SELF) {
1746 d = current->domain;
1747 get_knownalive_domain(d);
1749 else if (!IS_PRIV(current->domain))
1750 return -EPERM;
1751 else if ((d = find_domain_by_id(xatp.domid)) == NULL)
1752 return -ESRCH;
1754 /* This hypercall is used for VT-i domain only */
1755 if (!VMX_DOMAIN(d->vcpu[0])) {
1756 put_domain(d);
1757 return -ENOSYS;
1760 switch (xatp.space) {
1761 case XENMAPSPACE_shared_info:
1762 if (xatp.idx == 0)
1763 mfn = virt_to_mfn(d->shared_info);
1764 break;
1765 case XENMAPSPACE_grant_table:
1766 if (xatp.idx < NR_GRANT_FRAMES)
1767 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
1768 break;
1769 default:
1770 break;
1773 LOCK_BIGLOCK(d);
1775 /* Remove previously mapped page if it was present. */
1776 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
1777 if (prev_mfn && mfn_valid(prev_mfn)) {
1778 if (IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)))
1779 /* Xen heap frames are simply unhooked from this phys slot. */
1780 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
1781 else
1782 /* Normal domain memory is freed, to avoid leaking memory. */
1783 guest_remove_page(d, xatp.gpfn);
1786 /* Unmap from old location, if any. */
1787 gpfn = get_gpfn_from_mfn(mfn);
1788 if (gpfn != INVALID_M2P_ENTRY)
1789 guest_physmap_remove_page(d, gpfn, mfn);
1791 /* Map at new location. */
1792 guest_physmap_add_page(d, xatp.gpfn, mfn);
1794 UNLOCK_BIGLOCK(d);
1796 put_domain(d);
1798 break;
1801 default:
1802 return -ENOSYS;
1805 return 0;
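/*
 * Illustrative (hypothetical) guest-side usage, matching the PV-on-HVM
 * scenario this changeset is about: a VT-i guest's PV driver asks Xen to map
 * its grant table frames into its own pseudo-physical space:
 *
 *     struct xen_add_to_physmap xatp = {
 *         .domid = DOMID_SELF,
 *         .space = XENMAPSPACE_grant_table,
 *         .idx   = i,
 *         .gpfn  = gpfn_base + i,
 *     };
 *     HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
 */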
1808 /*
1809 * Local variables:
1810 * mode: C
1811 * c-set-style: "BSD"
1812 * c-basic-offset: 4
1813 * tab-width: 4
1814 * indent-tabs-mode: nil
1815 * End:
1816 */