ia64/xen-unstable

xen/arch/ia64/xen/mm.c @ 14196:9d36026b1b43

xen: Cleanups and bug fixes after the rcu_lock_domain patch.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Thu Mar 01 11:38:55 2007 +0000 (2007-03-01)
parents a42c58791d8a
children 50d5bf02e59e
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure can be accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of vcpus other than current.
34 * So there are potential races in reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and the pgd, pud, pmd and pte table pages.
53 * This structure is used to convert a domain pseudo physical address
54 * to a machine address. It is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 Part II chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * P2M entries are shared by more than one vcpu.
101 * So they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. M2P(P2M(p)) = p and P2M(M2P(m)) = m
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * the old_mfn is obtained as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here the memory barrier can be achieved by release semantics.
120 *
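* A minimal sketch of that update order, using helpers that appear later in
* this file (illustrative only; mm, mpaddr, gpfn, ptep, new_pte and new_mfn
* are assumed to have been set up by the caller):
*
*   set_gpfn_from_mfn(new_mfn, gpfn);                       // new m2p entry
*   smp_mb();                                  // or rely on release semantics
*   old_pte = ptep_xchg(mm, mpaddr, ptep, new_pte);         // atomic p2m update
*   smp_mb();
*   set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY); // zap the old m2p entry
*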
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the
124 * tlb cache globally. Neither a tlb insert (vcpu_itc_no_srlz()) nor a
125 * global tlb purge (domain_flush_vtlb_range() and domain_flush_vtlb_all())
126 * can update vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically.
127 * So there is a race here.
128 *
129 * To handle it, the vcpu->arch.{d, i}tlb.p bit is checked: after inserting
130 * a tlb entry, check the p bit and retry the insert if it has been cleared.
131 * Thus a concurrent global tlb purge effectively happens after the tlb insert.
132 *
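* A hedged sketch of the insert-side retry (pseudocode only; the real work
* is done in vcpu_itc_no_srlz() and its callers):
*
*   do {
*       ... write vcpu->arch.dtlb (or itlb) and the VHPT entry ...
*   } while (!vcpu->arch.dtlb.p);   // purged underneath us? insert again
*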
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and the
142 * tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry
146 * itself: the reader remembers the value read from the p2m entry and inserts
147 * the tlb entry. Then it reads the p2m entry again. If the new p2m entry
148 * value differs from the value it used, it retries.
149 *
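* Reader-side sketch of that check (illustrative only; struct p2m_entry and
* lookup_domain_mpa() are used below in this file, the retry helper name is
* an assumption):
*
*   struct p2m_entry entry;
* again:
*   pteval = lookup_domain_mpa(d, mpaddr, &entry);  // remember the p2m value
*   ... insert the tlb entry derived from pteval ...
*   if (p2m_entry_retry(&entry))                    // p2m changed meanwhile?
*       goto again;
*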
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented.
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do this.
162 *
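* Sketch of the pattern those functions follow (illustrative only):
*
*   if (get_page(page, d) == 0)
*       return ...;                 // the page was de-assigned under us
*   ... read or write the domain page ...
*   put_page(page);
*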
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <xen/guest_access.h>
179 #include <asm/page.h>
180 #include <public/memory.h>
182 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
183 volatile pte_t* ptep, pte_t old_pte,
184 struct page_info* page);
186 extern unsigned long ia64_iobase;
188 static struct domain *dom_xen, *dom_io;
190 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
191 void
192 alloc_dom_xen_and_dom_io(void)
193 {
194 /*
195 * Initialise our DOMID_XEN domain.
196 * Any Xen-heap pages that we will allow to be mapped will have
197 * their domain field set to dom_xen.
198 */
199 dom_xen = alloc_domain(DOMID_XEN);
200 BUG_ON(dom_xen == NULL);
202 /*
203 * Initialise our DOMID_IO domain.
204 * This domain owns I/O pages that are within the range of the page_info
205 * array. Mappings occur at the priv of the caller.
206 */
207 dom_io = alloc_domain(DOMID_IO);
208 BUG_ON(dom_io == NULL);
209 }
211 // heavily depends on the struct page_info layout.
212 // if (page_get_owner(page) == d &&
213 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
214 // put_page(page);
215 // }
216 static void
217 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
218 {
219 u32 _d, _nd;
220 u64 x, nx, y;
222 _d = pickle_domptr(d);
223 y = *((u64*)&page->count_info);
224 do {
225 x = y;
226 _nd = x >> 32;
227 nx = x - 1;
228 __clear_bit(_PGC_allocated, &nx);
230 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
231 struct domain* nd = unpickle_domptr(_nd);
232 if (nd == NULL) {
233 gdprintk(XENLOG_INFO, "gnttab_transfer: "
234 "Bad page %p: ed=%p(%u) 0x%x, "
235 "sd=%p 0x%x,"
236 " caf=%016lx, taf=%" PRtype_info "\n",
237 (void *) page_to_mfn(page),
238 d, d->domain_id, _d,
239 nd, _nd,
240 x,
241 page->u.inuse.type_info);
242 }
243 break;
244 }
246 BUG_ON((nx & PGC_count_mask) < 1);
247 y = cmpxchg((u64*)&page->count_info, x, nx);
248 } while (unlikely(y != x));
249 }
251 static void
252 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
253 {
254 pte_t old_pte;
255 unsigned long mfn;
256 struct page_info* page;
258 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
260 // vmx domains use bits [58:56] to distinguish io regions from memory.
261 // see vmx_build_physmap_table() in vmx_init.c
262 if (!pte_mem(old_pte))
263 return;
265 // domain might map IO space or acpi table pages. check it.
266 mfn = pte_pfn(old_pte);
267 if (!mfn_valid(mfn))
268 return;
269 page = mfn_to_page(mfn);
270 // struct page_info corresponding to mfn may exist or not depending
271 // on CONFIG_VIRTUAL_FRAME_TABLE.
272 // This check is too simplistic.
273 // The right way is to check whether this page belongs to the io area or acpi pages.
274 if (page_get_owner(page) == NULL) {
275 BUG_ON(page->count_info != 0);
276 return;
277 }
279 if (pte_pgc_allocated(old_pte)) {
280 BUG_ON(page_get_owner(page) != d);
281 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
282 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
283 try_to_clear_PGC_allocate(d, page);
284 }
285 put_page(page);
286 }
288 static void
289 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
290 {
291 unsigned long i;
292 volatile pte_t* pte = pte_offset_map(pmd, offset);
294 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
295 if (!pte_present(*pte)) // acquire semantics
296 continue;
297 mm_teardown_pte(d, pte, offset + (i << PAGE_SHIFT));
298 }
299 }
301 static void
302 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
303 {
304 unsigned long i;
305 volatile pmd_t *pmd = pmd_offset(pud, offset);
307 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
308 if (!pmd_present(*pmd)) // acquire semantics
309 continue;
310 mm_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
311 }
312 }
314 static void
315 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
316 {
317 unsigned long i;
318 volatile pud_t *pud = pud_offset(pgd, offset);
320 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
321 if (!pud_present(*pud)) // acquire semantics
322 continue;
323 mm_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
324 }
325 }
327 void
328 mm_teardown(struct domain* d)
329 {
330 struct mm_struct* mm = &d->arch.mm;
331 unsigned long i;
332 volatile pgd_t* pgd;
334 if (mm->pgd == NULL)
335 return;
337 pgd = pgd_offset(mm, 0);
338 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
339 if (!pgd_present(*pgd)) // acquire semantics
340 continue;
341 mm_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
342 }
343 }
345 static void
346 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
347 unsigned long offset)
348 {
349 pte_free_kernel(pte_offset_map(pmd, offset));
350 }
352 static void
353 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
354 unsigned long offset)
355 {
356 unsigned long i;
357 volatile pmd_t *pmd = pmd_offset(pud, offset);
359 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
360 if (!pmd_present(*pmd))
361 continue;
362 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
363 }
364 pmd_free(pmd_offset(pud, offset));
365 }
367 static void
368 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
369 unsigned long offset)
370 {
371 unsigned long i;
372 volatile pud_t *pud = pud_offset(pgd, offset);
374 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
375 if (!pud_present(*pud))
376 continue;
377 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
378 }
379 pud_free(pud_offset(pgd, offset));
380 }
382 static void
383 mm_p2m_teardown(struct domain* d)
384 {
385 struct mm_struct* mm = &d->arch.mm;
386 unsigned long i;
387 volatile pgd_t* pgd;
389 BUG_ON(mm->pgd == NULL);
390 pgd = pgd_offset(mm, 0);
391 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
392 if (!pgd_present(*pgd))
393 continue;
394 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
395 }
396 pgd_free(mm->pgd);
397 mm->pgd = NULL;
398 }
400 void
401 mm_final_teardown(struct domain* d)
402 {
403 if (d->arch.shadow_bitmap != NULL) {
404 xfree(d->arch.shadow_bitmap);
405 d->arch.shadow_bitmap = NULL;
406 }
407 mm_p2m_teardown(d);
408 }
410 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
411 void
412 share_xen_page_with_guest(struct page_info *page,
413 struct domain *d, int readonly)
414 {
415 if ( page_get_owner(page) == d )
416 return;
418 #if 1
419 if (readonly) {
420 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
421 }
422 #endif
424 // alloc_xenheap_pages() doesn't initialize page owner.
425 //BUG_ON(page_get_owner(page) != NULL);
427 spin_lock(&d->page_alloc_lock);
429 #ifndef __ia64__
430 /* The incremented type count pins as writable or read-only. */
431 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
432 page->u.inuse.type_info |= PGT_validated | 1;
433 #endif
435 page_set_owner(page, d);
436 wmb(); /* install valid domain ptr before updating refcnt. */
437 ASSERT(page->count_info == 0);
439 /* Only add to the allocation list if the domain isn't dying. */
440 if ( !test_bit(_DOMF_dying, &d->domain_flags) )
441 {
442 page->count_info |= PGC_allocated | 1;
443 if ( unlikely(d->xenheap_pages++ == 0) )
444 get_knownalive_domain(d);
445 list_add_tail(&page->list, &d->xenpage_list);
446 }
448 // grant_table_destroy() releases these pages,
449 // but it doesn't clear their m2p entries. So stale entries might remain;
450 // such a stale entry is cleared here.
451 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
453 spin_unlock(&d->page_alloc_lock);
454 }
456 void
457 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
458 {
459 share_xen_page_with_guest(page, dom_xen, readonly);
460 }
462 unsigned long
463 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
464 {
465 unsigned long pte;
467 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
468 if (!pte) {
469 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
470 }
471 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
472 }
474 // given a domain virtual address, pte and page size, extract the metaphysical
475 // address, convert the pte to a physical address for the (possibly different)
476 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB inserts should use
477 // PAGE_SIZE!)
478 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
479 struct p2m_entry* entry)
480 {
481 struct domain *d = current->domain;
482 ia64_itir_t itir = {.itir = itir__};
483 u64 mask, mpaddr, pteval2;
484 u64 arflags;
485 u64 arflags2;
486 u64 maflags2;
488 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
490 // FIXME address had better be pre-validated on insert
491 mask = ~itir_mask(itir.itir);
492 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
494 if (itir.ps > PAGE_SHIFT)
495 itir.ps = PAGE_SHIFT;
497 *logps = itir.ps;
499 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
501 /* Check access rights. */
502 arflags = pteval & _PAGE_AR_MASK;
503 arflags2 = pteval2 & _PAGE_AR_MASK;
504 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
505 #if 0
506 dprintk(XENLOG_WARNING,
507 "%s:%d "
508 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
509 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
510 __func__, __LINE__,
511 pteval, arflags, address, itir__,
512 pteval2, arflags2, mpaddr);
513 #endif
514 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
515 }
517 /* Check memory attribute. The switch is on the *requested* memory
518 attribute. */
519 maflags2 = pteval2 & _PAGE_MA_MASK;
520 switch (pteval & _PAGE_MA_MASK) {
521 case _PAGE_MA_NAT:
522 /* NaT pages are always accepted! */
523 break;
524 case _PAGE_MA_UC:
525 case _PAGE_MA_UCE:
526 case _PAGE_MA_WC:
527 if (maflags2 == _PAGE_MA_WB) {
528 /* Don't let domains WB-map uncached addresses.
529 This can happen when domU tries to touch i/o
530 port space. Also prevents possible address
531 aliasing issues. */
532 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE))
533 gdprintk(XENLOG_WARNING, "Warning: UC to WB "
534 "for mpaddr=%lx\n", mpaddr);
535 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
536 }
537 break;
538 case _PAGE_MA_WB:
539 if (maflags2 != _PAGE_MA_WB) {
540 /* Forbid non-coherent access to coherent memory. */
541 panic_domain(NULL, "try to use WB mem attr on "
542 "UC page, mpaddr=%lx\n", mpaddr);
543 }
544 break;
545 default:
546 panic_domain(NULL, "try to use unknown mem attribute\n");
547 }
549 /* If shadow mode is enabled, virtualize dirty bit. */
550 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
551 u64 mp_page = mpaddr >> PAGE_SHIFT;
552 pteval |= _PAGE_VIRT_D;
554 /* If the page is not already dirty, don't set the dirty bit! */
555 if (mp_page < d->arch.shadow_bitmap_size * 8
556 && !test_bit(mp_page, d->arch.shadow_bitmap))
557 pteval &= ~_PAGE_D;
558 }
560 /* Ignore non-addr bits of pteval2 and force PL0->2
561 (PL3 is unaffected) */
562 return (pteval & ~_PAGE_PPN_MASK) |
563 (pteval2 & _PAGE_PPN_MASK) | _PAGE_PL_2;
564 }
566 // given a current domain metaphysical address, return the physical address
567 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
568 struct p2m_entry* entry)
569 {
570 unsigned long pteval;
572 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
573 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
574 }
576 //XXX !xxx_present() should be used instead of !xxx_none()?
577 // pud, pmd and pte pages are zero-cleared when they are allocated.
578 // Their contents must be visible before population, so the
579 // cmpxchg must have release semantics.
580 static volatile pte_t*
581 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
582 {
583 struct mm_struct *mm = &d->arch.mm;
584 volatile pgd_t *pgd;
585 volatile pud_t *pud;
586 volatile pmd_t *pmd;
588 BUG_ON(mm->pgd == NULL);
590 pgd = pgd_offset(mm, mpaddr);
591 again_pgd:
592 if (unlikely(pgd_none(*pgd))) { // acquire semantics
593 pud_t *old_pud = NULL;
594 pud = pud_alloc_one(mm, mpaddr);
595 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
596 pud_free(pud);
597 goto again_pgd;
598 }
599 }
601 pud = pud_offset(pgd, mpaddr);
602 again_pud:
603 if (unlikely(pud_none(*pud))) { // acquire semantics
604 pmd_t* old_pmd = NULL;
605 pmd = pmd_alloc_one(mm, mpaddr);
606 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
607 pmd_free(pmd);
608 goto again_pud;
609 }
610 }
612 pmd = pmd_offset(pud, mpaddr);
613 again_pmd:
614 if (unlikely(pmd_none(*pmd))) { // acquire semantics
615 pte_t* old_pte = NULL;
616 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
617 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
618 pte_free_kernel(pte);
619 goto again_pmd;
620 }
621 }
623 return pte_offset_map(pmd, mpaddr);
624 }
626 //XXX xxx_none() should be used instead of !xxx_present()?
627 volatile pte_t*
628 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
629 {
630 struct mm_struct *mm = &d->arch.mm;
631 volatile pgd_t *pgd;
632 volatile pud_t *pud;
633 volatile pmd_t *pmd;
635 BUG_ON(mm->pgd == NULL);
636 pgd = pgd_offset(mm, mpaddr);
637 if (unlikely(!pgd_present(*pgd))) // acquire semantics
638 return NULL;
640 pud = pud_offset(pgd, mpaddr);
641 if (unlikely(!pud_present(*pud))) // acquire semantics
642 return NULL;
644 pmd = pmd_offset(pud, mpaddr);
645 if (unlikely(!pmd_present(*pmd))) // acquire semantics
646 return NULL;
648 return pte_offset_map(pmd, mpaddr);
649 }
651 static volatile pte_t*
652 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
653 {
654 struct mm_struct *mm = &d->arch.mm;
655 volatile pgd_t *pgd;
656 volatile pud_t *pud;
657 volatile pmd_t *pmd;
659 BUG_ON(mm->pgd == NULL);
660 pgd = pgd_offset(mm, mpaddr);
661 if (unlikely(pgd_none(*pgd))) // acquire semantics
662 return NULL;
664 pud = pud_offset(pgd, mpaddr);
665 if (unlikely(pud_none(*pud))) // acquire semantics
666 return NULL;
668 pmd = pmd_offset(pud, mpaddr);
669 if (unlikely(pmd_none(*pmd))) // acquire semantics
670 return NULL;
672 return pte_offset_map(pmd, mpaddr);
673 }
675 unsigned long
676 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
677 {
678 volatile pte_t *pte;
680 pte = lookup_noalloc_domain_pte(d, mpaddr);
681 if (pte == NULL)
682 return INVALID_MFN;
684 if (pte_present(*pte))
685 return (pte->pte & _PFN_MASK);
686 else if (VMX_DOMAIN(d->vcpu[0]))
687 return GPFN_INV_MASK;
688 return INVALID_MFN;
689 }
691 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
692 struct p2m_entry* entry)
693 {
694 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
696 if (pte != NULL) {
697 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
698 if (pte_present(tmp_pte)) {
699 if (entry != NULL)
700 p2m_entry_set(entry, pte, tmp_pte);
701 return pte_val(tmp_pte);
702 } else if (VMX_DOMAIN(d->vcpu[0]))
703 return GPFN_INV_MASK;
704 }
706 if (mpaddr < d->arch.convmem_end) {
707 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
708 "0x%lx (< 0x%lx)\n", current->vcpu_id, PSCB(current, iip),
709 mpaddr, d->arch.convmem_end);
710 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
711 /* Log I/O port probing, but complain less loudly about it */
712 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
713 "0x%lx\n ", current->vcpu_id, PSCB(current, iip),
714 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
715 } else {
716 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa 0x%lx "
717 "(=> 0x%lx)\n", current->vcpu_id, PSCB(current, iip),
718 mpaddr, d->arch.convmem_end);
719 }
721 if (entry != NULL)
722 p2m_entry_set(entry, NULL, __pte(0));
723 //XXX This is a workaround until emulation of memory accesses to a region
724 // where memory or a device is attached is implemented.
725 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
726 }
728 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
729 #if 1
730 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
731 {
732 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
733 unsigned long imva;
735 pte &= _PAGE_PPN_MASK;
736 imva = (unsigned long) __va(pte);
737 imva |= mpaddr & ~PAGE_MASK;
738 return (void*)imva;
739 }
740 #else
741 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
742 {
743 unsigned long imva = __gpa_to_mpa(d, mpaddr);
745 return (void *)__va(imva);
746 }
747 #endif
749 unsigned long
750 xencomm_paddr_to_maddr(unsigned long paddr)
751 {
752 struct vcpu *v = current;
753 struct domain *d = v->domain;
754 u64 pa;
756 pa = ____lookup_domain_mpa(d, paddr);
757 if (pa == INVALID_MFN) {
758 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
759 __func__, paddr, vcpu_regs(v)->cr_iip);
760 return 0;
761 }
762 return __va_ul((pa & _PFN_MASK) | (paddr & ~PAGE_MASK));
763 }
765 /* Allocate a new page for domain and map it to the specified metaphysical
766 address. */
767 static struct page_info *
768 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
769 volatile pte_t* pte)
770 {
771 struct page_info *p;
772 unsigned long maddr;
773 int ret;
775 BUG_ON(!pte_none(*pte));
777 p = alloc_domheap_page(d);
778 if (unlikely(!p)) {
779 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
780 return(p);
781 }
783 // zero out pages for security reasons
784 clear_page(page_to_virt(p));
785 maddr = page_to_maddr (p);
786 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
787 && maddr < __get_cpu_var(vhpt_pend))) {
788 /* FIXME: how can this happen ?
789 vhpt is allocated by alloc_domheap_page. */
790 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
791 maddr);
792 }
794 ret = get_page(p, d);
795 BUG_ON(ret == 0);
796 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
797 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
798 // because set_pte_rel() has release semantics
799 set_pte_rel(pte,
800 pfn_pte(maddr >> PAGE_SHIFT,
801 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
802 _PAGE_PL_2 | _PAGE_AR_RWX)));
804 smp_mb();
805 return p;
806 }
808 struct page_info *
809 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
810 {
811 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
813 if (!pte_none(*pte))
814 return NULL;
816 return __assign_new_domain_page(d, mpaddr, pte);
817 }
819 void
820 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
821 {
822 volatile pte_t *pte;
824 BUG_ON(d != dom0);
825 pte = lookup_alloc_domain_pte(d, mpaddr);
826 if (pte_none(*pte)) {
827 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
828 if (p == NULL) {
829 panic("%s: can't allocate page for dom0", __func__);
830 }
831 }
832 }
834 static unsigned long
835 flags_to_prot (unsigned long flags)
836 {
837 unsigned long res = _PAGE_PL_2 | __DIRTY_BITS;
839 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
840 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
841 #ifdef CONFIG_XEN_IA64_TLB_TRACK
842 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
843 #endif
844 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
846 return res;
847 }
849 /* map a physical address to the specified metaphysical addr */
850 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
851 // This is called by assign_domain_mmio_page().
852 // So accesses to the pte are racy.
853 int
854 __assign_domain_page(struct domain *d,
855 unsigned long mpaddr, unsigned long physaddr,
856 unsigned long flags)
857 {
858 volatile pte_t *pte;
859 pte_t old_pte;
860 pte_t new_pte;
861 pte_t ret_pte;
862 unsigned long prot = flags_to_prot(flags);
864 pte = lookup_alloc_domain_pte(d, mpaddr);
866 old_pte = __pte(0);
867 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
868 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
869 if (pte_val(ret_pte) == pte_val(old_pte)) {
870 smp_mb();
871 return 0;
872 }
874 // dom0 tried to map the real machine's I/O region, but failed.
875 // It is very likely that dom0 won't boot correctly because
876 // it can't access I/O. So complain here.
877 if ((flags & ASSIGN_nocache) &&
878 (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT) ||
879 !(pte_val(ret_pte) & _PAGE_MA_UC)))
880 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
881 "\talready assigned pte_val 0x%016lx\n"
882 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
883 __func__, __LINE__,
884 d, d->domain_id, pte_val(ret_pte),
885 mpaddr, physaddr, flags);
887 return -EAGAIN;
888 }
890 /* get_page() and map a physical address to the specified metaphysical addr */
891 void
892 assign_domain_page(struct domain *d,
893 unsigned long mpaddr, unsigned long physaddr)
894 {
895 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
896 int ret;
898 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
899 ret = get_page(page, d);
900 BUG_ON(ret == 0);
901 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
902 // because __assign_domain_page() uses set_pte_rel() which has
903 // release semantics, smp_mb() isn't needed.
904 (void)__assign_domain_page(d, mpaddr, physaddr,
905 ASSIGN_writable | ASSIGN_pgc_allocated);
906 }
908 int
909 ioports_permit_access(struct domain *d, unsigned long fp, unsigned long lp)
910 {
911 int ret;
912 unsigned long off;
913 unsigned long fp_offset;
914 unsigned long lp_offset;
916 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
917 if (ret != 0)
918 return ret;
920 /* Domain 0 doesn't virtualize IO ports space. */
921 if (d == dom0)
922 return 0;
924 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
925 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
927 for (off = fp_offset; off <= lp_offset; off += PAGE_SIZE)
928 (void)__assign_domain_page(d, IO_PORTS_PADDR + off,
929 __pa(ia64_iobase) + off, ASSIGN_nocache);
931 return 0;
932 }
934 static int
935 ioports_has_allowed(struct domain *d, unsigned long fp, unsigned long lp)
936 {
937 unsigned long i;
938 for (i = fp; i < lp; i++)
939 if (rangeset_contains_singleton(d->arch.ioport_caps, i))
940 return 1;
941 return 0;
942 }
944 int
945 ioports_deny_access(struct domain *d, unsigned long fp, unsigned long lp)
946 {
947 int ret;
948 struct mm_struct *mm = &d->arch.mm;
949 unsigned long off;
950 unsigned long io_ports_base;
951 unsigned long fp_offset;
952 unsigned long lp_offset;
954 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
955 if (ret != 0)
956 return ret;
957 if (d == dom0)
958 io_ports_base = __pa(ia64_iobase);
959 else
960 io_ports_base = IO_PORTS_PADDR;
962 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
963 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
965 for (off = fp_offset; off < lp_offset; off += PAGE_SIZE) {
966 unsigned long mpaddr = io_ports_base + off;
967 unsigned long port;
968 volatile pte_t *pte;
969 pte_t old_pte;
971 port = IO_SPACE_SPARSE_DECODING (off);
972 if (port < fp || port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1 > lp) {
973 /* Maybe this covers an allowed port. */
974 if (ioports_has_allowed(d, port,
975 port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1))
976 continue;
977 }
979 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
980 BUG_ON(pte == NULL);
981 BUG_ON(pte_none(*pte));
983 // clear pte
984 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
985 }
986 domain_flush_vtlb_all(d);
987 return 0;
988 }
990 static void
991 assign_domain_same_page(struct domain *d,
992 unsigned long mpaddr, unsigned long size,
993 unsigned long flags)
994 {
995 //XXX optimization
996 unsigned long end = PAGE_ALIGN(mpaddr + size);
997 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
998 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
999 }
1002 int
1003 efi_mmio(unsigned long physaddr, unsigned long size)
1005 void *efi_map_start, *efi_map_end;
1006 u64 efi_desc_size;
1007 void* p;
1009 efi_map_start = __va(ia64_boot_param->efi_memmap);
1010 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1011 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1013 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1014 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1015 unsigned long start = md->phys_addr;
1016 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1018 if (start <= physaddr && physaddr < end) {
1019 if ((physaddr + size) > end) {
1020 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1021 __func__, physaddr, size);
1022 return 0;
1025 // for io space
1026 if (md->type == EFI_MEMORY_MAPPED_IO ||
1027 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1028 return 1;
1031 // for runtime
1032 // see efi_enter_virtual_mode(void)
1033 // in linux/arch/ia64/kernel/efi.c
1034 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1035 !(md->attribute & EFI_MEMORY_WB)) {
1036 return 1;
1039 return 0;
1042 if (physaddr < start) {
1043 break;
1047 return 1;
1050 unsigned long
1051 assign_domain_mmio_page(struct domain *d,
1052 unsigned long mpaddr, unsigned long size)
1054 if (size == 0) {
1055 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1056 __func__, d, mpaddr, size);
1058 if (!efi_mmio(mpaddr, size)) {
1059 #ifndef NDEBUG
1060 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1061 __func__, d, mpaddr, size);
1062 #endif
1063 return -EINVAL;
1065 assign_domain_same_page(d, mpaddr, size, ASSIGN_writable | ASSIGN_nocache);
1066 return mpaddr;
1069 unsigned long
1070 assign_domain_mach_page(struct domain *d,
1071 unsigned long mpaddr, unsigned long size,
1072 unsigned long flags)
1074 BUG_ON(flags & ASSIGN_pgc_allocated);
1075 assign_domain_same_page(d, mpaddr, size, flags);
1076 return mpaddr;
1079 static void
1080 domain_put_page(struct domain* d, unsigned long mpaddr,
1081 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1083 unsigned long mfn = pte_pfn(old_pte);
1084 struct page_info* page = mfn_to_page(mfn);
1086 if (pte_pgc_allocated(old_pte)) {
1087 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1088 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1089 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1090 } else {
1091 BUG();
1094 if (clear_PGC_allocate)
1095 try_to_clear_PGC_allocate(d, page);
1097 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1100 // The caller must get_page(mfn_to_page(mfn)) before calling this.
1101 // The caller must call set_gpfn_from_mfn() beforehand if necessary;
1102 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1103 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1104 // flags: ASSIGN_xxx
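// A typical caller therefore follows this sketch (compare
// guest_physmap_add_page() further below; d, gpfn and mfn are the caller's):
//   if (get_page(mfn_to_page(mfn), d) == 0)
//       BUG();
//   set_gpfn_from_mfn(mfn, gpfn);
//   smp_mb();
//   assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);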
1105 static void
1106 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1107 unsigned long mfn, unsigned long flags)
1109 struct mm_struct *mm = &d->arch.mm;
1110 volatile pte_t* pte;
1111 pte_t old_pte;
1112 pte_t npte;
1113 unsigned long prot = flags_to_prot(flags);
1115 pte = lookup_alloc_domain_pte(d, mpaddr);
1117 // update pte
1118 npte = pfn_pte(mfn, __pgprot(prot));
1119 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1120 if (pte_mem(old_pte)) {
1121 unsigned long old_mfn = pte_pfn(old_pte);
1123 // The mfn == old_mfn case can happen when a domain maps a granted page
1124 // twice with the same pseudo physical address.
1125 // It's nonsense, but it is allowed.
1126 // __gnttab_map_grant_ref()
1127 // => create_host_mapping()
1128 // => assign_domain_page_replace()
1129 if (mfn != old_mfn) {
1130 domain_put_page(d, mpaddr, pte, old_pte, 1);
1133 perfc_incrc(assign_domain_page_replace);
1136 // caller must get_page(new_page) before
1137 // Only steal_page() calls this function.
1138 static int
1139 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1140 struct page_info* old_page,
1141 struct page_info* new_page,
1142 unsigned long flags)
1144 struct mm_struct *mm = &d->arch.mm;
1145 volatile pte_t* pte;
1146 unsigned long old_mfn;
1147 unsigned long old_prot;
1148 pte_t old_pte;
1149 unsigned long new_mfn;
1150 unsigned long new_prot;
1151 pte_t new_pte;
1152 pte_t ret_pte;
1154 pte = lookup_alloc_domain_pte(d, mpaddr);
1156 again:
1157 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1158 old_mfn = page_to_mfn(old_page);
1159 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1160 if (!pte_present(old_pte)) {
1161 gdprintk(XENLOG_INFO,
1162 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1163 __func__, pte_val(old_pte), old_prot, old_mfn);
1164 return -EINVAL;
1167 new_prot = flags_to_prot(flags);
1168 new_mfn = page_to_mfn(new_page);
1169 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1171 // update pte
1172 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1173 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1174 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1175 goto again;
1178 gdprintk(XENLOG_INFO,
1179 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1180 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1181 __func__,
1182 pte_val(old_pte), old_prot, old_mfn,
1183 pte_val(ret_pte), pte_pfn(ret_pte));
1184 return -EINVAL;
1187 BUG_ON(!pte_mem(old_pte));
1188 BUG_ON(!pte_pgc_allocated(old_pte));
1189 BUG_ON(page_get_owner(old_page) != d);
1190 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1191 BUG_ON(old_mfn == new_mfn);
1193 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1195 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1196 perfc_incrc(assign_domain_pge_cmpxchg_rel);
1197 return 0;
1200 static void
1201 zap_domain_page_one(struct domain *d, unsigned long mpaddr, unsigned long mfn)
1203 struct mm_struct *mm = &d->arch.mm;
1204 volatile pte_t *pte;
1205 pte_t old_pte;
1206 struct page_info *page;
1208 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1209 if (pte == NULL)
1210 return;
1211 if (pte_none(*pte))
1212 return;
1214 if (mfn == INVALID_MFN) {
1215 // clear pte
1216 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1217 mfn = pte_pfn(old_pte);
1218 } else {
1219 unsigned long old_arflags;
1220 pte_t new_pte;
1221 pte_t ret_pte;
1223 again:
1224 // memory_exchange() calls guest_physmap_remove_page() with
1225 // a stolen page, i.e. page owner == NULL.
1226 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1227 page_get_owner(mfn_to_page(mfn)) != NULL);
1228 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1229 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1230 new_pte = __pte(0);
1232 // update pte
1233 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1234 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1235 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1236 goto again;
1239 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1240 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1241 __func__,
1242 pte_val(old_pte), old_arflags, mfn,
1243 pte_val(ret_pte), pte_pfn(ret_pte));
1244 return;
1246 BUG_ON(mfn != pte_pfn(ret_pte));
1249 page = mfn_to_page(mfn);
1250 BUG_ON((page->count_info & PGC_count_mask) == 0);
1252 // exchange_memory() calls
1253 // steal_page()
1254 // page owner is set to NULL
1255 // guest_physmap_remove_page()
1256 // zap_domain_page_one()
1257 domain_put_page(d, mpaddr, pte, old_pte, (page_get_owner(page) != NULL));
1258 perfc_incrc(zap_dcomain_page_one);
1261 unsigned long
1262 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1263 unsigned int extent_order)
1265 if (extent_order != 0) {
1266 //XXX
1267 return -ENOSYS;
1270 zap_domain_page_one(d, gpfn << PAGE_SHIFT, INVALID_MFN);
1271 perfc_incrc(dom0vp_zap_physmap);
1272 return 0;
1275 static unsigned long
1276 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1277 unsigned long mfn_or_gmfn,
1278 unsigned long flags, domid_t domid, int is_gmfn)
1280 int error = -EINVAL;
1281 struct domain* rd;
1282 unsigned long mfn;
1284 /* Not allowed by a domain. */
1285 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1286 return -EINVAL;
1288 rd = get_domain_by_id(domid);
1289 if (unlikely(rd == NULL)) {
1290 switch (domid) {
1291 case DOMID_XEN:
1292 rd = dom_xen;
1293 break;
1294 case DOMID_IO:
1295 rd = dom_io;
1296 break;
1297 default:
1298 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1299 "pgfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1300 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1301 return -ESRCH;
1303 BUG_ON(rd == NULL);
1304 get_knownalive_domain(rd);
1307 if (unlikely(rd == d))
1308 goto out1;
1309 /*
1310 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1311 * It can be considered that their p2m conversion is p==m.
1312 */
1313 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1314 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1315 else
1316 mfn = mfn_or_gmfn;
1317 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1318 goto out1;
1320 error = 0;
1321 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1322 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1323 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1324 //don't update p2m table because this page belongs to rd, not d.
1325 perfc_incrc(dom0vp_add_physmap);
1326 out1:
1327 put_domain(rd);
1328 return error;
1331 unsigned long
1332 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1333 unsigned long flags, domid_t domid)
1335 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1338 unsigned long
1339 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1340 unsigned long gmfn, unsigned long flags,
1341 domid_t domid)
1343 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1346 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1347 static struct page_info* p2m_pte_zero_page = NULL;
1349 void
1350 expose_p2m_init(void)
1352 pte_t* pte;
1354 pte = pte_alloc_one_kernel(NULL, 0);
1355 BUG_ON(pte == NULL);
1356 smp_mb();// make contents of the page visible.
1357 p2m_pte_zero_page = virt_to_page(pte);
1360 static int
1361 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1363 // we can't get_page(page) here.
1364 // The pte page is allocated from the xen heap (see pte_alloc_one_kernel()),
1365 // so the page has a NULL page owner and its reference count
1366 // is useless.
1367 // see also mm_teardown_pte()'s page_get_owner() == NULL check.
1368 BUG_ON(page_get_owner(page) != NULL);
1370 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1371 ASSIGN_readonly);
1374 // It is possible to optimize the loop, but this isn't performance critical.
1375 unsigned long
1376 dom0vp_expose_p2m(struct domain* d,
1377 unsigned long conv_start_gpfn,
1378 unsigned long assign_start_gpfn,
1379 unsigned long expose_size, unsigned long granule_pfn)
1381 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1382 unsigned long i;
1383 volatile pte_t* conv_pte;
1384 volatile pte_t* assign_pte;
1386 if ((expose_size % PAGE_SIZE) != 0 ||
1387 (granule_pfn % PTRS_PER_PTE) != 0 ||
1388 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1389 (conv_start_gpfn % granule_pfn) != 0 ||
1390 (assign_start_gpfn % granule_pfn) != 0 ||
1391 (expose_num_pfn % granule_pfn) != 0) {
1392 gdprintk(XENLOG_INFO,
1393 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1394 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1395 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1396 return -EINVAL;
1399 if (granule_pfn != PTRS_PER_PTE) {
1400 gdprintk(XENLOG_INFO,
1401 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1402 __func__, granule_pfn, PTRS_PER_PTE);
1403 return -ENOSYS;
1406 // allocate pgd, pmd.
1407 i = conv_start_gpfn;
1408 while (i < expose_num_pfn) {
1409 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1410 PAGE_SHIFT);
1411 if (conv_pte == NULL) {
1412 i++;
1413 continue;
1416 assign_pte = lookup_alloc_domain_pte(d, (assign_start_gpfn <<
1417 PAGE_SHIFT) + i * sizeof(pte_t));
1418 if (assign_pte == NULL) {
1419 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n", __func__);
1420 return -ENOMEM;
1423 // skip to next pte page
1424 i += PTRS_PER_PTE;
1425 i &= ~(PTRS_PER_PTE - 1);
1428 // expose pte page
1429 i = 0;
1430 while (i < expose_num_pfn) {
1431 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1432 PAGE_SHIFT);
1433 if (conv_pte == NULL) {
1434 i++;
1435 continue;
1438 if (expose_p2m_page(d, (assign_start_gpfn << PAGE_SHIFT) +
1439 i * sizeof(pte_t), virt_to_page(conv_pte)) < 0) {
1440 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1441 return -EAGAIN;
1444 // skip to next pte page
1445 i += PTRS_PER_PTE;
1446 i &= ~(PTRS_PER_PTE - 1);
1449 // expose p2m_pte_zero_page
1450 for (i = 0; i < expose_num_pfn / PTRS_PER_PTE + 1; i++) {
1451 assign_pte = lookup_noalloc_domain_pte(d, (assign_start_gpfn + i) <<
1452 PAGE_SHIFT);
1453 if (assign_pte == NULL || pte_present(*assign_pte))
1454 continue;
1456 if (expose_p2m_page(d, (assign_start_gpfn + i) << PAGE_SHIFT,
1457 p2m_pte_zero_page) < 0) {
1458 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n", __func__);
1459 return -EAGAIN;
1463 return 0;
1465 #endif
1467 // grant table host mapping
1468 // mpaddr: host_addr: pseudo physical address
1469 // mfn: frame: machine page frame
1470 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1471 int
1472 create_grant_host_mapping(unsigned long gpaddr,
1473 unsigned long mfn, unsigned int flags)
1475 struct domain* d = current->domain;
1476 struct page_info* page;
1477 int ret;
1479 if (flags & (GNTMAP_device_map |
1480 GNTMAP_application_map | GNTMAP_contains_pte)) {
1481 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1482 return GNTST_general_error;
1485 BUG_ON(!mfn_valid(mfn));
1486 page = mfn_to_page(mfn);
1487 ret = get_page(page, page_get_owner(page));
1488 BUG_ON(ret == 0);
1489 assign_domain_page_replace(d, gpaddr, mfn,
1490 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1491 ASSIGN_tlb_track |
1492 #endif
1493 ((flags & GNTMAP_readonly) ?
1494 ASSIGN_readonly : ASSIGN_writable));
1495 perfc_incrc(create_grant_host_mapping);
1496 return GNTST_okay;
1499 // grant table host unmapping
1500 int
1501 destroy_grant_host_mapping(unsigned long gpaddr,
1502 unsigned long mfn, unsigned int flags)
1504 struct domain* d = current->domain;
1505 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
1506 volatile pte_t* pte;
1507 unsigned long cur_arflags;
1508 pte_t cur_pte;
1509 pte_t new_pte;
1510 pte_t old_pte;
1511 struct page_info* page = mfn_to_page(mfn);
1513 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1514 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1515 return GNTST_general_error;
1518 pte = lookup_noalloc_domain_pte(d, gpaddr);
1519 if (pte == NULL) {
1520 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
1521 __func__, gpaddr, mfn);
1522 return GNTST_general_error;
1525 again:
1526 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1527 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1528 if (!pte_present(cur_pte) ||
1529 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
1530 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1531 __func__, gpaddr, mfn, pte_val(cur_pte));
1532 return GNTST_general_error;
1534 new_pte = __pte(0);
1536 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1537 if (unlikely(!pte_present(old_pte))) {
1538 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx"
1539 " cur_pte 0x%lx old_pte 0x%lx\n",
1540 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1541 return GNTST_general_error;
1543 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1544 if (pte_pfn(old_pte) == mfn) {
1545 goto again;
1547 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
1548 "0x%lx old_pte 0x%lx\n",
1549 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1550 return GNTST_general_error;
1552 BUG_ON(pte_pfn(old_pte) != mfn);
1554 /* try_to_clear_PGC_allocate(d, page) is not needed. */
1555 BUG_ON(page_get_owner(page) == d &&
1556 get_gpfn_from_mfn(mfn) == gpfn);
1557 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
1559 perfc_incrc(destroy_grant_host_mapping);
1560 return GNTST_okay;
1563 // heavily depends on the struct page layout.
1564 // gnttab_transfer() calls steal_page() with memflags = 0
1565 // For grant table transfer, we must fill the page.
1566 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount
1567 // For memory exchange, we don't have to fill the page because
1568 // memory_exchange() does it.
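// Call-flow sketch (illustrative):
//   gnttab_transfer()  -> steal_page(d, page, 0)
//                         (a replacement page is allocated and installed below)
//   memory_exchange()  -> steal_page(d, page, MEMF_no_refcount)
//                         (the caller installs the replacement page itself)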
1569 int
1570 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1572 #if 0 /* if big endian */
1573 # error "implement big endian version of steal_page()"
1574 #endif
1575 u32 _d, _nd;
1576 u64 x, nx, y;
1578 if (page_get_owner(page) != d) {
1579 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
1580 __func__, d, page_get_owner(page));
1581 return -1;
1584 if (!(memflags & MEMF_no_refcount)) {
1585 unsigned long gpfn;
1586 struct page_info *new;
1587 unsigned long new_mfn;
1588 int ret;
1590 new = alloc_domheap_page(d);
1591 if (new == NULL) {
1592 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
1593 return -1;
1595 // zero out pages for security reasons
1596 clear_page(page_to_virt(new));
1597 // assign_domain_page_cmpxchg_rel() has release semantics
1598 // so smp_mb() isn't needed.
1600 ret = get_page(new, d);
1601 BUG_ON(ret == 0);
1603 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1604 if (gpfn == INVALID_M2P_ENTRY) {
1605 free_domheap_page(new);
1606 return -1;
1608 new_mfn = page_to_mfn(new);
1609 set_gpfn_from_mfn(new_mfn, gpfn);
1610 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1611 // has release semantics.
1613 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1614 ASSIGN_writable |
1615 ASSIGN_pgc_allocated);
1616 if (ret < 0) {
1617 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
1618 ret);
1619 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1620 free_domheap_page(new);
1621 return -1;
1623 perfc_incrc(steal_page_refcount);
1626 spin_lock(&d->page_alloc_lock);
1628 /*
1629 * The tricky bit: atomically release ownership while there is just one
1630 * benign reference to the page (PGC_allocated). If that reference
1631 * disappears then the deallocation routine will safely spin.
1632 */
1633 _d = pickle_domptr(d);
1634 y = *((u64*)&page->count_info);
1635 do {
1636 x = y;
1637 nx = x & 0xffffffff;
1638 // page->count_info: untouched
1639 // page->u.inused._domain = 0;
1640 _nd = x >> 32;
1642 if (
1643 // when !MEMF_no_refcount, page might be put_page()'d or
1644 // it will be put_page()'d later, depending on whether it is queued.
1645 unlikely(!(memflags & MEMF_no_refcount) &&
1646 ((x & (PGC_count_mask | PGC_allocated)) !=
1647 (1 | PGC_allocated))) ||
1648 // when MEMF_no_refcount, page isn't de-assigned from
1649 // this domain yet. So count_info = 2
1650 unlikely((memflags & MEMF_no_refcount) &&
1651 ((x & (PGC_count_mask | PGC_allocated)) !=
1652 (2 | PGC_allocated))) ||
1654 unlikely(_nd != _d)) {
1655 struct domain* nd = unpickle_domptr(_nd);
1656 if (nd == NULL) {
1657 gdprintk(XENLOG_INFO, "gnttab_transfer: "
1658 "Bad page %p: ed=%p(%u) 0x%x, "
1659 "sd=%p 0x%x,"
1660 " caf=%016lx, taf=%" PRtype_info
1661 " memflags 0x%x\n",
1662 (void *) page_to_mfn(page),
1663 d, d->domain_id, _d,
1664 nd, _nd,
1665 x,
1666 page->u.inuse.type_info,
1667 memflags);
1668 } else {
1669 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
1670 "Bad page %p: ed=%p(%u) 0x%x, "
1671 "sd=%p(%u) 0x%x,"
1672 " caf=%016lx, taf=%" PRtype_info
1673 " memflags 0x%x\n",
1674 (void *) page_to_mfn(page),
1675 d, d->domain_id, _d,
1676 nd, nd->domain_id, _nd,
1677 x,
1678 page->u.inuse.type_info,
1679 memflags);
1681 spin_unlock(&d->page_alloc_lock);
1682 return -1;
1685 y = cmpxchg((u64*)&page->count_info, x, nx);
1686 } while (unlikely(y != x));
1688 /*
1689 * Unlink from 'd'. At least one reference remains (now anonymous), so
1690 * no one else is spinning to try to delete this page from 'd'.
1691 */
1692 if ( !(memflags & MEMF_no_refcount) )
1693 d->tot_pages--;
1694 list_del(&page->list);
1696 spin_unlock(&d->page_alloc_lock);
1697 perfc_incrc(steal_page);
1698 return 0;
1701 void
1702 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1703 unsigned long mfn)
1705 int ret;
1707 BUG_ON(!mfn_valid(mfn));
1708 ret = get_page(mfn_to_page(mfn), d);
1709 BUG_ON(ret == 0);
1710 set_gpfn_from_mfn(mfn, gpfn);
1711 smp_mb();
1712 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
1713 ASSIGN_writable | ASSIGN_pgc_allocated);
1715 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1717 perfc_incrc(guest_physmap_add_page);
1720 void
1721 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1722 unsigned long mfn)
1724 BUG_ON(mfn == 0);//XXX
1725 zap_domain_page_one(d, gpfn << PAGE_SHIFT, mfn);
1726 perfc_incrc(guest_physmap_remove_page);
1729 static void
1730 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
1731 volatile pte_t* ptep, pte_t old_pte,
1732 struct page_info* page)
1734 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1735 struct tlb_track_entry* entry;
1736 #endif
1738 if (shadow_mode_enabled(d))
1739 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1741 #ifndef CONFIG_XEN_IA64_TLB_TRACK
1742 //XXX sledgehammer.
1743 // flush finer range.
1744 domain_flush_vtlb_all(d);
1745 put_page(page);
1746 #else
1747 switch (tlb_track_search_and_remove(d->arch.tlb_track,
1748 ptep, old_pte, &entry)) {
1749 case TLB_TRACK_NOT_TRACKED:
1750 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
1751 /* This page is zapped from this domain
1752 * by memory decrease or exchange or dom0vp_zap_physmap.
1753 * I.e. the page is zapped for returning this page to xen
1754 * (balloon driver or DMA page allocation) or
1755 * foreign domain mapped page is unmapped from the domain.
1756 * In the former case the page is to be freed so that
1757 * we can defer freeing page to batch.
1758 * In the latter case the page is unmapped so that
1759 * we need to flush it. But to optimize it, we
1760 * queue the page and flush vTLB only once.
1761 * I.e. The caller must call dfree_flush() explicitly.
1762 */
1763 domain_flush_vtlb_all(d);
1764 put_page(page);
1765 break;
1766 case TLB_TRACK_NOT_FOUND:
1767 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
1768 /* This page is zapped from this domain
1769 * by grant table page unmap.
1770 * Luckily the domain that mapped this page didn't
1771 * access this page so that we don't have to flush vTLB.
1772 * Probably the domain did only DMA.
1773 */
1774 /* do nothing */
1775 put_page(page);
1776 break;
1777 case TLB_TRACK_FOUND:
1778 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
1779 /* This page is zapped from this domain
1780 * by grant table page unmap.
1781 * Fortunately this page is accessed via only one virtual
1782 * memory address. So it is easy to flush it.
1783 */
1784 domain_flush_vtlb_track_entry(d, entry);
1785 tlb_track_free_entry(d->arch.tlb_track, entry);
1786 put_page(page);
1787 break;
1788 case TLB_TRACK_MANY:
1789 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
1790 /* This page is zapped from this domain
1791 * by grant table page unmap.
1792 * Unfortunately this page is accessed via many virtual
1793 * memory addresses (or too many times with a single virtual address),
1794 * so we have given up tracking the virtual addresses.
1795 * A full vTLB flush is necessary.
1796 */
1797 domain_flush_vtlb_all(d);
1798 put_page(page);
1799 break;
1800 case TLB_TRACK_AGAIN:
1801 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
1802 BUG();
1803 break;
1805 #endif
1806 perfc_incrc(domain_page_flush_and_put);
1809 int
1810 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1812 volatile pte_t * pte;
1814 pte = lookup_noalloc_domain_pte(d, mpaddr);
1815 if(pte != NULL && !pte_none(*pte))
1816 return 1;
1817 return 0;
1820 /* Flush cache of domain d. */
1821 void domain_cache_flush (struct domain *d, int sync_only)
1823 struct mm_struct *mm = &d->arch.mm;
1824 volatile pgd_t *pgd = mm->pgd;
1825 unsigned long maddr;
1826 int i,j,k, l;
1827 int nbr_page = 0;
1828 void (*flush_func)(unsigned long start, unsigned long end);
1829 extern void flush_dcache_range (unsigned long, unsigned long);
1831 if (sync_only)
1832 flush_func = &flush_icache_range;
1833 else
1834 flush_func = &flush_dcache_range;
1836 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1837 volatile pud_t *pud;
1838 if (!pgd_present(*pgd)) // acquire semantics
1839 continue;
1840 pud = pud_offset(pgd, 0);
1841 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1842 volatile pmd_t *pmd;
1843 if (!pud_present(*pud)) // acquire semantics
1844 continue;
1845 pmd = pmd_offset(pud, 0);
1846 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1847 volatile pte_t *pte;
1848 if (!pmd_present(*pmd)) // acquire semantics
1849 continue;
1850 pte = pte_offset_map(pmd, 0);
1851 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1852 if (!pte_present(*pte)) // acquire semantics
1853 continue;
1854 /* Convert PTE to maddr. */
1855 maddr = __va_ul (pte_val(*pte)
1856 & _PAGE_PPN_MASK);
1857 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1858 nbr_page++;
1863 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1866 #ifdef VERBOSE
1867 #define MEM_LOG(_f, _a...) \
1868 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1869 current->domain->domain_id , __LINE__ , ## _a )
1870 #else
1871 #define MEM_LOG(_f, _a...) ((void)0)
1872 #endif
1874 static void free_page_type(struct page_info *page, u32 type)
1878 static int alloc_page_type(struct page_info *page, u32 type)
1880 return 1;
1883 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1885 void *p = alloc_xenheap_pages(order);
1887 memset(p,0,PAGE_SIZE<<order);
1888 return (unsigned long)p;
1891 void __free_pages(struct page_info *page, unsigned int order)
1893 if (order) BUG();
1894 free_xenheap_page(page);
1897 void *pgtable_quicklist_alloc(void)
1899 void *p;
1900 p = alloc_xenheap_pages(0);
1901 if (p)
1902 clear_page(p);
1903 return p;
1906 void pgtable_quicklist_free(void *pgtable_entry)
1908 free_xenheap_page(pgtable_entry);
1911 void put_page_type(struct page_info *page)
1913 u32 nx, x, y = page->u.inuse.type_info;
1915 again:
1916 do {
1917 x = y;
1918 nx = x - 1;
1920 ASSERT((x & PGT_count_mask) != 0);
1922 /*
1923 * The page should always be validated while a reference is held. The
1924 * exception is during domain destruction, when we forcibly invalidate
1925 * page-table pages if we detect a referential loop.
1926 * See domain.c:relinquish_list().
1927 */
1928 ASSERT((x & PGT_validated) ||
1929 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1931 if ( unlikely((nx & PGT_count_mask) == 0) )
1933 /* Record TLB information for flush later. Races are harmless. */
1934 page->tlbflush_timestamp = tlbflush_current_time();
1936 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1937 likely(nx & PGT_validated) )
1939 /*
1940 * Page-table pages must be unvalidated when count is zero. The
1941 * 'free' is safe because the refcnt is non-zero and validated
1942 * bit is clear => other ops will spin or fail.
1943 */
1944 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1945 x & ~PGT_validated)) != x) )
1946 goto again;
1947 /* We cleared the 'valid bit' so we do the clean up. */
1948 free_page_type(page, x);
1949 /* Carry on, but with the 'valid bit' now clear. */
1950 x &= ~PGT_validated;
1951 nx &= ~PGT_validated;
1955 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1959 int get_page_type(struct page_info *page, u32 type)
1961 u32 nx, x, y = page->u.inuse.type_info;
1963 ASSERT(!(type & ~PGT_type_mask));
1965 again:
1966 do {
1967 x = y;
1968 nx = x + 1;
1969 if ( unlikely((nx & PGT_count_mask) == 0) )
1971 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1972 return 0;
1974 else if ( unlikely((x & PGT_count_mask) == 0) )
1976 if ( (x & PGT_type_mask) != type )
1978 /*
1979 * On type change we check to flush stale TLB entries. This
1980 * may be unnecessary (e.g., page was GDT/LDT) but those
1981 * circumstances should be very rare.
1982 */
1983 cpumask_t mask =
1984 page_get_owner(page)->domain_dirty_cpumask;
1985 tlbflush_filter(mask, page->tlbflush_timestamp);
1987 if ( unlikely(!cpus_empty(mask)) )
1989 perfc_incrc(need_flush_tlb_flush);
1990 flush_tlb_mask(mask);
1993 /* We lose existing type, back pointer, and validity. */
1994 nx &= ~(PGT_type_mask | PGT_validated);
1995 nx |= type;
1997 /* No special validation needed for writable pages. */
1998 /* Page tables and GDT/LDT need to be scanned for validity. */
1999 if ( type == PGT_writable_page )
2000 nx |= PGT_validated;
2003 else if ( unlikely((x & PGT_type_mask) != type) )
2005 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
2006 (type != PGT_l1_page_table) )
2007 MEM_LOG("Bad type (saw %08x != exp %08x) "
2008 "for mfn %016lx (pfn %016lx)",
2009 x, type, page_to_mfn(page),
2010 get_gpfn_from_mfn(page_to_mfn(page)));
2011 return 0;
2013 else if ( unlikely(!(x & PGT_validated)) )
2015 /* Someone else is updating validation of this page. Wait... */
2016 while ( (y = page->u.inuse.type_info) == x )
2017 cpu_relax();
2018 goto again;
2021 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
2023 if ( unlikely(!(nx & PGT_validated)) )
2025 /* Try to validate page type; drop the new reference on failure. */
2026 if ( unlikely(!alloc_page_type(page, type)) )
2028 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
2029 ": caf=%08x taf=%" PRtype_info,
2030 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2031 type, page->count_info, page->u.inuse.type_info);
2032 /* No one else can get a reference. We hold the only ref. */
2033 page->u.inuse.type_info = 0;
2034 return 0;
2037 /* No one else is updating simultaneously. */
2038 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2041 return 1;
2044 int memory_is_conventional_ram(paddr_t p)
2046 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
2050 long
2051 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2053 switch (op) {
2054 case XENMEM_add_to_physmap:
2056 struct xen_add_to_physmap xatp;
2057 unsigned long prev_mfn, mfn = 0, gpfn;
2058 struct domain *d;
2060 if (copy_from_guest(&xatp, arg, 1))
2061 return -EFAULT;
2063 if (xatp.domid == DOMID_SELF) {
2064 d = get_current_domain();
2066 else if (!IS_PRIV(current->domain))
2067 return -EPERM;
2068 else if ((d = get_domain_by_id(xatp.domid)) == NULL)
2069 return -ESRCH;
2071 /* This hypercall is used for VT-i domain only */
2072 if (!VMX_DOMAIN(d->vcpu[0])) {
2073 put_domain(d);
2074 return -ENOSYS;
2077 switch (xatp.space) {
2078 case XENMAPSPACE_shared_info:
2079 if (xatp.idx == 0)
2080 mfn = virt_to_mfn(d->shared_info);
2081 break;
2082 case XENMAPSPACE_grant_table:
2083 spin_lock(&d->grant_table->lock);
2084 if (xatp.idx < nr_grant_frames(d->grant_table))
2085 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2086 spin_unlock(&d->grant_table->lock);
2087 break;
2088 default:
2089 break;
2092 LOCK_BIGLOCK(d);
2094 /* Remove previously mapped page if it was present. */
2095 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2096 if (prev_mfn && mfn_valid(prev_mfn)) {
2097 if (IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)))
2098 /* Xen heap frames are simply unhooked from this phys slot. */
2099 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2100 else
2101 /* Normal domain memory is freed, to avoid leaking memory. */
2102 guest_remove_page(d, xatp.gpfn);
2105 /* Unmap from old location, if any. */
2106 gpfn = get_gpfn_from_mfn(mfn);
2107 if (gpfn != INVALID_M2P_ENTRY)
2108 guest_physmap_remove_page(d, gpfn, mfn);
2110 /* Map at new location. */
2111 guest_physmap_add_page(d, xatp.gpfn, mfn);
2113 UNLOCK_BIGLOCK(d);
2115 put_domain(d);
2117 break;
2120 default:
2121 return -ENOSYS;
2124 return 0;
2127 /*
2128 * Local variables:
2129 * mode: C
2130 * c-set-style: "BSD"
2131 * c-basic-offset: 4
2132 * tab-width: 4
2133 * indent-tabs-mode: nil
2134 * End:
2135 */