ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 14797:acf561f90822

[IA64] Formatting fix

This extra space after the newline causes printk to get confused,
re-evaluate do_print, and leave start_of_line == 0. Anything
printed after this will always print, regardless of intended
log level.

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author Alex Williamson <alex.williamson@hp.com>
date Thu Apr 12 10:24:53 2007 -0600 (2007-04-12)
parents 4b13fc910acf
children 8745300bec4e 23c4790512db
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure is accessed by
22 * any CPUs.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT of a vcpu which isn't the current one,
34 * so there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that reading VHPT is done by hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert domain pseudo physical address
54 * to machine address. This is per domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid when
90 * accessing shared structures. [SDM vol2 Part II chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table,
107 * i.e. M2P(P2M(p)) = p and P2M(M2P(m)) = m.
108 * The M2P table and the P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here memory barrier can be achieved by release semantics.
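* For illustration only, a rough sketch of this sequence in terms of helpers
* used later in this file (cf. guest_physmap_add_page() and
* assign_domain_page_replace(); error handling and the put_page of the old
* page are omitted, and 'prot' stands for a flags_to_prot() result):
*
*     set_gpfn_from_mfn(new_mfn, gpfn);                       // new m2p entry
*     smp_mb();
*     ptep = lookup_alloc_domain_pte(d, gpfn << PAGE_SHIFT);
*     old_pte = ptep_xchg(&d->arch.mm, gpfn << PAGE_SHIFT, ptep,
*                         pfn_pte(new_mfn, __pgprot(prot)));
*     // xchg has acquire semantics, which provides the second barrier
*     set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY);  // zap old m2p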
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Neither tlb insert (vcpu_itc_no_srlz()) nor global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can update
126 * vcpu->arch.{d, i}tlb, the VHPT and the mTLB atomically, so there is a race here.
127 *
128 * To handle this, the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert (sketched below).
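* Roughly (the actual insert/purge code lives outside this file, e.g. in
* vcpu_itc_no_srlz() and the domain_flush_vtlb_*() implementations; this is
* only a sketch of the idea):
*
*     do {
*         write the entry into vcpu->arch.{d,i}tlb / VHPT / machine TLB;
*     } while (!vcpu->arch.{d,i}tlb.p);  // a concurrent purge cleared .p => retry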
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and the
142 * tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value read from the p2m entry and inserts the tlb entry,
147 * then reads the p2m entry again. If the new p2m entry value is different
148 * from the value that was used, it retries (roughly sketched below).
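* Roughly, the reader side looks like this (lookup_domain_mpa() below records
* the pte pointer and the value it read in a struct p2m_entry precisely so
* that this recheck can be done):
*
*     again:
*         pteval = lookup_domain_mpa(d, mpaddr, &entry);
*         ... insert the tlb entry derived from pteval ...
*         if (the p2m entry no longer holds the value recorded in 'entry')
*             goto again;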
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented.
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do this.
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <xen/guest_access.h>
179 #include <asm/page.h>
180 #include <public/memory.h>
182 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
183 volatile pte_t* ptep, pte_t old_pte,
184 struct page_info* page);
186 extern unsigned long ia64_iobase;
188 static struct domain *dom_xen, *dom_io;
190 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
191 void
192 alloc_dom_xen_and_dom_io(void)
193 {
194 /*
195 * Initialise our DOMID_XEN domain.
196 * Any Xen-heap pages that we will allow to be mapped will have
197 * their domain field set to dom_xen.
198 */
199 dom_xen = alloc_domain(DOMID_XEN);
200 BUG_ON(dom_xen == NULL);
202 /*
203 * Initialise our DOMID_IO domain.
204 * This domain owns I/O pages that are within the range of the page_info
205 * array. Mappings occur at the privilege level of the caller.
206 */
207 dom_io = alloc_domain(DOMID_IO);
208 BUG_ON(dom_io == NULL);
209 }
211 static void
212 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
213 {
214 pte_t old_pte;
215 unsigned long mfn;
216 struct page_info* page;
218 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
220 // vmx domains use bits [58:56] to distinguish I/O regions from memory.
221 // see vmx_build_physmap_table() in vmx_init.c
222 if (!pte_mem(old_pte))
223 return;
225 // the domain might map I/O space or acpi table pages. check for that.
226 mfn = pte_pfn(old_pte);
227 if (!mfn_valid(mfn))
228 return;
229 page = mfn_to_page(mfn);
230 // the page might be a pte page used for p2m exposure. check for that.
231 if (page_get_owner(page) == NULL) {
232 BUG_ON(page->count_info != 0);
233 return;
234 }
235 // The struct page_info corresponding to mfn may or may not exist depending
236 // on CONFIG_VIRTUAL_FRAME_TABLE.
237 // The above check is too simplistic.
238 // The right way is to check whether this page belongs to an I/O area or to acpi pages.
240 if (pte_pgc_allocated(old_pte)) {
241 BUG_ON(page_get_owner(page) != d);
242 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
243 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
244 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
245 put_page(page);
246 } else {
247 put_page(page);
248 }
249 }
251 static void
252 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
253 {
254 unsigned long i;
255 volatile pte_t* pte = pte_offset_map(pmd, offset);
257 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
258 if (!pte_present(*pte)) // acquire semantics
259 continue;
260 mm_teardown_pte(d, pte, offset + (i << PAGE_SHIFT));
261 }
262 }
264 static void
265 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
266 {
267 unsigned long i;
268 volatile pmd_t *pmd = pmd_offset(pud, offset);
270 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
271 if (!pmd_present(*pmd)) // acquire semantics
272 continue;
273 mm_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
274 }
275 }
277 static void
278 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
279 {
280 unsigned long i;
281 volatile pud_t *pud = pud_offset(pgd, offset);
283 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
284 if (!pud_present(*pud)) // acquire semantics
285 continue;
286 mm_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
287 }
288 }
290 void
291 mm_teardown(struct domain* d)
292 {
293 struct mm_struct* mm = &d->arch.mm;
294 unsigned long i;
295 volatile pgd_t* pgd;
297 if (mm->pgd == NULL)
298 return;
300 pgd = pgd_offset(mm, 0);
301 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
302 if (!pgd_present(*pgd)) // acquire semantics
303 continue;
304 mm_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
305 }
306 }
308 static void
309 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
310 unsigned long offset)
311 {
312 pte_free_kernel(pte_offset_map(pmd, offset));
313 }
315 static void
316 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
317 unsigned long offset)
318 {
319 unsigned long i;
320 volatile pmd_t *pmd = pmd_offset(pud, offset);
322 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
323 if (!pmd_present(*pmd))
324 continue;
325 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
326 }
327 pmd_free(pmd_offset(pud, offset));
328 }
330 static void
331 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
332 unsigned long offset)
333 {
334 unsigned long i;
335 volatile pud_t *pud = pud_offset(pgd, offset);
337 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
338 if (!pud_present(*pud))
339 continue;
340 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
341 }
342 pud_free(pud_offset(pgd, offset));
343 }
345 static void
346 mm_p2m_teardown(struct domain* d)
347 {
348 struct mm_struct* mm = &d->arch.mm;
349 unsigned long i;
350 volatile pgd_t* pgd;
352 BUG_ON(mm->pgd == NULL);
353 pgd = pgd_offset(mm, 0);
354 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
355 if (!pgd_present(*pgd))
356 continue;
357 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
358 }
359 pgd_free(mm->pgd);
360 mm->pgd = NULL;
361 }
363 void
364 mm_final_teardown(struct domain* d)
365 {
366 if (d->arch.shadow_bitmap != NULL) {
367 xfree(d->arch.shadow_bitmap);
368 d->arch.shadow_bitmap = NULL;
369 }
370 mm_p2m_teardown(d);
371 }
373 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
374 void
375 share_xen_page_with_guest(struct page_info *page,
376 struct domain *d, int readonly)
377 {
378 if ( page_get_owner(page) == d )
379 return;
381 #if 1
382 if (readonly) {
383 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
384 }
385 #endif
387 // alloc_xenheap_pages() doesn't initialize page owner.
388 //BUG_ON(page_get_owner(page) != NULL);
390 spin_lock(&d->page_alloc_lock);
392 #ifndef __ia64__
393 /* The incremented type count pins as writable or read-only. */
394 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
395 page->u.inuse.type_info |= PGT_validated | 1;
396 #endif
398 page_set_owner(page, d);
399 wmb(); /* install valid domain ptr before updating refcnt. */
400 ASSERT(page->count_info == 0);
402 /* Only add to the allocation list if the domain isn't dying. */
403 if ( !d->is_dying )
404 {
405 page->count_info |= PGC_allocated | 1;
406 if ( unlikely(d->xenheap_pages++ == 0) )
407 get_knownalive_domain(d);
408 list_add_tail(&page->list, &d->xenpage_list);
409 }
411 // grant_table_destroy() releases these pages.
412 // but it doesn't clear their m2p entries, so stale entries might
413 // remain. Such stale entries are cleared here.
414 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
416 spin_unlock(&d->page_alloc_lock);
417 }
419 void
420 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
421 {
422 share_xen_page_with_guest(page, dom_xen, readonly);
423 }
425 unsigned long
426 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
427 {
428 unsigned long pte;
430 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
431 if (!pte) {
432 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
433 }
434 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
435 }
437 // given a domain virtual address, pte and page size, extract the metaphysical
438 // address, convert the pte to a physical address for the (possibly different)
439 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
440 // PAGE_SIZE!)
441 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
442 struct p2m_entry* entry)
443 {
444 struct domain *d = current->domain;
445 ia64_itir_t itir = {.itir = itir__};
446 u64 mask, mpaddr, pteval2;
447 u64 arflags;
448 u64 arflags2;
449 u64 maflags2;
451 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
453 // FIXME address had better be pre-validated on insert
454 mask = ~itir_mask(itir.itir);
455 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
457 if (itir.ps > PAGE_SHIFT)
458 itir.ps = PAGE_SHIFT;
460 *logps = itir.ps;
462 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
464 /* Check access rights. */
465 arflags = pteval & _PAGE_AR_MASK;
466 arflags2 = pteval2 & _PAGE_AR_MASK;
467 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
468 #if 0
469 dprintk(XENLOG_WARNING,
470 "%s:%d "
471 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
472 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
473 __func__, __LINE__,
474 pteval, arflags, address, itir__,
475 pteval2, arflags2, mpaddr);
476 #endif
477 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
478 }
480 /* Check memory attribute. The switch is on the *requested* memory
481 attribute. */
482 maflags2 = pteval2 & _PAGE_MA_MASK;
483 switch (pteval & _PAGE_MA_MASK) {
484 case _PAGE_MA_NAT:
485 /* NaT pages are always accepted! */
486 break;
487 case _PAGE_MA_UC:
488 case _PAGE_MA_UCE:
489 case _PAGE_MA_WC:
490 if (maflags2 == _PAGE_MA_WB) {
491 /* Don't let domains WB-map uncached addresses.
492 This can happen when domU tries to touch i/o
493 port space. Also prevents possible address
494 aliasing issues. */
495 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE))
496 gdprintk(XENLOG_WARNING, "Warning: UC to WB "
497 "for mpaddr=%lx\n", mpaddr);
498 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
499 }
500 break;
501 case _PAGE_MA_WB:
502 if (maflags2 != _PAGE_MA_WB) {
503 /* Forbid non-coherent access to coherent memory. */
504 panic_domain(NULL, "try to use WB mem attr on "
505 "UC page, mpaddr=%lx\n", mpaddr);
506 }
507 break;
508 default:
509 panic_domain(NULL, "try to use unknown mem attribute\n");
510 }
512 /* If shadow mode is enabled, virtualize dirty bit. */
513 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
514 u64 mp_page = mpaddr >> PAGE_SHIFT;
515 pteval |= _PAGE_VIRT_D;
517 /* If the page is not already dirty, don't set the dirty bit! */
518 if (mp_page < d->arch.shadow_bitmap_size * 8
519 && !test_bit(mp_page, d->arch.shadow_bitmap))
520 pteval &= ~_PAGE_D;
521 }
523 /* Ignore non-addr bits of pteval2 and force PL0->2
524 (PL3 is unaffected) */
525 return (pteval & ~_PAGE_PPN_MASK) |
526 (pteval2 & _PAGE_PPN_MASK) | _PAGE_PL_2;
527 }
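// A rough sketch of the intended use, assuming the itc emulation path
// described in the NOTES above (the actual callers live outside this file,
// and 'guest_pte', 'vaddr' and 'itir' are illustrative names only):
//
//     u64 logps;
//     struct p2m_entry entry;
//     u64 mpte = translate_domain_pte(guest_pte, vaddr, itir, &logps, &entry);
//     /* insert mpte at vaddr with page size logps via vcpu_itc_no_srlz(),
//        then recheck 'entry' and retry if the p2m entry changed */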
529 // given a current domain metaphysical address, return the physical address
530 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
531 struct p2m_entry* entry)
532 {
533 unsigned long pteval;
535 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
536 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
537 }
539 //XXX should !xxx_present() be used instead of !xxx_none()?
540 // pud, pmd and pte pages are zero-cleared when they are allocated.
541 // Their contents must be visible before population, so the
542 // cmpxchg must have release semantics.
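// The function below walks pgd -> pud -> pmd, allocating any missing level
// with xxx_alloc_one() and publishing it with xxx_cmpxchg_rel(); a vcpu that
// loses the cmpxchg race frees its freshly allocated page and retries.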
543 static volatile pte_t*
544 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
545 {
546 struct mm_struct *mm = &d->arch.mm;
547 volatile pgd_t *pgd;
548 volatile pud_t *pud;
549 volatile pmd_t *pmd;
551 BUG_ON(mm->pgd == NULL);
553 pgd = pgd_offset(mm, mpaddr);
554 again_pgd:
555 if (unlikely(pgd_none(*pgd))) { // acquire semantics
556 pud_t *old_pud = NULL;
557 pud = pud_alloc_one(mm, mpaddr);
558 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
559 pud_free(pud);
560 goto again_pgd;
561 }
562 }
564 pud = pud_offset(pgd, mpaddr);
565 again_pud:
566 if (unlikely(pud_none(*pud))) { // acquire semantics
567 pmd_t* old_pmd = NULL;
568 pmd = pmd_alloc_one(mm, mpaddr);
569 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
570 pmd_free(pmd);
571 goto again_pud;
572 }
573 }
575 pmd = pmd_offset(pud, mpaddr);
576 again_pmd:
577 if (unlikely(pmd_none(*pmd))) { // acquire semantics
578 pte_t* old_pte = NULL;
579 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
580 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
581 pte_free_kernel(pte);
582 goto again_pmd;
583 }
584 }
586 return pte_offset_map(pmd, mpaddr);
587 }
589 //XXX should xxx_none() be used instead of !xxx_present()?
590 volatile pte_t*
591 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
592 {
593 struct mm_struct *mm = &d->arch.mm;
594 volatile pgd_t *pgd;
595 volatile pud_t *pud;
596 volatile pmd_t *pmd;
598 BUG_ON(mm->pgd == NULL);
599 pgd = pgd_offset(mm, mpaddr);
600 if (unlikely(!pgd_present(*pgd))) // acquire semantics
601 return NULL;
603 pud = pud_offset(pgd, mpaddr);
604 if (unlikely(!pud_present(*pud))) // acquire semantics
605 return NULL;
607 pmd = pmd_offset(pud, mpaddr);
608 if (unlikely(!pmd_present(*pmd))) // acquire semantics
609 return NULL;
611 return pte_offset_map(pmd, mpaddr);
612 }
614 static volatile pte_t*
615 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
616 {
617 struct mm_struct *mm = &d->arch.mm;
618 volatile pgd_t *pgd;
619 volatile pud_t *pud;
620 volatile pmd_t *pmd;
622 BUG_ON(mm->pgd == NULL);
623 pgd = pgd_offset(mm, mpaddr);
624 if (unlikely(pgd_none(*pgd))) // acquire semantics
625 return NULL;
627 pud = pud_offset(pgd, mpaddr);
628 if (unlikely(pud_none(*pud))) // acquire semantics
629 return NULL;
631 pmd = pmd_offset(pud, mpaddr);
632 if (unlikely(pmd_none(*pmd))) // acquire semantics
633 return NULL;
635 return pte_offset_map(pmd, mpaddr);
636 }
638 unsigned long
639 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
640 {
641 volatile pte_t *pte;
643 pte = lookup_noalloc_domain_pte(d, mpaddr);
644 if (pte == NULL)
645 return INVALID_MFN;
647 if (pte_present(*pte))
648 return (pte->pte & _PFN_MASK);
649 else if (VMX_DOMAIN(d->vcpu[0]))
650 return GPFN_INV_MASK;
651 return INVALID_MFN;
652 }
654 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
655 struct p2m_entry* entry)
656 {
657 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
659 if (pte != NULL) {
660 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
661 if (pte_present(tmp_pte)) {
662 if (entry != NULL)
663 p2m_entry_set(entry, pte, tmp_pte);
664 return pte_val(tmp_pte);
665 } else if (VMX_DOMAIN(d->vcpu[0]))
666 return GPFN_INV_MASK;
667 }
669 if (mpaddr < d->arch.convmem_end) {
670 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
671 "0x%lx (< 0x%lx)\n", current->vcpu_id, PSCB(current, iip),
672 mpaddr, d->arch.convmem_end);
673 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
674 /* Log I/O port probing, but complain less loudly about it */
675 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
676 "0x%lx\n", current->vcpu_id, PSCB(current, iip),
677 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
678 } else {
679 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa 0x%lx "
680 "(=> 0x%lx)\n", current->vcpu_id, PSCB(current, iip),
681 mpaddr, d->arch.convmem_end);
682 }
684 if (entry != NULL)
685 p2m_entry_set(entry, NULL, __pte(0));
686 //XXX This is a workaround until emulation of memory accesses to a region
687 // where memory or a device is attached is implemented.
688 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
689 }
691 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
692 #if 1
693 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
694 {
695 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
696 unsigned long imva;
698 pte &= _PAGE_PPN_MASK;
699 imva = (unsigned long) __va(pte);
700 imva |= mpaddr & ~PAGE_MASK;
701 return (void*)imva;
702 }
703 #else
704 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
705 {
706 unsigned long imva = __gpa_to_mpa(d, mpaddr);
708 return (void *)__va(imva);
709 }
710 #endif
712 unsigned long
713 xencomm_paddr_to_maddr(unsigned long paddr)
714 {
715 struct vcpu *v = current;
716 struct domain *d = v->domain;
717 u64 pa;
719 pa = ____lookup_domain_mpa(d, paddr);
720 if (pa == INVALID_MFN) {
721 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
722 __func__, paddr, vcpu_regs(v)->cr_iip);
723 return 0;
724 }
725 return __va_ul((pa & _PFN_MASK) | (paddr & ~PAGE_MASK));
726 }
728 /* Allocate a new page for domain and map it to the specified metaphysical
729 address. */
730 static struct page_info *
731 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
732 volatile pte_t* pte)
733 {
734 struct page_info *p;
735 unsigned long maddr;
737 BUG_ON(!pte_none(*pte));
739 p = alloc_domheap_page(d);
740 if (unlikely(!p)) {
741 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
742 return(p);
743 }
745 // zero out pages for security reasons
746 clear_page(page_to_virt(p));
747 maddr = page_to_maddr (p);
748 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
749 && maddr < __get_cpu_var(vhpt_pend))) {
750 /* FIXME: how can this happen ?
751 vhpt is allocated by alloc_domheap_page. */
752 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
753 maddr);
754 }
756 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
757 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
758 // because set_pte_rel() has release semantics
759 set_pte_rel(pte,
760 pfn_pte(maddr >> PAGE_SHIFT,
761 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
762 _PAGE_PL_2 | _PAGE_AR_RWX)));
764 smp_mb();
765 return p;
766 }
768 struct page_info *
769 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
770 {
771 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
773 if (!pte_none(*pte))
774 return NULL;
776 return __assign_new_domain_page(d, mpaddr, pte);
777 }
779 void
780 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
781 {
782 volatile pte_t *pte;
784 BUG_ON(d != dom0);
785 pte = lookup_alloc_domain_pte(d, mpaddr);
786 if (pte_none(*pte)) {
787 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
788 if (p == NULL) {
789 panic("%s: can't allocate page for dom0", __func__);
790 }
791 }
792 }
794 static unsigned long
795 flags_to_prot (unsigned long flags)
796 {
797 unsigned long res = _PAGE_PL_2 | __DIRTY_BITS;
799 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
800 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
801 #ifdef CONFIG_XEN_IA64_TLB_TRACK
802 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
803 #endif
804 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
806 return res;
807 }
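// For example, flags_to_prot(ASSIGN_readonly | ASSIGN_nocache) yields
// _PAGE_PL_2 | __DIRTY_BITS | _PAGE_AR_R | _PAGE_MA_UC, while with neither
// flag set the result is _PAGE_PL_2 | __DIRTY_BITS | _PAGE_AR_RWX | _PAGE_MA_WB.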
809 /* map a physical address to the specified metaphysical addr */
810 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
811 // This is called by assign_domain_mmio_page(),
812 // so accesses to the pte are racy.
813 int
814 __assign_domain_page(struct domain *d,
815 unsigned long mpaddr, unsigned long physaddr,
816 unsigned long flags)
817 {
818 volatile pte_t *pte;
819 pte_t old_pte;
820 pte_t new_pte;
821 pte_t ret_pte;
822 unsigned long prot = flags_to_prot(flags);
824 pte = lookup_alloc_domain_pte(d, mpaddr);
826 old_pte = __pte(0);
827 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
828 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
829 if (pte_val(ret_pte) == pte_val(old_pte)) {
830 smp_mb();
831 return 0;
832 }
834 // dom0 tried to map the real machine's I/O region, but failed.
835 // It is very likely that dom0 won't boot correctly because
836 // it can't access I/O. So complain here.
837 if ((flags & ASSIGN_nocache) &&
838 (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT) ||
839 !(pte_val(ret_pte) & _PAGE_MA_UC)))
840 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
841 "\talready assigned pte_val 0x%016lx\n"
842 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
843 __func__, __LINE__,
844 d, d->domain_id, pte_val(ret_pte),
845 mpaddr, physaddr, flags);
847 return -EAGAIN;
848 }
850 /* get_page() and map a physical address to the specified metaphysical addr */
851 void
852 assign_domain_page(struct domain *d,
853 unsigned long mpaddr, unsigned long physaddr)
854 {
855 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
857 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
858 BUG_ON(page->count_info != (PGC_allocated | 1));
859 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
860 // because __assign_domain_page() uses set_pte_rel() which has
861 // release semantics, smp_mb() isn't needed.
862 (void)__assign_domain_page(d, mpaddr, physaddr,
863 ASSIGN_writable | ASSIGN_pgc_allocated);
864 }
866 int
867 ioports_permit_access(struct domain *d, unsigned long fp, unsigned long lp)
868 {
869 int ret;
870 unsigned long off;
871 unsigned long fp_offset;
872 unsigned long lp_offset;
874 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
875 if (ret != 0)
876 return ret;
878 /* Domain 0 doesn't virtualize IO ports space. */
879 if (d == dom0)
880 return 0;
882 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
883 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
885 for (off = fp_offset; off <= lp_offset; off += PAGE_SIZE)
886 (void)__assign_domain_page(d, IO_PORTS_PADDR + off,
887 __pa(ia64_iobase) + off, ASSIGN_nocache);
889 return 0;
890 }
892 static int
893 ioports_has_allowed(struct domain *d, unsigned long fp, unsigned long lp)
894 {
895 unsigned long i;
896 for (i = fp; i < lp; i++)
897 if (rangeset_contains_singleton(d->arch.ioport_caps, i))
898 return 1;
899 return 0;
900 }
902 int
903 ioports_deny_access(struct domain *d, unsigned long fp, unsigned long lp)
904 {
905 int ret;
906 struct mm_struct *mm = &d->arch.mm;
907 unsigned long off;
908 unsigned long io_ports_base;
909 unsigned long fp_offset;
910 unsigned long lp_offset;
912 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
913 if (ret != 0)
914 return ret;
915 if (d == dom0)
916 io_ports_base = __pa(ia64_iobase);
917 else
918 io_ports_base = IO_PORTS_PADDR;
920 fp_offset = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
921 lp_offset = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
923 for (off = fp_offset; off < lp_offset; off += PAGE_SIZE) {
924 unsigned long mpaddr = io_ports_base + off;
925 unsigned long port;
926 volatile pte_t *pte;
927 pte_t old_pte;
929 port = IO_SPACE_SPARSE_DECODING (off);
930 if (port < fp || port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1 > lp) {
931 /* Maybe this covers an allowed port. */
932 if (ioports_has_allowed(d, port,
933 port + IO_SPACE_SPARSE_PORTS_PER_PAGE - 1))
934 continue;
935 }
937 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
938 BUG_ON(pte == NULL);
939 BUG_ON(pte_none(*pte));
941 // clear pte
942 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
943 }
944 domain_flush_vtlb_all(d);
945 return 0;
946 }
948 static void
949 assign_domain_same_page(struct domain *d,
950 unsigned long mpaddr, unsigned long size,
951 unsigned long flags)
952 {
953 //XXX optimization
954 unsigned long end = PAGE_ALIGN(mpaddr + size);
955 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
956 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
957 }
958 }
960 int
961 efi_mmio(unsigned long physaddr, unsigned long size)
962 {
963 void *efi_map_start, *efi_map_end;
964 u64 efi_desc_size;
965 void* p;
967 efi_map_start = __va(ia64_boot_param->efi_memmap);
968 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
969 efi_desc_size = ia64_boot_param->efi_memdesc_size;
971 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
972 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
973 unsigned long start = md->phys_addr;
974 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
976 if (start <= physaddr && physaddr < end) {
977 if ((physaddr + size) > end) {
978 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
979 __func__, physaddr, size);
980 return 0;
981 }
983 // for io space
984 if (md->type == EFI_MEMORY_MAPPED_IO ||
985 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
986 return 1;
987 }
989 // for runtime
990 // see efi_enter_virtual_mode(void)
991 // in linux/arch/ia64/kernel/efi.c
992 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
993 !(md->attribute & EFI_MEMORY_WB)) {
994 return 1;
995 }
997 return 0;
998 }
1000 if (physaddr < start) {
1001 break;
1002 }
1003 }
1005 return 1;
1006 }
1008 unsigned long
1009 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1010 unsigned long phys_addr, unsigned long size,
1011 unsigned long flags)
1012 {
1013 unsigned long addr = mpaddr & PAGE_MASK;
1014 unsigned long end = PAGE_ALIGN(mpaddr + size);
1016 if (size == 0) {
1017 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1018 __func__, d, mpaddr, size);
1019 }
1020 if (!efi_mmio(mpaddr, size)) {
1021 #ifndef NDEBUG
1022 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1023 __func__, d, mpaddr, size);
1024 #endif
1025 return -EINVAL;
1026 }
1028 for (phys_addr &= PAGE_MASK; addr < end;
1029 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1030 __assign_domain_page(d, addr, phys_addr, flags);
1031 }
1033 return mpaddr;
1034 }
1036 unsigned long
1037 assign_domain_mach_page(struct domain *d,
1038 unsigned long mpaddr, unsigned long size,
1039 unsigned long flags)
1040 {
1041 BUG_ON(flags & ASSIGN_pgc_allocated);
1042 assign_domain_same_page(d, mpaddr, size, flags);
1043 return mpaddr;
1044 }
1046 static void
1047 adjust_page_count_info(struct page_info* page)
1049 struct domain* d = page_get_owner(page);
1050 BUG_ON((page->count_info & PGC_count_mask) != 1);
1051 if (d != NULL) {
1052 int ret = get_page(page, d);
1053 BUG_ON(ret == 0);
1054 } else {
1055 u64 x, nx, y;
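// The page has no owner here (the memory_exchange() case noted in
// domain_put_page() below), so get_page() can't be used; bump
// count_info from 1 to 2 directly with cmpxchg instead.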
1057 y = *((u64*)&page->count_info);
1058 do {
1059 x = y;
1060 nx = x + 1;
1062 BUG_ON((x >> 32) != 0);
1063 BUG_ON((nx & PGC_count_mask) != 2);
1064 y = cmpxchg((u64*)&page->count_info, x, nx);
1065 } while (unlikely(y != x));
1069 static void
1070 domain_put_page(struct domain* d, unsigned long mpaddr,
1071 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1073 unsigned long mfn = pte_pfn(old_pte);
1074 struct page_info* page = mfn_to_page(mfn);
1076 if (pte_pgc_allocated(old_pte)) {
1077 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1078 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1079 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1080 } else {
1081 BUG();
1084 if (likely(clear_PGC_allocate)) {
1085 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1086 BUG();
1087 /* put_page() is done by domain_page_flush_and_put() */
1088 } else {
1089 // In this case, the page reference count mustn't be touched.
1090 // domain_page_flush_and_put() decrements it, so we increment
1091 // it in advance. This is the slow path.
1092 //
1093 // guest_remove_page(): owner = d, count_info = 1
1094 // memory_exchange(): owner = NULL, count_info = 1
1095 adjust_page_count_info(page);
1098 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1101 // The caller must get_page(mfn_to_page(mfn)) before calling this.
1102 // The caller must call set_gpfn_from_mfn() beforehand if necessary, and
1103 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1104 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1105 // flags: ASSIGN_xxx
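// Example callers in this file: create_grant_host_mapping() and
// __dom0vp_add_physmap() take the page reference with get_page() first;
// guest_physmap_add_page() does set_gpfn_from_mfn() followed by smp_mb()
// before calling in, matching the barrier requirement above.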
1106 static void
1107 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1108 unsigned long mfn, unsigned long flags)
1110 struct mm_struct *mm = &d->arch.mm;
1111 volatile pte_t* pte;
1112 pte_t old_pte;
1113 pte_t npte;
1114 unsigned long prot = flags_to_prot(flags);
1116 pte = lookup_alloc_domain_pte(d, mpaddr);
1118 // update pte
1119 npte = pfn_pte(mfn, __pgprot(prot));
1120 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1121 if (pte_mem(old_pte)) {
1122 unsigned long old_mfn = pte_pfn(old_pte);
1124 // The mfn == old_mfn case can happen when a domain maps a granted page
1125 // twice with the same pseudo physical address.
1126 // It's nonsensical, but allowed.
1127 // __gnttab_map_grant_ref()
1128 // => create_host_mapping()
1129 // => assign_domain_page_replace()
1130 if (mfn != old_mfn) {
1131 domain_put_page(d, mpaddr, pte, old_pte, 1);
1134 perfc_incr(assign_domain_page_replace);
1137 // The caller must get_page(new_page) before calling this.
1138 // Only steal_page() calls this function.
1139 static int
1140 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1141 struct page_info* old_page,
1142 struct page_info* new_page,
1143 unsigned long flags, int clear_PGC_allocate)
1145 struct mm_struct *mm = &d->arch.mm;
1146 volatile pte_t* pte;
1147 unsigned long old_mfn;
1148 unsigned long old_prot;
1149 pte_t old_pte;
1150 unsigned long new_mfn;
1151 unsigned long new_prot;
1152 pte_t new_pte;
1153 pte_t ret_pte;
1155 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1156 pte = lookup_alloc_domain_pte(d, mpaddr);
1158 again:
1159 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1160 old_mfn = page_to_mfn(old_page);
1161 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1162 if (!pte_present(old_pte)) {
1163 gdprintk(XENLOG_INFO,
1164 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1165 __func__, pte_val(old_pte), old_prot, old_mfn);
1166 return -EINVAL;
1169 new_prot = flags_to_prot(flags);
1170 new_mfn = page_to_mfn(new_page);
1171 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1173 // update pte
1174 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1175 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1176 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1177 goto again;
1180 gdprintk(XENLOG_INFO,
1181 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1182 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1183 __func__,
1184 pte_val(old_pte), old_prot, old_mfn,
1185 pte_val(ret_pte), pte_pfn(ret_pte));
1186 return -EINVAL;
1189 BUG_ON(!pte_mem(old_pte));
1190 BUG_ON(!pte_pgc_allocated(old_pte));
1191 BUG_ON(page_get_owner(old_page) != d);
1192 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1193 BUG_ON(old_mfn == new_mfn);
1195 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1196 if (likely(clear_PGC_allocate)) {
1197 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1198 BUG();
1199 } else {
1200 int ret;
1201 // adjust count_info for domain_page_flush_and_put()
1202 // This is the slow path.
1203 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1204 BUG_ON(d == NULL);
1205 ret = get_page(old_page, d);
1206 BUG_ON(ret == 0);
1209 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1210 perfc_incr(assign_domain_pge_cmpxchg_rel);
1211 return 0;
1214 static void
1215 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1216 int clear_PGC_allocate, unsigned long mfn)
1218 struct mm_struct *mm = &d->arch.mm;
1219 volatile pte_t *pte;
1220 pte_t old_pte;
1221 struct page_info *page;
1223 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1224 if (pte == NULL)
1225 return;
1226 if (pte_none(*pte))
1227 return;
1229 if (mfn == INVALID_MFN) {
1230 // clear pte
1231 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1232 mfn = pte_pfn(old_pte);
1233 } else {
1234 unsigned long old_arflags;
1235 pte_t new_pte;
1236 pte_t ret_pte;
1238 again:
1239 // memory_exchange() calls guest_physmap_remove_page() with
1240 // a stolen page, i.e. page owner == NULL.
1241 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1242 page_get_owner(mfn_to_page(mfn)) != NULL);
1243 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1244 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1245 new_pte = __pte(0);
1247 // update pte
1248 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1249 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1250 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1251 goto again;
1254 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1255 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1256 __func__,
1257 pte_val(old_pte), old_arflags, mfn,
1258 pte_val(ret_pte), pte_pfn(ret_pte));
1259 return;
1261 BUG_ON(mfn != pte_pfn(ret_pte));
1264 page = mfn_to_page(mfn);
1265 BUG_ON((page->count_info & PGC_count_mask) == 0);
1267 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1268 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1269 perfc_incr(zap_dcomain_page_one);
1272 unsigned long
1273 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1274 unsigned int extent_order)
1276 if (extent_order != 0) {
1277 //XXX
1278 return -ENOSYS;
1281 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1282 perfc_incr(dom0vp_zap_physmap);
1283 return 0;
1286 static unsigned long
1287 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1288 unsigned long mfn_or_gmfn,
1289 unsigned long flags, domid_t domid, int is_gmfn)
1291 int error = -EINVAL;
1292 struct domain* rd;
1293 unsigned long mfn;
1295 /* Not allowed by a domain. */
1296 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1297 return -EINVAL;
1299 rd = get_domain_by_id(domid);
1300 if (unlikely(rd == NULL)) {
1301 switch (domid) {
1302 case DOMID_XEN:
1303 rd = dom_xen;
1304 break;
1305 case DOMID_IO:
1306 rd = dom_io;
1307 break;
1308 default:
1309 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1310 "pgfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1311 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1312 return -ESRCH;
1314 BUG_ON(rd == NULL);
1315 get_knownalive_domain(rd);
1318 if (unlikely(rd == d))
1319 goto out1;
1320 /*
1321 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1322 * It can be considered that their p2m conversion is p==m.
1323 */
1324 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1325 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1326 else
1327 mfn = mfn_or_gmfn;
1328 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1329 goto out1;
1331 error = 0;
1332 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1333 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1334 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1335 //don't update p2m table because this page belongs to rd, not d.
1336 perfc_incr(dom0vp_add_physmap);
1337 out1:
1338 put_domain(rd);
1339 return error;
1342 unsigned long
1343 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1344 unsigned long flags, domid_t domid)
1346 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1349 unsigned long
1350 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1351 unsigned long gmfn, unsigned long flags,
1352 domid_t domid)
1354 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1357 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1358 static struct page_info* p2m_pte_zero_page = NULL;
1360 void
1361 expose_p2m_init(void)
1363 pte_t* pte;
1365 pte = pte_alloc_one_kernel(NULL, 0);
1366 BUG_ON(pte == NULL);
1367 smp_mb();// make contents of the page visible.
1368 p2m_pte_zero_page = virt_to_page(pte);
1371 static int
1372 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1374 // we can't get_page(page) here.
1375 // The pte page is allocated from the xen heap (see pte_alloc_one_kernel()),
1376 // so the page has a NULL page owner and its reference count
1377 // is useless.
1378 // see also mm_teardown_pte()'s page_get_owner() == NULL check.
1379 BUG_ON(page_get_owner(page) != NULL);
1381 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1382 ASSIGN_readonly);
1385 // It is possible to optimize the loop, but this isn't performance critical.
1386 unsigned long
1387 dom0vp_expose_p2m(struct domain* d,
1388 unsigned long conv_start_gpfn,
1389 unsigned long assign_start_gpfn,
1390 unsigned long expose_size, unsigned long granule_pfn)
1392 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1393 unsigned long i;
1394 volatile pte_t* conv_pte;
1395 volatile pte_t* assign_pte;
1397 if ((expose_size % PAGE_SIZE) != 0 ||
1398 (granule_pfn % PTRS_PER_PTE) != 0 ||
1399 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1400 (conv_start_gpfn % granule_pfn) != 0 ||
1401 (assign_start_gpfn % granule_pfn) != 0 ||
1402 (expose_num_pfn % granule_pfn) != 0) {
1403 gdprintk(XENLOG_INFO,
1404 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1405 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1406 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1407 return -EINVAL;
1410 if (granule_pfn != PTRS_PER_PTE) {
1411 gdprintk(XENLOG_INFO,
1412 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1413 __func__, granule_pfn, PTRS_PER_PTE);
1414 return -ENOSYS;
1417 // allocate pgd, pmd.
1418 i = conv_start_gpfn;
1419 while (i < expose_num_pfn) {
1420 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1421 PAGE_SHIFT);
1422 if (conv_pte == NULL) {
1423 i++;
1424 continue;
1427 assign_pte = lookup_alloc_domain_pte(d, (assign_start_gpfn <<
1428 PAGE_SHIFT) + i * sizeof(pte_t));
1429 if (assign_pte == NULL) {
1430 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n", __func__);
1431 return -ENOMEM;
1434 // skip to next pte page
1435 i += PTRS_PER_PTE;
1436 i &= ~(PTRS_PER_PTE - 1);
1439 // expose pte page
1440 i = 0;
1441 while (i < expose_num_pfn) {
1442 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1443 PAGE_SHIFT);
1444 if (conv_pte == NULL) {
1445 i++;
1446 continue;
1449 if (expose_p2m_page(d, (assign_start_gpfn << PAGE_SHIFT) +
1450 i * sizeof(pte_t), virt_to_page(conv_pte)) < 0) {
1451 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1452 return -EAGAIN;
1455 // skip to next pte page
1456 i += PTRS_PER_PTE;
1457 i &= ~(PTRS_PER_PTE - 1);
1460 // expose p2m_pte_zero_page
1461 for (i = 0; i < expose_num_pfn / PTRS_PER_PTE + 1; i++) {
1462 assign_pte = lookup_noalloc_domain_pte(d, (assign_start_gpfn + i) <<
1463 PAGE_SHIFT);
1464 if (assign_pte == NULL || pte_present(*assign_pte))
1465 continue;
1467 if (expose_p2m_page(d, (assign_start_gpfn + i) << PAGE_SHIFT,
1468 p2m_pte_zero_page) < 0) {
1469 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n", __func__);
1470 return -EAGAIN;
1474 return 0;
1476 #endif
1478 // grant table host mapping
1479 // mpaddr: host_addr: pseudo physical address
1480 // mfn: frame: machine page frame
1481 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1482 int
1483 create_grant_host_mapping(unsigned long gpaddr,
1484 unsigned long mfn, unsigned int flags)
1486 struct domain* d = current->domain;
1487 struct page_info* page;
1488 int ret;
1490 if (flags & (GNTMAP_device_map |
1491 GNTMAP_application_map | GNTMAP_contains_pte)) {
1492 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1493 return GNTST_general_error;
1496 BUG_ON(!mfn_valid(mfn));
1497 page = mfn_to_page(mfn);
1498 ret = get_page(page, page_get_owner(page));
1499 BUG_ON(ret == 0);
1500 assign_domain_page_replace(d, gpaddr, mfn,
1501 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1502 ASSIGN_tlb_track |
1503 #endif
1504 ((flags & GNTMAP_readonly) ?
1505 ASSIGN_readonly : ASSIGN_writable));
1506 perfc_incr(create_grant_host_mapping);
1507 return GNTST_okay;
1510 // grant table host unmapping
1511 int
1512 destroy_grant_host_mapping(unsigned long gpaddr,
1513 unsigned long mfn, unsigned int flags)
1515 struct domain* d = current->domain;
1516 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
1517 volatile pte_t* pte;
1518 unsigned long cur_arflags;
1519 pte_t cur_pte;
1520 pte_t new_pte;
1521 pte_t old_pte;
1522 struct page_info* page = mfn_to_page(mfn);
1524 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1525 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1526 return GNTST_general_error;
1529 pte = lookup_noalloc_domain_pte(d, gpaddr);
1530 if (pte == NULL) {
1531 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
1532 __func__, gpaddr, mfn);
1533 return GNTST_general_error;
1536 again:
1537 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1538 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1539 if (!pte_present(cur_pte) ||
1540 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
1541 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1542 __func__, gpaddr, mfn, pte_val(cur_pte));
1543 return GNTST_general_error;
1545 new_pte = __pte(0);
1547 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1548 if (unlikely(!pte_present(old_pte))) {
1549 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx"
1550 " cur_pte 0x%lx old_pte 0x%lx\n",
1551 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1552 return GNTST_general_error;
1554 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1555 if (pte_pfn(old_pte) == mfn) {
1556 goto again;
1558 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
1559 "0x%lx old_pte 0x%lx\n",
1560 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1561 return GNTST_general_error;
1563 BUG_ON(pte_pfn(old_pte) != mfn);
1565 /* try_to_clear_PGC_allocate(d, page) is not needed. */
1566 BUG_ON(page_get_owner(page) == d &&
1567 get_gpfn_from_mfn(mfn) == gpfn);
1568 BUG_ON(pte_pgc_allocated(old_pte));
1569 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
1571 perfc_incr(destroy_grant_host_mapping);
1572 return GNTST_okay;
1575 // heavily depends on the struct page layout.
1576 // gnttab_transfer() calls steal_page() with memflags = 0
1577 // For grant table transfer, we must fill the page.
1578 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount
1579 // For memory exchange, we don't have to fill the page because
1580 // memory_exchange() does it.
1581 int
1582 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1584 #if 0 /* if big endian */
1585 # error "implement big endian version of steal_page()"
1586 #endif
1587 u32 _d, _nd;
1588 u64 x, nx, y;
1590 if (page_get_owner(page) != d) {
1591 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
1592 __func__, d, page_get_owner(page));
1593 return -1;
1596 if (!(memflags & MEMF_no_refcount)) {
1597 unsigned long gpfn;
1598 struct page_info *new;
1599 unsigned long new_mfn;
1600 int ret;
1602 new = alloc_domheap_page(d);
1603 if (new == NULL) {
1604 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
1605 return -1;
1607 // zero out pages for security reasons
1608 clear_page(page_to_virt(new));
1609 // assign_domain_page_cmpxchg_rel() has release semantics
1610 // so smp_mb() isn't needed.
1612 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1613 if (gpfn == INVALID_M2P_ENTRY) {
1614 free_domheap_page(new);
1615 return -1;
1617 new_mfn = page_to_mfn(new);
1618 set_gpfn_from_mfn(new_mfn, gpfn);
1619 // smp_mb() isn't needed because assign_domain_pge_cmpxchg_rel()
1620 // has release semantics.
1622 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1623 ASSIGN_writable |
1624 ASSIGN_pgc_allocated, 0);
1625 if (ret < 0) {
1626 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
1627 ret);
1628 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1629 free_domheap_page(new);
1630 return -1;
1632 perfc_incr(steal_page_refcount);
1635 spin_lock(&d->page_alloc_lock);
1637 /*
1638 * The tricky bit: atomically release ownership while there is just one
1639 * benign reference to the page (PGC_allocated). If that reference
1640 * disappears then the deallocation routine will safely spin.
1641 */
1642 _d = pickle_domptr(d);
1643 y = *((u64*)&page->count_info);
1644 do {
1645 x = y;
1646 nx = x & 0xffffffff;
1647 // page->count_info: untouched
1648 // page->u.inuse._domain = 0;
1649 _nd = x >> 32;
1651 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
1652 (1 | PGC_allocated))) ||
1653 unlikely(_nd != _d)) {
1654 struct domain* nd = unpickle_domptr(_nd);
1655 if (nd == NULL) {
1656 gdprintk(XENLOG_INFO, "gnttab_transfer: "
1657 "Bad page %p: ed=%p(%u) 0x%x, "
1658 "sd=%p 0x%x,"
1659 " caf=%016lx, taf=%" PRtype_info
1660 " memflags 0x%x\n",
1661 (void *) page_to_mfn(page),
1662 d, d->domain_id, _d,
1663 nd, _nd,
1664 x,
1665 page->u.inuse.type_info,
1666 memflags);
1667 } else {
1668 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
1669 "Bad page %p: ed=%p(%u) 0x%x, "
1670 "sd=%p(%u) 0x%x,"
1671 " caf=%016lx, taf=%" PRtype_info
1672 " memflags 0x%x\n",
1673 (void *) page_to_mfn(page),
1674 d, d->domain_id, _d,
1675 nd, nd->domain_id, _nd,
1676 x,
1677 page->u.inuse.type_info,
1678 memflags);
1680 spin_unlock(&d->page_alloc_lock);
1681 return -1;
1684 y = cmpxchg((u64*)&page->count_info, x, nx);
1685 } while (unlikely(y != x));
1687 /*
1688 * Unlink from 'd'. At least one reference remains (now anonymous), so
1689 * noone else is spinning to try to delete this page from 'd'.
1690 */
1691 if ( !(memflags & MEMF_no_refcount) )
1692 d->tot_pages--;
1693 list_del(&page->list);
1695 spin_unlock(&d->page_alloc_lock);
1696 perfc_incr(steal_page);
1697 return 0;
1700 void
1701 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1702 unsigned long mfn)
1704 BUG_ON(!mfn_valid(mfn));
1705 BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
1706 set_gpfn_from_mfn(mfn, gpfn);
1707 smp_mb();
1708 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
1709 ASSIGN_writable | ASSIGN_pgc_allocated);
1711 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1713 perfc_incr(guest_physmap_add_page);
1716 void
1717 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1718 unsigned long mfn)
1720 BUG_ON(mfn == 0);//XXX
1721 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
1722 perfc_incr(guest_physmap_remove_page);
1725 static void
1726 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
1727 volatile pte_t* ptep, pte_t old_pte,
1728 struct page_info* page)
1730 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1731 struct tlb_track_entry* entry;
1732 #endif
1734 if (shadow_mode_enabled(d))
1735 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1737 #ifndef CONFIG_XEN_IA64_TLB_TRACK
1738 //XXX sledgehammer.
1739 // flush finer range.
1740 domain_flush_vtlb_all(d);
1741 put_page(page);
1742 #else
1743 switch (tlb_track_search_and_remove(d->arch.tlb_track,
1744 ptep, old_pte, &entry)) {
1745 case TLB_TRACK_NOT_TRACKED:
1746 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
1747 /* This page is zapped from this domain
1748 * by memory decrease or exchange or dom0vp_zap_physmap.
1749 * I.e. the page is zapped for returning this page to xen
1750 * (balloon driver or DMA page allocation) or
1751 * foreign domain mapped page is unmapped from the domain.
1752 * In the former case the page is to be freed so that
1753 * we can defer freeing page to batch.
1754 * In the latter case the page is unmapped so that
1755 * we need to flush it. But to optimize it, we
1756 * queue the page and flush vTLB only once.
1757 * I.e. The caller must call dfree_flush() explicitly.
1758 */
1759 domain_flush_vtlb_all(d);
1760 put_page(page);
1761 break;
1762 case TLB_TRACK_NOT_FOUND:
1763 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
1764 /* This page is zapped from this domain
1765 * by grant table page unmap.
1766 * Luckily the domain that mapped this page didn't
1767 * access it, so we don't have to flush the vTLB.
1768 * Probably the domain only did DMA.
1769 */
1770 /* do nothing */
1771 put_page(page);
1772 break;
1773 case TLB_TRACK_FOUND:
1774 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
1775 /* This page is zapped from this domain
1776 * by grant table page unmap.
1777 * Fortunately this page is accessed via only one virtual
1778 * memory address. So it is easy to flush it.
1779 */
1780 domain_flush_vtlb_track_entry(d, entry);
1781 tlb_track_free_entry(d->arch.tlb_track, entry);
1782 put_page(page);
1783 break;
1784 case TLB_TRACK_MANY:
1785 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
1786 /* This page is zapped from this domain
1787 * by grant table page unmap.
1788 * Unfortunately this page is accessed via many virtual
1789 * memory addresses (or too many times with a single virtual address),
1790 * so we gave up tracking virtual addresses;
1791 * a full vTLB flush is necessary.
1792 */
1793 domain_flush_vtlb_all(d);
1794 put_page(page);
1795 break;
1796 case TLB_TRACK_AGAIN:
1797 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
1798 BUG();
1799 break;
1801 #endif
1802 perfc_incr(domain_page_flush_and_put);
1805 int
1806 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1808 volatile pte_t * pte;
1810 pte = lookup_noalloc_domain_pte(d, mpaddr);
1811 if(pte != NULL && !pte_none(*pte))
1812 return 1;
1813 return 0;
1816 /* Flush cache of domain d. */
1817 void domain_cache_flush (struct domain *d, int sync_only)
1819 struct mm_struct *mm = &d->arch.mm;
1820 volatile pgd_t *pgd = mm->pgd;
1821 unsigned long maddr;
1822 int i,j,k, l;
1823 int nbr_page = 0;
1824 void (*flush_func)(unsigned long start, unsigned long end);
1825 extern void flush_dcache_range (unsigned long, unsigned long);
1827 if (sync_only)
1828 flush_func = &flush_icache_range;
1829 else
1830 flush_func = &flush_dcache_range;
1832 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1833 volatile pud_t *pud;
1834 if (!pgd_present(*pgd)) // acquire semantics
1835 continue;
1836 pud = pud_offset(pgd, 0);
1837 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1838 volatile pmd_t *pmd;
1839 if (!pud_present(*pud)) // acquire semantics
1840 continue;
1841 pmd = pmd_offset(pud, 0);
1842 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1843 volatile pte_t *pte;
1844 if (!pmd_present(*pmd)) // acquire semantics
1845 continue;
1846 pte = pte_offset_map(pmd, 0);
1847 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1848 if (!pte_present(*pte)) // acquire semantics
1849 continue;
1850 /* Convert PTE to maddr. */
1851 maddr = __va_ul (pte_val(*pte)
1852 & _PAGE_PPN_MASK);
1853 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1854 nbr_page++;
1859 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1862 #ifdef VERBOSE
1863 #define MEM_LOG(_f, _a...) \
1864 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1865 current->domain->domain_id , __LINE__ , ## _a )
1866 #else
1867 #define MEM_LOG(_f, _a...) ((void)0)
1868 #endif
1870 static void free_page_type(struct page_info *page, u32 type)
1871 {
1872 }
1874 static int alloc_page_type(struct page_info *page, u32 type)
1875 {
1876 return 1;
1877 }
1879 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1881 void *p = alloc_xenheap_pages(order);
1883 memset(p,0,PAGE_SIZE<<order);
1884 return (unsigned long)p;
1887 void __free_pages(struct page_info *page, unsigned int order)
1889 if (order) BUG();
1890 free_xenheap_page(page);
1893 static int opt_p2m_xenheap;
1894 boolean_param("p2m_xenheap", opt_p2m_xenheap);
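// With the default (opt_p2m_xenheap == 0), p2m page-table pages come from
// the domain heap; booting with the "p2m_xenheap" option makes
// pgtable_quicklist_alloc()/pgtable_quicklist_free() below use the xen heap instead.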
1896 void *pgtable_quicklist_alloc(void)
1898 void *p;
1899 if (!opt_p2m_xenheap) {
1900 struct page_info *page = alloc_domheap_page(NULL);
1901 if (page == NULL)
1902 return NULL;
1903 p = page_to_virt(page);
1904 clear_page(p);
1905 return p;
1907 p = alloc_xenheap_pages(0);
1908 if (p)
1909 clear_page(p);
1910 return p;
1913 void pgtable_quicklist_free(void *pgtable_entry)
1915 if (!opt_p2m_xenheap)
1916 free_domheap_page(virt_to_page(pgtable_entry));
1917 else
1918 free_xenheap_page(pgtable_entry);
1921 void put_page_type(struct page_info *page)
1923 u64 nx, x, y = page->u.inuse.type_info;
1925 again:
1926 do {
1927 x = y;
1928 nx = x - 1;
1930 ASSERT((x & PGT_count_mask) != 0);
1932 /*
1933 * The page should always be validated while a reference is held. The
1934 * exception is during domain destruction, when we forcibly invalidate
1935 * page-table pages if we detect a referential loop.
1936 * See domain.c:relinquish_list().
1937 */
1938 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
1940 if ( unlikely((nx & PGT_count_mask) == 0) )
1942 /* Record TLB information for flush later. Races are harmless. */
1943 page->tlbflush_timestamp = tlbflush_current_time();
1945 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1946 likely(nx & PGT_validated) )
1948 /*
1949 * Page-table pages must be unvalidated when count is zero. The
1950 * 'free' is safe because the refcnt is non-zero and validated
1951 * bit is clear => other ops will spin or fail.
1952 */
1953 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1954 x & ~PGT_validated)) != x) )
1955 goto again;
1956 /* We cleared the 'valid bit' so we do the clean up. */
1957 free_page_type(page, x);
1958 /* Carry on, but with the 'valid bit' now clear. */
1959 x &= ~PGT_validated;
1960 nx &= ~PGT_validated;
1964 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1968 int get_page_type(struct page_info *page, u32 type)
1970 u64 nx, x, y = page->u.inuse.type_info;
1972 ASSERT(!(type & ~PGT_type_mask));
1974 again:
1975 do {
1976 x = y;
1977 nx = x + 1;
1978 if ( unlikely((nx & PGT_count_mask) == 0) )
1980 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1981 return 0;
1983 else if ( unlikely((x & PGT_count_mask) == 0) )
1985 if ( (x & PGT_type_mask) != type )
1987 /*
1988 * On type change we check to flush stale TLB entries. This
1989 * may be unnecessary (e.g., page was GDT/LDT) but those
1990 * circumstances should be very rare.
1991 */
1992 cpumask_t mask =
1993 page_get_owner(page)->domain_dirty_cpumask;
1994 tlbflush_filter(mask, page->tlbflush_timestamp);
1996 if ( unlikely(!cpus_empty(mask)) )
1998 perfc_incr(need_flush_tlb_flush);
1999 flush_tlb_mask(mask);
2002 /* We lose existing type, back pointer, and validity. */
2003 nx &= ~(PGT_type_mask | PGT_validated);
2004 nx |= type;
2006 /* No special validation needed for writable pages. */
2007 /* Page tables and GDT/LDT need to be scanned for validity. */
2008 if ( type == PGT_writable_page )
2009 nx |= PGT_validated;
2012 else if ( unlikely((x & PGT_type_mask) != type) )
2014 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
2015 (type != PGT_l1_page_table) )
2016 MEM_LOG("Bad type (saw %08lx != exp %08x) "
2017 "for mfn %016lx (pfn %016lx)",
2018 x, type, page_to_mfn(page),
2019 get_gpfn_from_mfn(page_to_mfn(page)));
2020 return 0;
2022 else if ( unlikely(!(x & PGT_validated)) )
2024 /* Someone else is updating validation of this page. Wait... */
2025 while ( (y = page->u.inuse.type_info) == x )
2026 cpu_relax();
2027 goto again;
2030 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
2032 if ( unlikely(!(nx & PGT_validated)) )
2034 /* Try to validate page type; drop the new reference on failure. */
2035 if ( unlikely(!alloc_page_type(page, type)) )
2037 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
2038 ": caf=%08x taf=%" PRtype_info,
2039 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2040 type, page->count_info, page->u.inuse.type_info);
2041 /* Noone else can get a reference. We hold the only ref. */
2042 page->u.inuse.type_info = 0;
2043 return 0;
2046 /* Noone else is updating simultaneously. */
2047 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2050 return 1;
2053 int memory_is_conventional_ram(paddr_t p)
2055 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
2059 long
2060 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2062 switch (op) {
2063 case XENMEM_add_to_physmap:
2065 struct xen_add_to_physmap xatp;
2066 unsigned long prev_mfn, mfn = 0, gpfn;
2067 struct domain *d;
2069 if (copy_from_guest(&xatp, arg, 1))
2070 return -EFAULT;
2072 if (xatp.domid == DOMID_SELF) {
2073 d = get_current_domain();
2075 else if (!IS_PRIV(current->domain))
2076 return -EPERM;
2077 else if ((d = get_domain_by_id(xatp.domid)) == NULL)
2078 return -ESRCH;
2080 /* This hypercall is used for VT-i domain only */
2081 if (!VMX_DOMAIN(d->vcpu[0])) {
2082 put_domain(d);
2083 return -ENOSYS;
2086 switch (xatp.space) {
2087 case XENMAPSPACE_shared_info:
2088 if (xatp.idx == 0)
2089 mfn = virt_to_mfn(d->shared_info);
2090 break;
2091 case XENMAPSPACE_grant_table:
2092 spin_lock(&d->grant_table->lock);
2094 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
2095 (xatp.idx < max_nr_grant_frames))
2096 gnttab_grow_table(d, xatp.idx + 1);
2098 if (xatp.idx < nr_grant_frames(d->grant_table))
2099 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
2101 spin_unlock(&d->grant_table->lock);
2102 break;
2103 default:
2104 break;
2107 if (mfn == 0) {
2108 put_domain(d);
2109 return -EINVAL;
2112 LOCK_BIGLOCK(d);
2114 /* Check remapping necessity */
2115 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2116 if (mfn == prev_mfn)
2117 goto out;
2119 /* Remove previously mapped page if it was present. */
2120 if (prev_mfn && mfn_valid(prev_mfn)) {
2121 if (IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)))
2122 /* Xen heap frames are simply unhooked from this phys slot. */
2123 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2124 else
2125 /* Normal domain memory is freed, to avoid leaking memory. */
2126 guest_remove_page(d, xatp.gpfn);
2129 /* Unmap from old location, if any. */
2130 gpfn = get_gpfn_from_mfn(mfn);
2131 if (gpfn != INVALID_M2P_ENTRY)
2132 guest_physmap_remove_page(d, gpfn, mfn);
2134 /* Map at new location. */
2135 guest_physmap_add_page(d, xatp.gpfn, mfn);
2137 out:
2138 UNLOCK_BIGLOCK(d);
2140 put_domain(d);
2142 break;
2145 default:
2146 return -ENOSYS;
2149 return 0;
2152 /*
2153 * Local variables:
2154 * mode: C
2155 * c-set-style: "BSD"
2156 * c-basic-offset: 4
2157 * tab-width: 4
2158 * indent-tabs-mode: nil
2159 * End:
2160 */