ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 19131:46b4096813dc

[IA64] Fix populate-on-demand stub.

A return value of 1 is meant to inform decrease_reservation() that
everything has been taken care of by the PoD code and it doesn't need
to call guest_remove_page().
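Illustrative caller-side sketch (assuming the common-code hook is named
p2m_pod_decrease_reservation(); the exact code in decrease_reservation()
may differ):

    if ( p2m_pod_decrease_reservation(d, gmfn, extent_order) )
        continue;           /* PoD handled it; skip guest_remove_page() */
    if ( !guest_remove_page(d, gmfn) )
        goto out;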

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
author Isaku Yamahata <yamahata@valinux.co.jp>
date Mon Feb 02 11:11:36 2009 +0900 (2009-02-02)
parents c8962b24fb50
children af0da711bbdb
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure is accessed by
22 * any CPUs.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of a vcpu which isn't current.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that reading the VHPT is done by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert domain pseudo physical addresses
54 * to machine addresses. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * replace_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid race.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * The P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations;
102 * i.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table:
107 * i.e. P2M(M2P(m)) = m and M2P(P2M(p)) = p.
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here the memory barrier can be achieved by release semantics; see the sketch after this comment.
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the
124 * tlb cache globally. A tlb insert (vcpu_itc_no_srlz()) and a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically, so there is a race here.
127 *
128 * Here the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert.
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert tlb by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read, and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value is different
148 * from the value that was used, it retries (see the sketch after this comment).
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu,
158 * so before reading from or writing to a domain page, the page's reference count
159 * must be incremented.
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do this.
162 *
163 */
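/*
 * Illustrative sketches of the two protocols described above, kept under
 * "#if 0" because they are not part of this file.  The example_* functions
 * are hypothetical; the real code lives in assign_domain_page_replace(),
 * vcpu_itc_i()/vcpu_itc_d() and friends, and the signatures used here are
 * only approximations of the real ones.
 */
#if 0
/* P2M/M2P update with xchg: publish the new m2p entry, barrier, xchg the
 * p2m entry, then invalidate the old m2p entry. */
static void example_p2m_m2p_update(struct domain *d, unsigned long mpaddr,
                                   volatile pte_t *ptep, pte_t new_pte)
{
    unsigned long gpfn = mpaddr >> PAGE_SHIFT;
    pte_t old_pte;

    set_gpfn_from_mfn(pte_pfn(new_pte), gpfn);  /* new m2p entry           */
    wmb();                                      /* visible before the xchg */
    old_pte = ptep_xchg(&d->arch.mm, mpaddr, ptep, new_pte);
    /* xchg has acquire semantics, so the zap below is ordered after it.   */
    if (pte_mem(old_pte))
        set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY);
}

/* Sequence-lock style retry: remember the p2m entry, insert the tlb entry,
 * then re-check the p2m entry and retry if it changed under us. */
static void example_tlb_insert(struct vcpu *v, u64 vaddr, u64 pte, u64 itir)
{
    struct p2m_entry entry;
    u64 mpte;

 again:
    mpte = translate_domain_pte(pte, vaddr, itir, &itir, &entry);
    vcpu_itc_no_srlz(v, 2, vaddr, mpte, pte, itir, &entry);
    if (p2m_entry_retry(&entry))
        goto again;
}
#endif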
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <linux/sort.h>
179 #include <xen/guest_access.h>
180 #include <asm/page.h>
181 #include <asm/dom_fw_common.h>
182 #include <public/memory.h>
183 #include <asm/event.h>
184 #include <asm/debugger.h>
187 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING, _f "\n", ## _a)
189 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
190 volatile pte_t* ptep, pte_t old_pte,
191 struct page_info* page);
193 static void __xencomm_mark_dirty(struct domain *d,
194 unsigned long addr, unsigned int len);
196 extern unsigned long ia64_iobase;
198 struct domain *dom_xen, *dom_io;
200 /*
201 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
202 * If more reserved domain ids are introduced, this might be increased.
203 */
204 #define DOMID_P2M (0x7FF8U)
205 static struct domain *dom_p2m;
207 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
208 void
209 alloc_dom_xen_and_dom_io(void)
210 {
211 /*
212 * Initialise our DOMID_XEN domain.
213 * Any Xen-heap pages that we will allow to be mapped will have
214 * their domain field set to dom_xen.
215 */
216 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
217 BUG_ON(dom_xen == NULL);
219 /*
220 * Initialise our DOMID_IO domain.
221 * This domain owns I/O pages that are within the range of the page_info
222 * array. Mappings occur at the privilege level of the caller.
223 */
224 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
225 BUG_ON(dom_io == NULL);
226 }
228 static int
229 mm_teardown_can_skip(struct domain* d, unsigned long offset)
230 {
231 return d->arch.mm_teardown_offset > offset;
232 }
234 static void
235 mm_teardown_update_offset(struct domain* d, unsigned long offset)
236 {
237 d->arch.mm_teardown_offset = offset;
238 }
240 static void
241 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
242 {
243 pte_t old_pte;
244 unsigned long mfn;
245 struct page_info* page;
247 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
249 // vmx domains use bits[58:56] to distinguish io regions from memory.
250 // see vmx_build_physmap_table() in vmx_init.c
251 if (!pte_mem(old_pte))
252 return;
254 // domain might map IO space or acpi table pages. check it.
255 mfn = pte_pfn(old_pte);
256 if (!mfn_valid(mfn))
257 return;
258 page = mfn_to_page(mfn);
259 BUG_ON(page_get_owner(page) == NULL);
261 // A struct page_info corresponding to the mfn may or may not exist, depending
262 // on CONFIG_VIRTUAL_FRAME_TABLE.
263 // The above check is too simplistic.
264 // The right way is to check whether this page belongs to the io area or to acpi pages.
266 if (pte_pgc_allocated(old_pte)) {
267 BUG_ON(page_get_owner(page) != d);
268 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
269 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
270 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
271 put_page(page);
272 } else {
273 put_page(page);
274 }
275 }
277 static int
278 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
279 {
280 unsigned long i;
281 volatile pte_t* pte = pte_offset_map(pmd, offset);
283 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
284 unsigned long cur_offset = offset + (i << PAGE_SHIFT);
285 if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
286 continue;
287 if (!pte_present(*pte)) { // acquire semantics
288 mm_teardown_update_offset(d, cur_offset);
289 continue;
290 }
291 mm_teardown_update_offset(d, cur_offset);
292 mm_teardown_pte(d, pte, cur_offset);
293 if (hypercall_preempt_check())
294 return -EAGAIN;
295 }
296 return 0;
297 }
299 static int
300 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
301 {
302 unsigned long i;
303 volatile pmd_t *pmd = pmd_offset(pud, offset);
305 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
306 unsigned long cur_offset = offset + (i << PMD_SHIFT);
307 if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
308 continue;
309 if (!pmd_present(*pmd)) { // acquire semantics
310 mm_teardown_update_offset(d, cur_offset);
311 continue;
312 }
313 if (mm_teardown_pmd(d, pmd, cur_offset))
314 return -EAGAIN;
315 }
316 return 0;
317 }
319 static int
320 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
321 {
322 unsigned long i;
323 volatile pud_t *pud = pud_offset(pgd, offset);
325 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
326 unsigned long cur_offset = offset + (i << PUD_SHIFT);
327 #ifndef __PAGETABLE_PUD_FOLDED
328 if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
329 continue;
330 #endif
331 if (!pud_present(*pud)) { // acquire semantics
332 #ifndef __PAGETABLE_PUD_FOLDED
333 mm_teardown_update_offset(d, cur_offset);
334 #endif
335 continue;
336 }
337 if (mm_teardown_pud(d, pud, cur_offset))
338 return -EAGAIN;
339 }
340 return 0;
341 }
343 int
344 mm_teardown(struct domain* d)
345 {
346 struct mm_struct* mm = &d->arch.mm;
347 unsigned long i;
348 volatile pgd_t* pgd;
350 if (mm->pgd == NULL)
351 return 0;
353 pgd = pgd_offset(mm, 0);
354 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
355 unsigned long cur_offset = i << PGDIR_SHIFT;
357 if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
358 continue;
359 if (!pgd_present(*pgd)) { // acquire semantics
360 mm_teardown_update_offset(d, cur_offset);
361 continue;
362 }
363 if (mm_teardown_pgd(d, pgd, cur_offset))
364 return -EAGAIN;
365 }
367 foreign_p2m_destroy(d);
368 return 0;
369 }
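/*
 * Illustrative sketch (hypothetical example_relinquish(); the real caller
 * is the arch domain-destruction/relinquish path): mm_teardown() is
 * preemptible and resumes from d->arch.mm_teardown_offset, so the caller
 * simply retries while it keeps returning -EAGAIN.
 */
#if 0
static int example_relinquish(struct domain *d)
{
    int ret = mm_teardown(d);
    if (ret == -EAGAIN)
        return ret;     /* tell common code to re-invoke us later */
    /* ... tear down the remaining arch state ... */
    return ret;
}
#endif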
371 static void
372 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
373 unsigned long offset)
374 {
375 pte_free_kernel(pte_offset_map(pmd, offset));
376 }
378 static void
379 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
380 unsigned long offset)
381 {
382 unsigned long i;
383 volatile pmd_t *pmd = pmd_offset(pud, offset);
385 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
386 if (!pmd_present(*pmd))
387 continue;
388 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
389 }
390 pmd_free(pmd_offset(pud, offset));
391 }
393 static void
394 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
395 unsigned long offset)
396 {
397 unsigned long i;
398 volatile pud_t *pud = pud_offset(pgd, offset);
400 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
401 if (!pud_present(*pud))
402 continue;
403 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
404 }
405 pud_free(pud_offset(pgd, offset));
406 }
408 static void
409 mm_p2m_teardown(struct domain* d)
410 {
411 struct mm_struct* mm = &d->arch.mm;
412 unsigned long i;
413 volatile pgd_t* pgd;
415 BUG_ON(mm->pgd == NULL);
416 pgd = pgd_offset(mm, 0);
417 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
418 if (!pgd_present(*pgd))
419 continue;
420 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
421 }
422 pgd_free(mm->pgd);
423 mm->pgd = NULL;
424 }
426 void
427 mm_final_teardown(struct domain* d)
428 {
429 if (d->arch.shadow_bitmap != NULL) {
430 xfree(d->arch.shadow_bitmap);
431 d->arch.shadow_bitmap = NULL;
432 }
433 mm_p2m_teardown(d);
434 }
436 unsigned long
437 domain_get_maximum_gpfn(struct domain *d)
438 {
439 return (d->arch.convmem_end - 1) >> PAGE_SHIFT;
440 }
442 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
443 void
444 share_xen_page_with_guest(struct page_info *page,
445 struct domain *d, int readonly)
446 {
447 if ( page_get_owner(page) == d )
448 return;
450 #if 1
451 if (readonly) {
452 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
453 }
454 #endif
456 // alloc_xenheap_pages() doesn't initialize page owner.
457 //BUG_ON(page_get_owner(page) != NULL);
459 spin_lock(&d->page_alloc_lock);
461 #ifndef __ia64__
462 /* The incremented type count pins as writable or read-only. */
463 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
464 page->u.inuse.type_info |= PGT_validated | 1;
465 #endif
467 page_set_owner(page, d);
468 wmb(); /* install valid domain ptr before updating refcnt. */
469 ASSERT((page->count_info & ~PGC_xen_heap)== 0);
471 /* Only add to the allocation list if the domain isn't dying. */
472 if ( !d->is_dying )
473 {
474 page->count_info |= PGC_allocated | 1;
475 if ( unlikely(d->xenheap_pages++ == 0) )
476 get_knownalive_domain(d);
477 list_add_tail(&page->list, &d->xenpage_list);
478 }
480 // grant_table_destroy() releases these pages,
481 // but it doesn't clear their m2p entries, so stale entries
482 // might remain. Such a stale entry is cleared here.
483 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
485 spin_unlock(&d->page_alloc_lock);
486 }
488 void
489 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
490 {
491 share_xen_page_with_guest(page, dom_xen, readonly);
492 }
494 unsigned long
495 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
496 {
497 unsigned long pte;
499 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
500 if (!pte) {
501 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
502 }
504 if ((pte & _PAGE_IO) && is_hvm_domain(d))
505 return INVALID_MFN;
507 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
508 }
510 // given a domain virtual address, pte and pagesize, extract the metaphysical
511 // address, convert the pte to a physical address for the (possibly different)
512 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
513 // current->arch.vhpt_pg_shift!)
514 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* itir,
515 struct p2m_entry* entry)
516 {
517 struct domain *d = current->domain;
518 ia64_itir_t _itir = {.itir = itir__};
519 u64 mask, mpaddr, pteval2;
520 u64 arflags;
521 u64 arflags2;
522 u64 maflags2;
524 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
526 // FIXME address had better be pre-validated on insert
527 mask = ~itir_mask(_itir.itir);
528 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
530 if (_itir.ps > PAGE_SHIFT)
531 _itir.ps = PAGE_SHIFT;
533 ((ia64_itir_t*)itir)->itir = _itir.itir;/* Copy the whole register. */
534 ((ia64_itir_t*)itir)->ps = _itir.ps; /* Overwrite ps part! */
536 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
537 if (_itir.ps < PAGE_SHIFT)
538 pteval2 |= mpaddr & ~PAGE_MASK & ~((1L << _itir.ps) - 1);
540 /* Check access rights. */
541 arflags = pteval & _PAGE_AR_MASK;
542 arflags2 = pteval2 & _PAGE_AR_MASK;
543 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
544 #if 0
545 dprintk(XENLOG_WARNING,
546 "%s:%d "
547 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
548 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
549 __func__, __LINE__,
550 pteval, arflags, address, itir__,
551 pteval2, arflags2, mpaddr);
552 #endif
553 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
554 }
556 /* Check memory attribute. The switch is on the *requested* memory
557 attribute. */
558 maflags2 = pteval2 & _PAGE_MA_MASK;
559 switch (pteval & _PAGE_MA_MASK) {
560 case _PAGE_MA_NAT:
561 /* NaT pages are always accepted! */
562 break;
563 case _PAGE_MA_UC:
564 case _PAGE_MA_UCE:
565 case _PAGE_MA_WC:
566 if (maflags2 == _PAGE_MA_WB) {
567 /* Don't let domains WB-map uncached addresses.
568 This can happen when domU tries to touch i/o
569 port space. Also prevents possible address
570 aliasing issues. */
571 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) {
572 u64 ucwb;
574 /*
575 * If dom0 page has both UC & WB attributes
576 * don't warn about attempted UC access.
577 */
578 ucwb = efi_mem_attribute(mpaddr, PAGE_SIZE);
579 ucwb &= EFI_MEMORY_UC | EFI_MEMORY_WB;
580 ucwb ^= EFI_MEMORY_UC | EFI_MEMORY_WB;
582 if (d != dom0 || ucwb != 0)
583 gdprintk(XENLOG_WARNING, "Warning: UC"
584 " to WB for mpaddr=%lx\n",
585 mpaddr);
586 }
587 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
588 }
589 break;
590 case _PAGE_MA_WB:
591 if (maflags2 != _PAGE_MA_WB) {
592 /* Forbid non-coherent access to coherent memory. */
593 panic_domain(NULL, "try to use WB mem attr on "
594 "UC page, mpaddr=%lx\n", mpaddr);
595 }
596 break;
597 default:
598 panic_domain(NULL, "try to use unknown mem attribute\n");
599 }
601 /* If shadow mode is enabled, virtualize dirty bit. */
602 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
603 u64 mp_page = mpaddr >> PAGE_SHIFT;
604 pteval |= _PAGE_VIRT_D;
606 /* If the page is not already dirty, don't set the dirty bit! */
607 if (mp_page < d->arch.shadow_bitmap_size * 8
608 && !test_bit(mp_page, d->arch.shadow_bitmap))
609 pteval &= ~_PAGE_D;
610 }
612 /* Ignore non-addr bits of pteval2 and force PL0->1
613 (PL3 is unaffected) */
614 return (pteval & ~(_PAGE_PPN_MASK | _PAGE_PL_MASK)) |
615 (pteval2 & _PAGE_PPN_MASK) |
616 (vcpu_pl_adjust(pteval, 7) & _PAGE_PL_MASK);
617 }
619 // given a current domain metaphysical address, return the physical address
620 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
621 struct p2m_entry* entry)
622 {
623 unsigned long pteval;
625 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
626 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
627 }
629 //XXX !xxx_present() should be used instead of !xxx_none()?
630 // pud, pmd and pte pages are zero-cleared when they are allocated.
631 // Their contents must be visible before population, so the
632 // cmpxchg must have release semantics.
633 static volatile pte_t*
634 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
635 {
636 struct mm_struct *mm = &d->arch.mm;
637 volatile pgd_t *pgd;
638 volatile pud_t *pud;
639 volatile pmd_t *pmd;
641 BUG_ON(mm->pgd == NULL);
643 pgd = pgd_offset(mm, mpaddr);
644 again_pgd:
645 if (unlikely(pgd_none(*pgd))) { // acquire semantics
646 pud_t *old_pud = NULL;
647 pud = pud_alloc_one(mm, mpaddr);
648 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
649 pud_free(pud);
650 goto again_pgd;
651 }
652 }
654 pud = pud_offset(pgd, mpaddr);
655 again_pud:
656 if (unlikely(pud_none(*pud))) { // acquire semantics
657 pmd_t* old_pmd = NULL;
658 pmd = pmd_alloc_one(mm, mpaddr);
659 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
660 pmd_free(pmd);
661 goto again_pud;
662 }
663 }
665 pmd = pmd_offset(pud, mpaddr);
666 again_pmd:
667 if (unlikely(pmd_none(*pmd))) { // acquire semantics
668 pte_t* old_pte = NULL;
669 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
670 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
671 pte_free_kernel(pte);
672 goto again_pmd;
673 }
674 }
676 return pte_offset_map(pmd, mpaddr);
677 }
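/*
 * Illustrative use of lookup_alloc_domain_pte() (hypothetical
 * example_map_page(); __assign_domain_page() below is the real thing):
 * populate the tree, then install the leaf entry with an atomic
 * release-semantics update so the freshly allocated tables are visible
 * before the entry itself.
 */
#if 0
static int example_map_page(struct domain *d, unsigned long mpaddr,
                            unsigned long mfn, unsigned long prot)
{
    volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
    pte_t old_pte = __pte(0);
    pte_t new_pte = pfn_pte(mfn, __pgprot(prot));
    pte_t ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte,
                                     old_pte, new_pte);

    return pte_val(ret_pte) == pte_val(old_pte) ? 0 : -EAGAIN;
}
#endif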
679 //XXX xxx_none() should be used instead of !xxx_present()?
680 volatile pte_t*
681 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
682 {
683 struct mm_struct *mm = &d->arch.mm;
684 volatile pgd_t *pgd;
685 volatile pud_t *pud;
686 volatile pmd_t *pmd;
688 BUG_ON(mm->pgd == NULL);
689 pgd = pgd_offset(mm, mpaddr);
690 if (unlikely(!pgd_present(*pgd))) // acquire semantics
691 return NULL;
693 pud = pud_offset(pgd, mpaddr);
694 if (unlikely(!pud_present(*pud))) // acquire semantics
695 return NULL;
697 pmd = pmd_offset(pud, mpaddr);
698 if (unlikely(!pmd_present(*pmd))) // acquire semantics
699 return NULL;
701 return pte_offset_map(pmd, mpaddr);
702 }
704 static volatile pte_t*
705 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
706 {
707 struct mm_struct *mm = &d->arch.mm;
708 volatile pgd_t *pgd;
709 volatile pud_t *pud;
710 volatile pmd_t *pmd;
712 BUG_ON(mm->pgd == NULL);
713 pgd = pgd_offset(mm, mpaddr);
714 if (unlikely(pgd_none(*pgd))) // acquire semantics
715 return NULL;
717 pud = pud_offset(pgd, mpaddr);
718 if (unlikely(pud_none(*pud))) // acquire semantics
719 return NULL;
721 pmd = pmd_offset(pud, mpaddr);
722 if (unlikely(pmd_none(*pmd))) // acquire semantics
723 return NULL;
725 return pte_offset_map(pmd, mpaddr);
726 }
728 unsigned long
729 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
730 {
731 volatile pte_t *pte;
733 pte = lookup_noalloc_domain_pte(d, mpaddr);
734 if (pte == NULL)
735 return INVALID_MFN;
737 if (pte_present(*pte))
738 return (pte->pte & _PFN_MASK);
739 return INVALID_MFN;
740 }
742 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
743 struct p2m_entry* entry)
744 {
745 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
747 if (pte != NULL) {
748 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
749 if (pte_present(tmp_pte)) {
750 if (entry != NULL)
751 p2m_entry_set(entry, pte, tmp_pte);
752 return pte_val(tmp_pte);
753 } else if (is_hvm_domain(d))
754 return INVALID_MFN;
755 }
757 if (mpaddr < d->arch.convmem_end && !d->is_dying) {
758 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
759 "d %"PRId16" 0x%lx (< 0x%lx)\n",
760 current->vcpu_id, PSCB(current, iip),
761 d->domain_id, mpaddr, d->arch.convmem_end);
762 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
763 /* Log I/O port probing, but complain less loudly about it */
764 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
765 "d %"PRId16" 0x%lx\n",
766 current->vcpu_id, PSCB(current, iip), d->domain_id,
767 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
768 } else {
769 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
770 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
771 current->vcpu_id, PSCB(current, iip),
772 d->domain_id, mpaddr, d->arch.convmem_end);
773 }
775 debugger_event (XEN_IA64_DEBUG_ON_BAD_MPA);
777 if (entry != NULL)
778 p2m_entry_set(entry, NULL, __pte(0));
779 //XXX This is a workaround until emulation of memory accesses to a region
780 // where memory or a device is attached is implemented.
781 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_PRIV |
782 _PAGE_AR_RWX)));
783 }
785 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
786 #if 1
787 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
788 {
789 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
790 unsigned long imva;
792 pte &= _PAGE_PPN_MASK;
793 imva = (unsigned long) __va(pte);
794 imva |= mpaddr & ~PAGE_MASK;
795 return (void*)imva;
796 }
797 #else
798 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
799 {
800 unsigned long imva = __gpa_to_mpa(d, mpaddr);
802 return (void *)__va(imva);
803 }
804 #endif
806 unsigned long
807 paddr_to_maddr(unsigned long paddr)
808 {
809 struct vcpu *v = current;
810 struct domain *d = v->domain;
811 u64 pa;
813 pa = ____lookup_domain_mpa(d, paddr);
814 if (pa == INVALID_MFN) {
815 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
816 __func__, paddr, vcpu_regs(v)->cr_iip);
817 return 0;
818 }
819 return (pa & _PFN_MASK) | (paddr & ~PAGE_MASK);
820 }
822 /* Allocate a new page for domain and map it to the specified metaphysical
823 address. */
824 static struct page_info *
825 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
826 volatile pte_t* pte)
827 {
828 struct page_info *p;
829 unsigned long maddr;
831 BUG_ON(!pte_none(*pte));
833 p = alloc_domheap_page(d, 0);
834 if (unlikely(!p)) {
835 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
836 return(p);
837 }
839 // zero out pages for security reasons
840 clear_page(page_to_virt(p));
841 maddr = page_to_maddr (p);
842 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
843 && maddr < __get_cpu_var(vhpt_pend))) {
844 /* FIXME: how can this happen ?
845 vhpt is allocated by alloc_domheap_page. */
846 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
847 maddr);
848 }
850 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
851 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
852 // because set_pte_rel() has release semantics
853 set_pte_rel(pte,
854 pfn_pte(maddr >> PAGE_SHIFT,
855 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
856 _PAGE_PL_PRIV | _PAGE_AR_RWX)));
858 smp_mb();
859 return p;
860 }
862 struct page_info *
863 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
864 {
865 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
867 if (!pte_none(*pte))
868 return NULL;
870 return __assign_new_domain_page(d, mpaddr, pte);
871 }
873 void __init
874 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
875 {
876 volatile pte_t *pte;
878 BUG_ON(d != dom0);
879 pte = lookup_alloc_domain_pte(d, mpaddr);
880 if (pte_none(*pte)) {
881 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
882 if (p == NULL) {
883 panic("%s: can't allocate page for dom0\n", __func__);
884 }
885 }
886 }
888 static unsigned long
889 flags_to_prot (unsigned long flags)
890 {
891 unsigned long res = _PAGE_PL_PRIV | __DIRTY_BITS;
893 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
894 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
895 #ifdef CONFIG_XEN_IA64_TLB_TRACK
896 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
897 #endif
898 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
899 res |= flags & ASSIGN_io ? _PAGE_IO: 0;
901 return res;
902 }
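/*
 * Worked example of flags_to_prot(): ASSIGN_readonly | ASSIGN_nocache
 * expands to _PAGE_PL_PRIV | __DIRTY_BITS | _PAGE_AR_R | _PAGE_MA_UC,
 * while a plain writable, cacheable assignment (no flag bits set) yields
 * _PAGE_PL_PRIV | __DIRTY_BITS | _PAGE_AR_RWX | _PAGE_MA_WB.
 */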
904 /* map a physical address to the specified metaphysical addr */
905 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
906 // This is called by assign_domain_mmio_page(),
907 // so accesses to the pte are racy.
908 int
909 __assign_domain_page(struct domain *d,
910 unsigned long mpaddr, unsigned long physaddr,
911 unsigned long flags)
912 {
913 volatile pte_t *pte;
914 pte_t old_pte;
915 pte_t new_pte;
916 pte_t ret_pte;
917 unsigned long prot = flags_to_prot(flags);
919 pte = lookup_alloc_domain_pte(d, mpaddr);
921 old_pte = __pte(0);
922 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
923 again_hvm_page_io:
924 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
925 if (pte_val(ret_pte) == pte_val(old_pte)) {
926 smp_mb();
927 return 0;
928 }
929 /* in HVM guest, when VTD is enabled,
930 * P2M entry may change from _PAGE_IO type to real MMIO page
931 */
932 if(is_hvm_domain(d) && (pte_val(ret_pte) & _PAGE_IO) &&
933 !mfn_valid(physaddr >> PAGE_SHIFT)) {
934 old_pte = ret_pte;
935 goto again_hvm_page_io;
936 }
938 // dom0 tried to map the real machine's I/O region, but failed.
939 // It is very likely that dom0 won't boot correctly because
940 // it can't access I/O. So complain here.
941 if (flags & ASSIGN_nocache) {
942 int warn = 0;
944 if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT))
945 warn = 1;
946 else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) {
947 u32 type;
948 u64 attr;
950 warn = 1;
952 /*
953 * See
954 * complete_dom0_memmap()
955 * case EFI_RUNTIME_SERVICES_CODE:
956 * case EFI_RUNTIME_SERVICES_DATA:
957 * case EFI_ACPI_RECLAIM_MEMORY:
958 * case EFI_ACPI_MEMORY_NVS:
959 * case EFI_RESERVED_TYPE:
960 *
961 * Currently only EFI_RUNTIME_SERVICES_CODE has been seen,
962 * so we suppress only the EFI_RUNTIME_SERVICES_CODE case.
963 */
964 type = efi_mem_type(physaddr);
965 attr = efi_mem_attributes(physaddr);
966 if (type == EFI_RUNTIME_SERVICES_CODE &&
967 (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB))
968 warn = 0;
969 }
970 if (warn)
971 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
972 "\talready assigned pte_val 0x%016lx\n"
973 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
974 __func__, __LINE__,
975 d, d->domain_id, pte_val(ret_pte),
976 mpaddr, physaddr, flags);
977 }
979 return -EAGAIN;
980 }
982 /* get_page() and map a physical address to the specified metaphysical addr */
983 void
984 assign_domain_page(struct domain *d,
985 unsigned long mpaddr, unsigned long physaddr)
986 {
987 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
989 BUG_ON((physaddr & _PAGE_PPN_MASK) != physaddr);
990 BUG_ON((page->count_info & ~PGC_xen_heap) != (PGC_allocated | 1));
991 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
992 // because __assign_domain_page() uses set_pte_rel() which has
993 // release semantics, smp_mb() isn't needed.
994 (void)__assign_domain_page(d, mpaddr, physaddr,
995 ASSIGN_writable | ASSIGN_pgc_allocated);
996 }
998 static void
999 ioports_get_mmio_addr(const struct io_space *space,
1000 unsigned long fp, unsigned long lp,
1001 unsigned long *mmio_start, unsigned long *mmio_end)
1003 if (space->sparse) {
1004 *mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
1005 *mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
1006 } else {
1007 *mmio_start = fp & PAGE_MASK;
1008 *mmio_end = PAGE_ALIGN(lp);
1012 static unsigned long
1013 ioports_get_mmio_base(const struct io_space *space, struct domain *d)
1015 if (VMX_DOMAIN(d->vcpu[0]))
1016 return LEGACY_IO_START;
1018 if (space == &io_space[0] && d != dom0)
1019 return IO_PORTS_PADDR;
1021 return __pa(space->mmio_base);
1024 /*
1025 * Input:
1026 * fgp: first guest port
1027 * fmp: first machine port
1028 * lmp: last machine port
1029 */
1030 int
1031 ioports_permit_access(struct domain *d, unsigned int fgp,
1032 unsigned int fmp, unsigned int lmp)
1034 struct io_space *space;
1035 unsigned long mmio_start, mach_start, mach_end;
1036 int ret;
1038 if (IO_SPACE_NR(fmp) >= num_io_spaces) {
1039 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fmp, lmp);
1040 return -EFAULT;
1043 /*
1044 * The ioport_cap rangeset tracks the I/O port address including
1045 * the port space ID. This means port space IDs need to match
1046 * between Xen and dom0. This is also a requirement because
1047 * the hypercall to pass these port ranges only uses a u32.
1049 * NB - non-dom0 driver domains may only have a subset of the
1050 * I/O port spaces and thus will number port spaces differently.
1051 * This is ok, they don't make use of this interface.
1052 */
1053 ret = rangeset_add_range(d->arch.ioport_caps, fmp, lmp);
1054 if (ret != 0)
1055 return ret;
1057 space = &io_space[IO_SPACE_NR(fmp)];
1059 /* Legacy I/O on dom0 is already setup */
1060 if (d == dom0 && space == &io_space[0])
1061 return 0;
1063 fmp = IO_SPACE_PORT(fmp);
1064 lmp = IO_SPACE_PORT(lmp);
1066 ioports_get_mmio_addr(space, fmp, lmp, &mach_start, &mach_end);
1068 /*
1069 * The "machine first port" is not necessarily identity mapped
1070 * to the guest first port. At least for the legacy range.
1071 */
1072 mach_start = mach_start | __pa(space->mmio_base);
1073 mach_end = mach_end | __pa(space->mmio_base);
1075 mmio_start = IO_SPACE_SPARSE_ENCODING(fgp) & PAGE_MASK;
1076 mmio_start |= ioports_get_mmio_base(space, d);
1078 while (mach_start < mach_end) {
1079 (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache);
1080 mmio_start += PAGE_SIZE;
1081 mach_start += PAGE_SIZE;
1084 return 0;
1087 static int
1088 ioports_has_allowed(struct domain *d, unsigned int fp, unsigned int lp)
1090 for (; fp < lp; fp++)
1091 if (rangeset_contains_singleton(d->arch.ioport_caps, fp))
1092 return 1;
1094 return 0;
1097 int
1098 ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
1100 int ret;
1101 struct mm_struct *mm = &d->arch.mm;
1102 unsigned long mmio_start, mmio_end, mmio_base;
1103 unsigned int fp_base, lp_base;
1104 struct io_space *space;
1106 if (IO_SPACE_NR(fp) >= num_io_spaces) {
1107 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
1108 return -EFAULT;
1111 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
1112 if (ret != 0)
1113 return ret;
1115 space = &io_space[IO_SPACE_NR(fp)];
1116 fp_base = IO_SPACE_PORT(fp);
1117 lp_base = IO_SPACE_PORT(lp);
1119 ioports_get_mmio_addr(space, fp_base, lp_base, &mmio_start, &mmio_end);
1121 mmio_base = ioports_get_mmio_base(space, d);
1123 for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
1124 unsigned int port, range;
1125 unsigned long mpaddr;
1126 volatile pte_t *pte;
1127 pte_t old_pte;
1129 if (space->sparse) {
1130 port = IO_SPACE_SPARSE_DECODING(mmio_start);
1131 range = IO_SPACE_SPARSE_PORTS_PER_PAGE - 1;
1132 } else {
1133 port = mmio_start;
1134 range = PAGE_SIZE - 1;
1137 port |= IO_SPACE_BASE(IO_SPACE_NR(fp));
1139 if (port < fp || port + range > lp) {
1140 /* Maybe this covers an allowed port. */
1141 if (ioports_has_allowed(d, port, port + range))
1142 continue;
1145 mpaddr = mmio_start | mmio_base;
1146 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1147 BUG_ON(pte == NULL);
1148 BUG_ON(pte_none(*pte));
1150 /* clear pte */
1151 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1153 domain_flush_vtlb_all(d);
1154 return 0;
1157 static void
1158 assign_domain_same_page(struct domain *d,
1159 unsigned long mpaddr, unsigned long size,
1160 unsigned long flags)
1162 //XXX optimization
1163 unsigned long end = PAGE_ALIGN(mpaddr + size);
1164 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
1165 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
1169 int
1170 efi_mmio(unsigned long physaddr, unsigned long size)
1172 void *efi_map_start, *efi_map_end;
1173 u64 efi_desc_size;
1174 void* p;
1176 efi_map_start = __va(ia64_boot_param->efi_memmap);
1177 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1178 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1180 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1181 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1182 unsigned long start = md->phys_addr;
1183 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1185 if (start <= physaddr && physaddr < end) {
1186 if ((physaddr + size) > end) {
1187 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1188 __func__, physaddr, size);
1189 return 0;
1192 // for io space
1193 if (md->type == EFI_MEMORY_MAPPED_IO ||
1194 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1195 return 1;
1198 // for runtime
1199 // see efi_enter_virtual_mode(void)
1200 // in linux/arch/ia64/kernel/efi.c
1201 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1202 !(md->attribute & EFI_MEMORY_WB)) {
1203 return 1;
1206 return 0;
1209 if (physaddr < start) {
1210 break;
1214 return 1;
1217 unsigned long
1218 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1219 unsigned long phys_addr, unsigned long size,
1220 unsigned long flags)
1222 unsigned long addr = mpaddr & PAGE_MASK;
1223 unsigned long end = PAGE_ALIGN(mpaddr + size);
1225 if (size == 0) {
1226 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1227 __func__, d, mpaddr, size);
1229 if (!efi_mmio(phys_addr, size)) {
1230 #ifndef NDEBUG
1231 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1232 __func__, d, mpaddr, size);
1233 #endif
1234 return -EINVAL;
1237 for (phys_addr &= PAGE_MASK; addr < end;
1238 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1239 __assign_domain_page(d, addr, phys_addr, flags);
1242 return mpaddr;
1245 unsigned long
1246 assign_domain_mach_page(struct domain *d,
1247 unsigned long mpaddr, unsigned long size,
1248 unsigned long flags)
1250 BUG_ON(flags & ASSIGN_pgc_allocated);
1251 assign_domain_same_page(d, mpaddr, size, flags);
1252 return mpaddr;
1255 static void
1256 adjust_page_count_info(struct page_info* page)
1258 struct domain* d = page_get_owner(page);
1259 BUG_ON((page->count_info & PGC_count_mask) < 1);
1260 if (d != NULL) {
1261 int ret = get_page(page, d);
1262 BUG_ON(ret == 0);
1263 } else {
1264 unsigned long x, nx, y;
1266 y = page->count_info;
1267 do {
1268 x = y;
1269 nx = x + 1;
1271 BUG_ON((x >> 32) != 0);
1272 BUG_ON((nx & PGC_count_mask) != 2);
1273 y = cmpxchg(&page->count_info, x, nx);
1274 } while (unlikely(y != x));
1275 BUG_ON(page_get_owner(page) != NULL);
1279 static void
1280 domain_put_page(struct domain* d, unsigned long mpaddr,
1281 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1283 unsigned long mfn = pte_pfn(old_pte);
1284 struct page_info* page = mfn_to_page(mfn);
1286 if (pte_pgc_allocated(old_pte)) {
1287 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1288 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1289 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1290 } else {
1291 BUG();
1294 if (likely(clear_PGC_allocate)) {
1295 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1296 BUG();
1297 /* put_page() is done by domain_page_flush_and_put() */
1298 } else {
1299 // In this case, the page reference count mustn't be touched.
1300 // domain_page_flush_and_put() decrements it, so we increment
1301 // it in advance. This is the slow path.
1302 //
1303 // guest_remove_page(): owner = d, count_info = 1
1304 // memory_exchange(): owner = NULL, count_info = 1
1305 // XENMEM_add_to_physmap: owner = d, count_info >= 1
1306 adjust_page_count_info(page);
1309 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1312 // The caller must get_page(mfn_to_page(mfn)) before calling this.
1313 // The caller must call set_gpfn_from_mfn() beforehand if necessary;
1314 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1315 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1316 // flags: ASSIGN_xxx (a caller sketch follows this function)
1317 static void
1318 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1319 unsigned long mfn, unsigned long flags)
1321 struct mm_struct *mm = &d->arch.mm;
1322 volatile pte_t* pte;
1323 pte_t old_pte;
1324 pte_t npte;
1325 unsigned long prot = flags_to_prot(flags);
1327 pte = lookup_alloc_domain_pte(d, mpaddr);
1329 // update pte
1330 npte = pfn_pte(mfn, __pgprot(prot));
1331 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1332 if (pte_mem(old_pte)) {
1333 unsigned long old_mfn = pte_pfn(old_pte);
1335 // The mfn == old_mfn case can happen when a domain maps a granted page
1336 // twice at the same pseudo physical address.
1337 // It's nonsense, but allowed.
1338 // __gnttab_map_grant_ref()
1339 // => create_host_mapping()
1340 // => assign_domain_page_replace()
1341 if (mfn != old_mfn) {
1342 domain_put_page(d, mpaddr, pte, old_pte, 1);
1345 perfc_incr(assign_domain_page_replace);
1346 }
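/*
 * Sketch of the caller contract stated above (hypothetical
 * example_physmap_add(); the real callers are the guest-physmap and
 * grant-mapping paths later in this file): take a page reference,
 * publish the m2p entry, barrier, then let the xchg inside
 * assign_domain_page_replace() install the p2m entry.
 */
#if 0
static int example_physmap_add(struct domain *d, unsigned long gpfn,
                               unsigned long mfn, unsigned long flags)
{
    if (!mfn_valid(mfn) || get_page(mfn_to_page(mfn), d) == 0)
        return -EINVAL;
    set_gpfn_from_mfn(mfn, gpfn);   /* m2p entry first ...               */
    smp_mb();                       /* ... visible before the pte xchg   */
    assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
    return 0;
}
#endif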
1348 // The caller must get_page(new_page) beforehand.
1349 // Only steal_page() calls this function.
1350 static int
1351 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1352 struct page_info* old_page,
1353 struct page_info* new_page,
1354 unsigned long flags, int clear_PGC_allocate)
1356 struct mm_struct *mm = &d->arch.mm;
1357 volatile pte_t* pte;
1358 unsigned long old_mfn;
1359 unsigned long old_prot;
1360 pte_t old_pte;
1361 unsigned long new_mfn;
1362 unsigned long new_prot;
1363 pte_t new_pte;
1364 pte_t ret_pte;
1366 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1367 pte = lookup_alloc_domain_pte(d, mpaddr);
1369 again:
1370 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1371 old_mfn = page_to_mfn(old_page);
1372 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1373 if (!pte_present(old_pte)) {
1374 gdprintk(XENLOG_INFO,
1375 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1376 __func__, pte_val(old_pte), old_prot, old_mfn);
1377 return -EINVAL;
1380 new_prot = flags_to_prot(flags);
1381 new_mfn = page_to_mfn(new_page);
1382 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1384 // update pte
1385 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1386 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1387 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1388 goto again;
1391 gdprintk(XENLOG_INFO,
1392 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1393 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1394 __func__,
1395 pte_val(old_pte), old_prot, old_mfn,
1396 pte_val(ret_pte), pte_pfn(ret_pte));
1397 return -EINVAL;
1400 BUG_ON(!pte_mem(old_pte));
1401 BUG_ON(!pte_pgc_allocated(old_pte));
1402 BUG_ON(page_get_owner(old_page) != d);
1403 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1404 BUG_ON(old_mfn == new_mfn);
1406 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1407 if (likely(clear_PGC_allocate)) {
1408 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1409 BUG();
1410 } else {
1411 int ret;
1412 // adjust count_info for domain_page_flush_and_put().
1413 // This is the slow path.
1414 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1415 BUG_ON(d == NULL);
1416 ret = get_page(old_page, d);
1417 BUG_ON(ret == 0);
1420 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1421 perfc_incr(assign_domain_pge_cmpxchg_rel);
1422 return 0;
1425 static void
1426 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1427 int clear_PGC_allocate, unsigned long mfn)
1429 struct mm_struct *mm = &d->arch.mm;
1430 volatile pte_t *pte;
1431 pte_t old_pte;
1432 struct page_info *page;
1434 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1435 if (pte == NULL)
1436 return;
1437 if (pte_none(*pte))
1438 return;
1440 if (mfn == INVALID_MFN) {
1441 // clear pte
1442 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1443 if(!pte_mem(old_pte))
1444 return;
1445 mfn = pte_pfn(old_pte);
1446 } else {
1447 unsigned long old_arflags;
1448 pte_t new_pte;
1449 pte_t ret_pte;
1451 again:
1452 // memory_exchange() calls guest_physmap_remove_page() with
1453 // a stolen page, i.e. page owner = NULL.
1454 BUG_ON(mfn_valid(mfn) &&
1455 page_get_owner(mfn_to_page(mfn)) != d &&
1456 page_get_owner(mfn_to_page(mfn)) != NULL);
1457 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1458 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1459 new_pte = __pte(0);
1461 // update pte
1462 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1463 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1464 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1465 goto again;
1468 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1469 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1470 __func__,
1471 pte_val(old_pte), old_arflags, mfn,
1472 pte_val(ret_pte), pte_pfn(ret_pte));
1473 return;
1475 BUG_ON(mfn != pte_pfn(ret_pte));
1478 perfc_incr(zap_domain_page_one);
1479 if(!mfn_valid(mfn))
1480 return;
1482 if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ){
1483 int i, j;
1484 j = 1 << (PAGE_SHIFT-PAGE_SHIFT_4K);
1485 for(i = 0 ; i < j; i++)
1486 iommu_unmap_page(d, (mpaddr>>PAGE_SHIFT)*j + i);
1489 page = mfn_to_page(mfn);
1490 BUG_ON((page->count_info & PGC_count_mask) == 0);
1492 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1493 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1496 int
1497 deassign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1498 unsigned long phys_addr, unsigned long size )
1500 unsigned long addr = mpaddr & PAGE_MASK;
1501 unsigned long end = PAGE_ALIGN(mpaddr + size);
1503 if (size == 0) {
1504 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1505 __func__, d, mpaddr, size);
1507 if (!efi_mmio(phys_addr, size)) {
1508 #ifndef NDEBUG
1509 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1510 __func__, d, mpaddr, size);
1511 #endif
1512 return -EINVAL;
1515 for (; addr < end; addr += PAGE_SIZE )
1516 zap_domain_page_one(d, addr, 0, INVALID_MFN);
1517 return 0;
1520 unsigned long
1521 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1522 unsigned int extent_order)
1524 if (extent_order != 0) {
1525 //XXX
1526 return -ENOSYS;
1529 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1530 perfc_incr(dom0vp_zap_physmap);
1531 return 0;
1534 static unsigned long
1535 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1536 unsigned long mfn_or_gmfn,
1537 unsigned long flags, domid_t domid, int is_gmfn)
1539 int error = -EINVAL;
1540 struct domain* rd;
1541 unsigned long mfn;
1543 /* Not allowed by a domain. */
1544 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1545 return -EINVAL;
1547 rd = rcu_lock_domain_by_id(domid);
1548 if (unlikely(rd == NULL)) {
1549 switch (domid) {
1550 case DOMID_XEN:
1551 rd = dom_xen;
1552 break;
1553 case DOMID_IO:
1554 rd = dom_io;
1555 break;
1556 default:
1557 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1558 "gpfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1559 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1560 return -ESRCH;
1562 BUG_ON(rd == NULL);
1563 rcu_lock_domain(rd);
1566 if (unlikely(rd == d))
1567 goto out1;
1568 /*
1569 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1570 * It can be considered that their p2m conversion is p==m.
1571 */
1572 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1573 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1574 else
1575 mfn = mfn_or_gmfn;
1576 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1577 goto out1;
1579 error = 0;
1580 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1581 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1582 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1583 // don't update the m2p table because this page belongs to rd, not d.
1584 perfc_incr(dom0vp_add_physmap);
1585 out1:
1586 rcu_unlock_domain(rd);
1587 return error;
1590 unsigned long
1591 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1592 unsigned long flags, domid_t domid)
1594 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1597 unsigned long
1598 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1599 unsigned long gmfn, unsigned long flags,
1600 domid_t domid)
1602 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1605 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1606 #define P2M_PFN_ROUNDUP(x) (((x) + PTRS_PER_PTE - 1) & \
1607 ~(PTRS_PER_PTE - 1))
1608 #define P2M_PFN_ROUNDDOWN(x) ((x) & ~(PTRS_PER_PTE - 1))
1609 #define P2M_NUM_PFN(x) (((x) + PTRS_PER_PTE - 1) / PTRS_PER_PTE)
1610 #define MD_END(md) ((md)->phys_addr + \
1611 ((md)->num_pages << EFI_PAGE_SHIFT))
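/* Worked example, assuming 16KB pages and 8-byte ptes (PTRS_PER_PTE == 2048):
 * P2M_PFN_ROUNDDOWN(3000) == 2048, P2M_PFN_ROUNDUP(3000) == 4096 and
 * P2M_NUM_PFN(3000) == 2, i.e. two pte pages cover 3000 guest pfns. */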
1612 static struct page_info* p2m_pte_zero_page = NULL;
1614 /* This must be called before dom0 p2m table allocation */
1615 void __init
1616 expose_p2m_init(void)
1618 pte_t* pte;
1620 /*
1621 * Initialise our DOMID_P2M domain.
1622 * This domain owns m2p table pages.
1623 */
1624 dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
1625 BUG_ON(dom_p2m == NULL);
1626 dom_p2m->max_pages = ~0U;
1628 pte = pte_alloc_one_kernel(NULL, 0);
1629 BUG_ON(pte == NULL);
1630 smp_mb();// make contents of the page visible.
1631 p2m_pte_zero_page = virt_to_page(pte);
1634 // allocate pgd, pmd of dest_dom if necessary
1635 static int
1636 allocate_pgd_pmd(struct domain* dest_dom, unsigned long dest_gpfn,
1637 struct domain* src_dom,
1638 unsigned long src_gpfn, unsigned long num_src_gpfn)
1640 unsigned long i = 0;
1642 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1643 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1645 while (i < num_src_gpfn) {
1646 volatile pte_t* src_pte;
1647 volatile pte_t* dest_pte;
1649 src_pte = lookup_noalloc_domain_pte(src_dom,
1650 (src_gpfn + i) << PAGE_SHIFT);
1651 if (src_pte == NULL) {
1652 i++;
1653 continue;
1656 dest_pte = lookup_alloc_domain_pte(dest_dom,
1657 (dest_gpfn << PAGE_SHIFT) +
1658 i * sizeof(pte_t));
1659 if (dest_pte == NULL) {
1660 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n",
1661 __func__);
1662 return -ENOMEM;
1665 // skip to next pte page
1666 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1668 return 0;
1671 static int
1672 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1674 int ret = get_page(page, dom_p2m);
1675 BUG_ON(ret != 1);
1676 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1677 ASSIGN_readonly);
1680 // expose pte page
1681 static int
1682 expose_p2m_range(struct domain* dest_dom, unsigned long dest_gpfn,
1683 struct domain* src_dom,
1684 unsigned long src_gpfn, unsigned long num_src_gpfn)
1686 unsigned long i = 0;
1688 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1689 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1691 while (i < num_src_gpfn) {
1692 volatile pte_t* pte;
1694 pte = lookup_noalloc_domain_pte(src_dom, (src_gpfn + i) << PAGE_SHIFT);
1695 if (pte == NULL) {
1696 i++;
1697 continue;
1700 if (expose_p2m_page(dest_dom,
1701 (dest_gpfn << PAGE_SHIFT) + i * sizeof(pte_t),
1702 virt_to_page(pte)) < 0) {
1703 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1704 return -EAGAIN;
1707 // skip to next pte page
1708 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1710 return 0;
1713 // expose p2m_pte_zero_page
1714 static int
1715 expose_zero_page(struct domain* dest_dom, unsigned long dest_gpfn,
1716 unsigned long num_src_gpfn)
1718 unsigned long i;
1720 for (i = 0; i < P2M_NUM_PFN(num_src_gpfn); i++) {
1721 volatile pte_t* pte;
1722 pte = lookup_noalloc_domain_pte(dest_dom,
1723 (dest_gpfn + i) << PAGE_SHIFT);
1724 if (pte == NULL || pte_present(*pte))
1725 continue;
1727 if (expose_p2m_page(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1728 p2m_pte_zero_page) < 0) {
1729 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n",
1730 __func__);
1731 return -EAGAIN;
1734 return 0;
1737 static int
1738 expose_p2m(struct domain* dest_dom, unsigned long dest_gpfn,
1739 struct domain* src_dom,
1740 unsigned long src_gpfn, unsigned long num_src_gpfn)
1742 if (allocate_pgd_pmd(dest_dom, dest_gpfn,
1743 src_dom, src_gpfn, num_src_gpfn))
1744 return -ENOMEM;
1746 if (expose_p2m_range(dest_dom, dest_gpfn,
1747 src_dom, src_gpfn, num_src_gpfn))
1748 return -EAGAIN;
1750 if (expose_zero_page(dest_dom, dest_gpfn, num_src_gpfn))
1751 return -EAGAIN;
1753 return 0;
1756 static void
1757 unexpose_p2m(struct domain* dest_dom,
1758 unsigned long dest_gpfn, unsigned long num_dest_gpfn)
1760 unsigned long i;
1762 for (i = 0; i < num_dest_gpfn; i++) {
1763 zap_domain_page_one(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1764 0, INVALID_MFN);
1768 // It is possible to optimize the loop, but this isn't performance critical.
1769 unsigned long
1770 dom0vp_expose_p2m(struct domain* d,
1771 unsigned long conv_start_gpfn,
1772 unsigned long assign_start_gpfn,
1773 unsigned long expose_size, unsigned long granule_pfn)
1775 unsigned long ret;
1776 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1778 if ((expose_size % PAGE_SIZE) != 0 ||
1779 (granule_pfn % PTRS_PER_PTE) != 0 ||
1780 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1781 (conv_start_gpfn % granule_pfn) != 0 ||
1782 (assign_start_gpfn % granule_pfn) != 0 ||
1783 (expose_num_pfn % granule_pfn) != 0) {
1784 gdprintk(XENLOG_INFO,
1785 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1786 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1787 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1788 return -EINVAL;
1791 if (granule_pfn != PTRS_PER_PTE) {
1792 gdprintk(XENLOG_INFO,
1793 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1794 __func__, granule_pfn, PTRS_PER_PTE);
1795 return -ENOSYS;
1797 ret = expose_p2m(d, assign_start_gpfn,
1798 d, conv_start_gpfn, expose_num_pfn);
1799 return ret;
1802 static int
1803 memmap_info_copy_from_guest(struct xen_ia64_memmap_info* memmap_info,
1804 char** memmap_p,
1805 XEN_GUEST_HANDLE(char) buffer)
1807 char *memmap;
1808 char *p;
1809 char *memmap_end;
1810 efi_memory_desc_t *md;
1811 unsigned long start;
1812 unsigned long end;
1813 efi_memory_desc_t *prev_md;
1815 if (copy_from_guest((char*)memmap_info, buffer, sizeof(*memmap_info)))
1816 return -EFAULT;
1817 if (memmap_info->efi_memdesc_size < sizeof(efi_memory_desc_t) ||
1818 memmap_info->efi_memmap_size < memmap_info->efi_memdesc_size ||
1819 (memmap_info->efi_memmap_size % memmap_info->efi_memdesc_size) != 0)
1820 return -EINVAL;
1822 memmap = _xmalloc(memmap_info->efi_memmap_size,
1823 __alignof__(efi_memory_desc_t));
1824 if (memmap == NULL)
1825 return -ENOMEM;
1826 if (copy_from_guest_offset(memmap, buffer, sizeof(*memmap_info),
1827 memmap_info->efi_memmap_size)) {
1828 xfree(memmap);
1829 return -EFAULT;
1832 /* integrity check & simplify */
1833 sort(memmap, memmap_info->efi_memmap_size / memmap_info->efi_memdesc_size,
1834 memmap_info->efi_memdesc_size, efi_mdt_cmp, NULL);
1836 /* alignment & overlap check */
1837 prev_md = NULL;
1838 p = memmap;
1839 memmap_end = memmap + memmap_info->efi_memmap_size;
1840 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1841 md = (efi_memory_desc_t*)p;
1842 start = md->phys_addr;
1844 if (start & ((1UL << EFI_PAGE_SHIFT) - 1) || md->num_pages == 0) {
1845 xfree(memmap);
1846 return -EINVAL;
1849 if (prev_md != NULL) {
1850 unsigned long prev_end = MD_END(prev_md);
1851 if (prev_end > start) {
1852 xfree(memmap);
1853 return -EINVAL;
1857 prev_md = (efi_memory_desc_t *)p;
1860 /* coalesce */
1861 prev_md = NULL;
1862 p = memmap;
1863 while (p < memmap_end) {
1864 md = (efi_memory_desc_t*)p;
1865 start = md->phys_addr;
1866 end = MD_END(md);
1868 start = P2M_PFN_ROUNDDOWN(start >> PAGE_SHIFT) << PAGE_SHIFT;
1869 end = P2M_PFN_ROUNDUP(end >> PAGE_SHIFT) << PAGE_SHIFT;
1870 md->phys_addr = start;
1871 md->num_pages = (end - start) >> EFI_PAGE_SHIFT;
1873 if (prev_md != NULL) {
1874 unsigned long prev_end = MD_END(prev_md);
1875 if (prev_end >= start) {
1876 size_t left;
1877 end = max(prev_end, end);
1878 prev_md->num_pages = (end - prev_md->phys_addr) >> EFI_PAGE_SHIFT;
1880 left = memmap_end - p;
1881 if (left > memmap_info->efi_memdesc_size) {
1882 left -= memmap_info->efi_memdesc_size;
1883 memmove(p, p + memmap_info->efi_memdesc_size, left);
1886 memmap_info->efi_memmap_size -= memmap_info->efi_memdesc_size;
1887 memmap_end -= memmap_info->efi_memdesc_size;
1888 continue;
1892 prev_md = md;
1893 p += memmap_info->efi_memdesc_size;
1896 if (copy_to_guest(buffer, (char*)memmap_info, sizeof(*memmap_info)) ||
1897 copy_to_guest_offset(buffer, sizeof(*memmap_info),
1898 (char*)memmap, memmap_info->efi_memmap_size)) {
1899 xfree(memmap);
1900 return -EFAULT;
1903 *memmap_p = memmap;
1904 return 0;
1907 static int
1908 foreign_p2m_allocate_pte(struct domain* d,
1909 const struct xen_ia64_memmap_info* memmap_info,
1910 const void* memmap)
1912 const void* memmap_end = memmap + memmap_info->efi_memmap_size;
1913 const void* p;
1915 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1916 const efi_memory_desc_t* md = p;
1917 unsigned long start = md->phys_addr;
1918 unsigned long end = MD_END(md);
1919 unsigned long gpaddr;
1921 for (gpaddr = start; gpaddr < end; gpaddr += PAGE_SIZE) {
1922 if (lookup_alloc_domain_pte(d, gpaddr) == NULL) {
1923 return -ENOMEM;
1928 return 0;
1931 struct foreign_p2m_region {
1932 unsigned long gpfn;
1933 unsigned long num_gpfn;
1934 };
1936 struct foreign_p2m_entry {
1937 struct list_head list;
1938 int busy;
1940 /* src domain */
1941 struct domain* src_dom;
1943 /* region into which foreign p2m table is mapped */
1944 unsigned long gpfn;
1945 unsigned long num_gpfn;
1946 unsigned int num_region;
1947 struct foreign_p2m_region region[0];
1948 };
1950 /* caller must increment the reference count of src_dom */
1951 static int
1952 foreign_p2m_alloc(struct foreign_p2m* foreign_p2m,
1953 unsigned long dest_gpfn, struct domain* src_dom,
1954 struct xen_ia64_memmap_info* memmap_info, void* memmap,
1955 struct foreign_p2m_entry** entryp)
1957 void* memmap_end = memmap + memmap_info->efi_memmap_size;
1958 efi_memory_desc_t* md;
1959 unsigned long dest_gpfn_end;
1960 unsigned long src_gpfn;
1961 unsigned long src_gpfn_end;
1963 unsigned int num_region;
1964 struct foreign_p2m_entry* entry;
1965 struct foreign_p2m_entry* prev;
1966 struct foreign_p2m_entry* pos;
1968 num_region = (memmap_end - memmap) / memmap_info->efi_memdesc_size;
1970 md = memmap;
1971 src_gpfn = P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
1973 md = memmap + (num_region - 1) * memmap_info->efi_memdesc_size;
1974 src_gpfn_end = MD_END(md) >> PAGE_SHIFT;
1975 if (src_gpfn_end >
1976 P2M_PFN_ROUNDUP(src_dom->arch.convmem_end >> PAGE_SHIFT))
1977 return -EINVAL;
1979 src_gpfn_end = P2M_PFN_ROUNDUP(src_gpfn_end);
1980 dest_gpfn_end = dest_gpfn + P2M_NUM_PFN(src_gpfn_end - src_gpfn);
1981 entry = _xmalloc(sizeof(*entry) + num_region * sizeof(entry->region[0]),
1982 __alignof__(*entry));
1983 if (entry == NULL)
1984 return -ENOMEM;
1986 entry->busy = 1;
1987 entry->gpfn = dest_gpfn;
1988 entry->num_gpfn = dest_gpfn_end - dest_gpfn;
1989 entry->src_dom = src_dom;
1990 entry->num_region = 0;
1991 memset(entry->region, 0, sizeof(entry->region[0]) * num_region);
1992 prev = NULL;
1994 spin_lock(&foreign_p2m->lock);
1995 if (list_empty(&foreign_p2m->head))
1996 prev = (struct foreign_p2m_entry*)&foreign_p2m->head;
1998 list_for_each_entry(pos, &foreign_p2m->head, list) {
1999 if (pos->gpfn + pos->num_gpfn < dest_gpfn) {
2000 prev = pos;
2001 continue;
2004 if (dest_gpfn_end < pos->gpfn) {
2005 if (prev != NULL && prev->gpfn + prev->num_gpfn > dest_gpfn)
2006 prev = NULL;/* overlap */
2007 break;
2010 /* overlap */
2011 prev = NULL;
2012 break;
2014 if (prev != NULL) {
2015 list_add(&entry->list, &prev->list);
2016 spin_unlock(&foreign_p2m->lock);
2017 *entryp = entry;
2018 return 0;
2020 spin_unlock(&foreign_p2m->lock);
2021 xfree(entry);
2022 return -EBUSY;
2025 static void
2026 foreign_p2m_unexpose(struct domain* dest_dom, struct foreign_p2m_entry* entry)
2028 unsigned int i;
2030 BUG_ON(!entry->busy);
2031 for (i = 0; i < entry->num_region; i++)
2032 unexpose_p2m(dest_dom,
2033 entry->region[i].gpfn, entry->region[i].num_gpfn);
2036 static void
2037 foreign_p2m_unbusy(struct foreign_p2m* foreign_p2m,
2038 struct foreign_p2m_entry* entry)
2040 spin_lock(&foreign_p2m->lock);
2041 BUG_ON(!entry->busy);
2042 entry->busy = 0;
2043 spin_unlock(&foreign_p2m->lock);
2046 static void
2047 foreign_p2m_free(struct foreign_p2m* foreign_p2m,
2048 struct foreign_p2m_entry* entry)
2050 spin_lock(&foreign_p2m->lock);
2051 BUG_ON(!entry->busy);
2052 list_del(&entry->list);
2053 spin_unlock(&foreign_p2m->lock);
2055 put_domain(entry->src_dom);
2056 xfree(entry);
2059 void
2060 foreign_p2m_init(struct domain* d)
2062 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
2063 INIT_LIST_HEAD(&foreign_p2m->head);
2064 spin_lock_init(&foreign_p2m->lock);
2067 void
2068 foreign_p2m_destroy(struct domain* d)
2070 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
2071 struct foreign_p2m_entry* entry;
2072 struct foreign_p2m_entry* n;
2074 spin_lock(&foreign_p2m->lock);
2075 list_for_each_entry_safe(entry, n, &foreign_p2m->head, list) {
2076 /* mm_teardown() cleared p2m table already */
2077 /* foreign_p2m_unexpose(d, entry);*/
2078 list_del(&entry->list);
2079 put_domain(entry->src_dom);
2080 xfree(entry);
2082 spin_unlock(&foreign_p2m->lock);
2085 unsigned long
2086 dom0vp_expose_foreign_p2m(struct domain* dest_dom,
2087 unsigned long dest_gpfn,
2088 domid_t domid,
2089 XEN_GUEST_HANDLE(char) buffer,
2090 unsigned long flags)
2092 unsigned long ret = 0;
2093 struct domain* src_dom;
2094 struct xen_ia64_memmap_info memmap_info;
2095 char* memmap;
2096 void* memmap_end;
2097 void* p;
2099 struct foreign_p2m_entry* entry;
2101 ret = memmap_info_copy_from_guest(&memmap_info, &memmap, buffer);
2102 if (ret != 0)
2103 return ret;
2105 dest_dom = rcu_lock_domain(dest_dom);
2106 if (dest_dom == NULL) {
2107 ret = -EINVAL;
2108 goto out;
2110 #if 1
2111 // Self foreign domain p2m exposure isn't allowed.
2112 // Otherwise the domain can't be destroyed because
2113 // no one decrements the domain reference count.
2114 if (domid == dest_dom->domain_id) {
2115 ret = -EINVAL;
2116 goto out;
2118 #endif
2120 src_dom = get_domain_by_id(domid);
2121 if (src_dom == NULL) {
2122 ret = -EINVAL;
2123 goto out_unlock;
2126 if (flags & IA64_DOM0VP_EFP_ALLOC_PTE) {
2127 ret = foreign_p2m_allocate_pte(src_dom, &memmap_info, memmap);
2128 if (ret != 0)
2129 goto out_unlock;
2132 ret = foreign_p2m_alloc(&dest_dom->arch.foreign_p2m, dest_gpfn,
2133 src_dom, &memmap_info, memmap, &entry);
2134 if (ret != 0)
2135 goto out_unlock;
2137 memmap_end = memmap + memmap_info.efi_memmap_size;
2138 for (p = memmap; p < memmap_end; p += memmap_info.efi_memdesc_size) {
2139 efi_memory_desc_t* md = p;
2140 unsigned long src_gpfn =
2141 P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
2142 unsigned long src_gpfn_end =
2143 P2M_PFN_ROUNDUP(MD_END(md) >> PAGE_SHIFT);
2144 unsigned long num_src_gpfn = src_gpfn_end - src_gpfn;
2146 ret = expose_p2m(dest_dom, dest_gpfn + src_gpfn / PTRS_PER_PTE,
2147 src_dom, src_gpfn, num_src_gpfn);
2148 if (ret != 0)
2149 break;
2151 entry->region[entry->num_region].gpfn =
2152 dest_gpfn + src_gpfn / PTRS_PER_PTE;
2153 entry->region[entry->num_region].num_gpfn = P2M_NUM_PFN(num_src_gpfn);
2154 entry->num_region++;
2157 if (ret == 0) {
2158 foreign_p2m_unbusy(&dest_dom->arch.foreign_p2m, entry);
2159 } else {
2160 foreign_p2m_unexpose(dest_dom, entry);
2161 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2164 out_unlock:
2165 rcu_unlock_domain(dest_dom);
2166 out:
2167 xfree(memmap);
2168 return ret;
2171 unsigned long
2172 dom0vp_unexpose_foreign_p2m(struct domain* dest_dom,
2173 unsigned long dest_gpfn,
2174 domid_t domid)
2176 int ret = -ENOENT;
2177 struct foreign_p2m* foreign_p2m = &dest_dom->arch.foreign_p2m;
2178 struct foreign_p2m_entry* entry;
2180 dest_dom = rcu_lock_domain(dest_dom);
2181 if (dest_dom == NULL)
2182 return ret;
2183 spin_lock(&foreign_p2m->lock);
2184 list_for_each_entry(entry, &foreign_p2m->head, list) {
2185 if (entry->gpfn < dest_gpfn)
2186 continue;
2187 if (dest_gpfn < entry->gpfn)
2188 break;
2190 if (domid == entry->src_dom->domain_id)
2191 ret = 0;
2192 else
2193 ret = -EINVAL;
2194 break;
2196 if (ret == 0) {
2197 if (entry->busy == 0)
2198 entry->busy = 1;
2199 else
2200 ret = -EBUSY;
2202 spin_unlock(&foreign_p2m->lock);
2204 if (ret == 0) {
2205 foreign_p2m_unexpose(dest_dom, entry);
2206 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2208 rcu_unlock_domain(dest_dom);
2209 return ret;
2212 /* This lock is only for memmap_info. domain_lock() is abused for it here. */
2213 static void
2214 memmap_lock(struct domain *d)
2216 domain_lock(d);
2219 static void
2220 memmap_unlock(struct domain *d)
2222 domain_unlock(d);
2225 /* copy memory range to domain pseudo physical address space */
2226 static int
2227 __memmap_copy_to(struct domain *d, unsigned long dest_gpfn,
2228 void *src, unsigned long num_pages)
2230 BUG_ON(((unsigned long)src & ~PAGE_MASK) != 0);
2232 while (num_pages > 0) {
2233 unsigned long mfn;
2234 struct page_info *page;
2235 void *virt;
2237 mfn = gmfn_to_mfn_foreign(d, dest_gpfn);
2238 if (mfn == 0 || mfn == INVALID_MFN)
2239 return -EFAULT;
2240 page = mfn_to_page(mfn);
2241 if (get_page(page, d) == 0)
2242 return -EFAULT;
2243 virt = mfn_to_virt(mfn);
2244 copy_page(virt, src);
2245 __xencomm_mark_dirty(d, (unsigned long)virt, PAGE_SIZE);
2246 put_page(page);
2248 src += PAGE_SIZE;
2249 dest_gpfn++;
2250 num_pages--;
2253 return 0;
2256 /* copy memory range from domain pseudo physical address space */
2257 static int
2258 __memmap_copy_from(void *dest, struct domain *d, unsigned long src_gpfn,
2259 unsigned long num_pages)
2261 BUG_ON(((unsigned long)dest & ~PAGE_MASK) != 0);
2263 while (num_pages > 0) {
2264 unsigned long mfn;
2265 struct page_info *page;
2267 mfn = gmfn_to_mfn_foreign(d, src_gpfn);
2268 if (mfn == 0 || mfn == INVALID_MFN)
2269 return -EFAULT;
2270 page = mfn_to_page(mfn);
2271 if (get_page(page, d) == 0)
2272 return -EFAULT;
2273 copy_page(dest, mfn_to_virt(mfn));
2274 put_page(page);
2276 dest += PAGE_SIZE;
2277 src_gpfn++;
2278 num_pages--;
2281 return 0;
2284 /* This function unlocks and re-locks memmap_lock.
2285 * The caller must free (*page, *order) even in the error case,
2286 * checking for *page == NULL before freeing.
2287 */
2288 static int
2289 memmap_copy_from(struct domain *d,
2290 struct page_info **page, unsigned long *order)
2292 unsigned long num_pages;
2293 struct xen_ia64_memmap_info *memmap_info;
2294 unsigned long memmap_info_pfn;
2296 num_pages = d->shared_info->arch.memmap_info_num_pages;
2297 memmap_unlock(d);
2299 again:
2300 *order = get_order(num_pages << PAGE_SHIFT);
2301 *page = alloc_domheap_pages(NULL, *order, 0);
2302 if (*page == NULL)
2303 return -ENOMEM;
2304 memmap_info = page_to_virt(*page);
2306 memmap_lock(d);
2307 if (d->shared_info->arch.memmap_info_num_pages != num_pages) {
2308 num_pages = d->shared_info->arch.memmap_info_num_pages;
2309 memmap_unlock(d);
2310 free_domheap_pages(*page, *order);
2311 goto again;
2313 memmap_info_pfn = d->shared_info->arch.memmap_info_pfn;
2315 /* copy into a local buffer to make it virtually contiguous */
2316 return __memmap_copy_from(memmap_info, d, memmap_info_pfn, num_pages);
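/*
 * Illustrative usage sketch (not part of the original file): the calling
 * convention of memmap_copy_from() described in the comment above.  The
 * function name and the debug message are made up for illustration; the
 * unused attribute only keeps the compiler quiet since the sketch is never
 * called.
 */
static int __attribute__((unused))
memmap_copy_from_usage_sketch(struct domain *d)
{
    struct page_info *page = NULL;
    unsigned long order;
    struct xen_ia64_memmap_info *memmap_info;
    int ret;

    memmap_lock(d);
    /* may temporarily drop and retake memmap_lock */
    ret = memmap_copy_from(d, &page, &order);
    if (ret == 0) {
        memmap_info = page_to_virt(page);
        gdprintk(XENLOG_DEBUG, "memmap_info: 0x%lx bytes of memdesc\n",
                 (unsigned long)memmap_info->efi_memmap_size);
    }
    memmap_unlock(d);

    /* free even in the error case, guarding on page == NULL */
    if (page != NULL)
        free_domheap_pages(page, order);
    return ret;
}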
2319 static int
2320 memdesc_can_expand(const struct xen_ia64_memmap_info *memmap_info,
2321 unsigned long num_pages)
2323 /* Is there room for one more md? */
2324 if ((num_pages << PAGE_SHIFT) <
2325 (sizeof(*memmap_info) + memmap_info->efi_memmap_size +
2326 memmap_info->efi_memdesc_size))
2327 return 0;
2329 return 1;
2332 static int
2333 memdesc_can_collapse(const efi_memory_desc_t *lhs,
2334 const efi_memory_desc_t *rhs)
2336 return (lhs->type == rhs->type && lhs->attribute == rhs->attribute);
2339 static int
2340 __dom0vp_add_memdesc_one(struct xen_ia64_memmap_info *memmap_info,
2341 unsigned long num_pages,
2342 const efi_memory_desc_t *md)
2344 void* const memmap_end = (void*)memmap_info->memdesc +
2345 memmap_info->efi_memmap_size;
2346 void *p;
2347 efi_memory_desc_t *tmp_md;
2348 efi_memory_desc_t *s_md;
2349 efi_memory_desc_t *e_md;
2350 u64 phys_addr;
2351 u64 phys_addr_end;
2353 /* fast path. appending to the last entry */
2354 tmp_md = (efi_memory_desc_t*)(memmap_end - memmap_info->efi_memdesc_size);
2355 if (MD_END(tmp_md) < md->phys_addr) {
2356 /* append one */
2357 if (!memdesc_can_expand(memmap_info, num_pages))
2358 return -ENOMEM;
2360 memcpy(memmap_end, md, memmap_info->efi_memdesc_size);
2361 memmap_info->efi_memmap_size += memmap_info->efi_memdesc_size;
2362 return 0;
2364 /* fast path. expand the last entry */
2365 if (tmp_md->phys_addr <= md->phys_addr) {
2366 if (!memdesc_can_collapse(tmp_md, md))
2367 return -EINVAL;
2369 phys_addr_end = max(MD_END(tmp_md), MD_END(md));
2370 tmp_md->num_pages =
2371 (phys_addr_end - tmp_md->phys_addr) >> EFI_PAGE_SHIFT;
2372 return 0;
2375 /* slow path */
2376 s_md = NULL;
2377 e_md = NULL;
2378 for (p = memmap_info->memdesc;
2379 p < memmap_end;
2380 p += memmap_info->efi_memdesc_size) {
2381 tmp_md = p;
2383 if (MD_END(tmp_md) < md->phys_addr)
2384 continue;
2386 if (MD_END(md) < tmp_md->phys_addr) {
2387 if (s_md == NULL) {
2388 void *next_md = p + memmap_info->efi_memdesc_size;
2389 size_t left_size = memmap_end - (void*)tmp_md;
2391 /* found a hole. just insert md here */
2392 if (!memdesc_can_expand(memmap_info, num_pages))
2393 return -ENOMEM;
2395 memmove(next_md, tmp_md, left_size);
2396 memcpy(tmp_md, md, memmap_info->efi_memdesc_size);
2397 memmap_info->efi_memmap_size += memmap_info->efi_memdesc_size;
2398 return 0;
2400 break;
2403 if (s_md == NULL)
2404 s_md = tmp_md;
2405 e_md = tmp_md;
2407 if (!memdesc_can_collapse(tmp_md, md))
2408 return -EINVAL;
2410 BUG_ON(s_md == NULL || e_md == NULL);
2412 /* collapse into one */
2413 phys_addr = min(md->phys_addr, s_md->phys_addr);
2414 phys_addr_end = max(MD_END(md), MD_END(e_md));
2415 s_md->phys_addr = phys_addr;
2416 s_md->num_pages = (phys_addr_end - phys_addr) >> EFI_PAGE_SHIFT;
2417 if (s_md != e_md) {
2418 void *next_s_md = (void*)s_md + memmap_info->efi_memdesc_size;
2419 void *next_e_md = (void*)e_md + memmap_info->efi_memdesc_size;
2420 size_t left_size = memmap_end - (void*)next_e_md;
2422 memmap_info->efi_memmap_size -= (void*)e_md - (void*)s_md;
2423 if (left_size > 0)
2424 memmove(next_s_md, next_e_md, left_size);
2427 return 0;
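/*
 * Worked example (illustrative, not part of the original file), assuming
 * efi_memdesc_size == sizeof(efi_memory_desc_t) and 4KB EFI pages:
 * if the current map ends with an entry covering [0x100000, 0x200000)
 * (num_pages = 0x100), then adding md = { .phys_addr = 0x200000,
 * .num_pages = 0x100 } with the same type/attribute takes the
 * "expand the last entry" fast path and the last entry becomes
 * [0x100000, 0x300000), i.e. num_pages = 0x200.  Adding
 * md = { .phys_addr = 0x400000, .num_pages = 0x10 } instead takes the
 * "append one" fast path, provided memdesc_can_expand() says there is room.
 */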
2430 /*
2431 * d->arch.convmem_end is mostly read only and sometimes increased.
2432 * It is protected by memmap_lock.
2434 * d->arch.convmem_end is also referenced by the guest (self p2m exposure).
2435 * d->shared_info.arch.memmap_info_xxx and memmap_info are
2436 * referenced by the tools stack (save/dump-core/foreign p2m exposure).
2438 * reader side:
2439 * - get d->arch.convmem_end (via XENMEM_maximum_gpfn)
2440 * - issue get_memmap hypercall to get memmap
2441 * In VMM
2442 * - lock memmap_lock
2443 * - copy memmap from target guest
2444 * - unlock memmap_lock
2445 * - copy memmap into tools stack address space.
2446 * - check d->shared_info.memmap_info_num_pages. try again if necessary
2447 * - get d->arch.convmem_end. try again if changed.
2449 * writer side:
2450 * - lock memmap_lock
2451 * - increase d->arch.convmem_end at first if necessary
2452 * - unlock memmap_lock
2453 * - allocate memory
2454 * In fact page allocation isn't blocking, so unlock/lock isn't necessary.
2455 * - lock memmap_lock
2456 * - update memmap_info
2457 * - unlock memmap_lock
2458 */
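/*
 * Illustrative reader-side sketch (not part of the original file).  The
 * helpers xc_get_max_gpfn(), xc_get_memmap_num_pages(), xc_get_memmap(),
 * allocate_buffer() and free_buffer() are hypothetical stand-ins for
 * whatever the tools stack actually uses; the point is only the retry
 * protocol described above.
 */
#if 0
    for (;;) {
        max_gpfn = xc_get_max_gpfn(domid);           /* d->arch.convmem_end */
        num_pages = xc_get_memmap_num_pages(domid);  /* shared_info value */
        buf = allocate_buffer(num_pages << PAGE_SHIFT);
        xc_get_memmap(domid, buf);                   /* copied under memmap_lock */
        if (xc_get_memmap_num_pages(domid) == num_pages &&
            xc_get_max_gpfn(domid) == max_gpfn)
            break;                                   /* consistent snapshot */
        free_buffer(buf);                            /* changed meanwhile; retry */
    }
#endif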
2459 static int
2460 __dom0vp_add_memdesc(struct domain *targ_d,
2461 const struct xen_ia64_memmap_info *u_memmap_info,
2462 const char *u_memmap)
2464 int ret = 0;
2465 const void* const u_memmap_end = u_memmap + u_memmap_info->efi_memmap_size;
2466 const efi_memory_desc_t *md;
2468 unsigned long md_end_max;
2469 unsigned long num_pages;
2470 unsigned long order;
2471 unsigned long memmap_info_pfn;
2473 struct page_info *page = NULL;
2474 struct xen_ia64_memmap_info *memmap_info;
2475 size_t unused_size;
2477 const void *p;
2479 /* update d->arch.convmem_end */
2480 md_end_max = 0;
2481 for (p = u_memmap; p < u_memmap_end;
2482 p += u_memmap_info->efi_memdesc_size) {
2483 md = p;
2484 if (MD_END(md) > md_end_max)
2485 md_end_max = MD_END(md);
2487 memmap_lock(targ_d);
2488 /* convmem_end is also protected by the memmap lock */
2489 if (md_end_max > targ_d->arch.convmem_end)
2490 targ_d->arch.convmem_end = md_end_max;
2492 /* memmap_copy_from() unlocks and re-locks memmap_lock */
2493 ret = memmap_copy_from(targ_d, &page, &order);
2494 if (ret != 0)
2495 goto out;
2496 memmap_info = page_to_virt(page);
2497 num_pages = targ_d->shared_info->arch.memmap_info_num_pages;
2498 memmap_info_pfn = targ_d->shared_info->arch.memmap_info_pfn;
2500 if (memmap_info->efi_memdesc_size != u_memmap_info->efi_memdesc_size ||
2501 memmap_info->efi_memdesc_version !=
2502 u_memmap_info->efi_memdesc_version) {
2503 ret = -EINVAL;
2504 goto out;
2507 /* update memdesc */
2508 for (p = u_memmap;
2509 p < u_memmap_end;
2510 p += u_memmap_info->efi_memdesc_size) {
2511 md = p;
2512 ret = __dom0vp_add_memdesc_one(memmap_info, num_pages, md);
2513 if (ret != 0)
2514 goto out;
2517 /* zero out the unused region to avoid leaking hypervisor data */
2518 unused_size = (num_pages << PAGE_SHIFT) -
2519 (sizeof(*memmap_info) + memmap_info->efi_memmap_size);
2520 if (unused_size > 0)
2521 memset((void*)memmap_info->memdesc + memmap_info->efi_memmap_size,
2522 0, unused_size);
2524 /* copy back into domain. */
2525 ret = __memmap_copy_to(targ_d, memmap_info_pfn, memmap_info, num_pages);
2527 out:
2528 memmap_unlock(targ_d);
2530 if (page != NULL)
2531 free_domheap_pages(page, order);
2532 return ret;
2535 unsigned long
2536 dom0vp_get_memmap(domid_t domid, XEN_GUEST_HANDLE(char) buffer)
2538 unsigned long ret = 0;
2539 struct domain *targ_d;
2541 struct page_info *page = NULL;
2542 unsigned long order;
2544 struct xen_ia64_memmap_info *memmap_info;
2545 unsigned long num_pages;
2547 ret = rcu_lock_target_domain_by_id(domid, &targ_d);
2548 if (ret != 0)
2549 return ret;
2551 memmap_lock(targ_d);
2553 ret = memmap_copy_from(targ_d, &page, &order);
2554 if (ret != 0)
2555 goto unlock_out;
2557 memmap_info = page_to_virt(page);
2558 num_pages = targ_d->shared_info->arch.memmap_info_num_pages;
2559 if ((num_pages << PAGE_SHIFT) - sizeof(*memmap_info) <
2560 memmap_info->efi_memmap_size) {
2561 ret = -EFAULT;
2562 goto unlock_out;
2564 memmap_unlock(targ_d);
2565 rcu_unlock_domain(targ_d);
2567 if (copy_to_guest(buffer, (char*)memmap_info, sizeof(*memmap_info)) ||
2568 copy_to_guest_offset(buffer, sizeof(*memmap_info),
2569 (char*)memmap_info->memdesc,
2570 memmap_info->efi_memmap_size))
2571 ret = -EFAULT;
2573 out:
2574 if (page != NULL)
2575 free_domheap_pages(page, order);
2576 return ret;
2578 unlock_out:
2579 memmap_unlock(targ_d);
2580 rcu_unlock_domain(targ_d);
2581 goto out;
2583 #endif
2585 // grant table host mapping
2586 // mpaddr: host_addr: pseudo physical address
2587 // mfn: frame: machine page frame
2588 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
2589 int
2590 create_grant_host_mapping(unsigned long gpaddr, unsigned long mfn,
2591 unsigned int flags, unsigned int cache_flags)
2593 struct domain* d = current->domain;
2594 struct page_info* page;
2595 int ret;
2597 if ((flags & (GNTMAP_device_map |
2598 GNTMAP_application_map | GNTMAP_contains_pte)) ||
2599 (cache_flags)) {
2600 gdprintk(XENLOG_INFO, "%s: flags 0x%x cache_flags 0x%x\n",
2601 __func__, flags, cache_flags);
2602 return GNTST_general_error;
2605 BUG_ON(!mfn_valid(mfn));
2606 page = mfn_to_page(mfn);
2607 ret = get_page(page, page_get_owner(page));
2608 BUG_ON(ret == 0);
2609 assign_domain_page_replace(d, gpaddr, mfn,
2610 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2611 ASSIGN_tlb_track |
2612 #endif
2613 ((flags & GNTMAP_readonly) ?
2614 ASSIGN_readonly : ASSIGN_writable));
2615 perfc_incr(create_grant_host_mapping);
2616 return GNTST_okay;
2619 // grant table host unmapping
2620 int
2621 replace_grant_host_mapping(unsigned long gpaddr,
2622 unsigned long mfn, unsigned long new_gpaddr, unsigned int flags)
2624 struct domain* d = current->domain;
2625 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
2626 volatile pte_t* pte;
2627 unsigned long cur_arflags;
2628 pte_t cur_pte;
2629 pte_t new_pte = __pte(0);
2630 pte_t old_pte;
2631 struct page_info* page = mfn_to_page(mfn);
2632 struct page_info* new_page = NULL;
2633 volatile pte_t* new_page_pte = NULL;
2634 unsigned long new_page_mfn = INVALID_MFN;
2636 if (new_gpaddr) {
2637 new_page_pte = lookup_noalloc_domain_pte_none(d, new_gpaddr);
2638 if (likely(new_page_pte != NULL)) {
2639 new_pte = ptep_get_and_clear(&d->arch.mm,
2640 new_gpaddr, new_page_pte);
2641 if (likely(pte_present(new_pte))) {
2642 struct domain* page_owner;
2644 new_page_mfn = pte_pfn(new_pte);
2645 new_page = mfn_to_page(new_page_mfn);
2646 page_owner = page_get_owner(new_page);
2647 if (unlikely(page_owner == NULL)) {
2648 gdprintk(XENLOG_INFO,
2649 "%s: page_owner == NULL "
2650 "gpaddr 0x%lx mfn 0x%lx "
2651 "new_gpaddr 0x%lx mfn 0x%lx\n",
2652 __func__, gpaddr, mfn, new_gpaddr, new_page_mfn);
2653 new_page = NULL; /* prevent domain_put_page() */
2654 return GNTST_general_error;
2657 /*
2658 * domain_put_page(clear_PGC_allocated = 0)
2659 * doesn't decrement the refcount of a page with
2660 * pte_pgc_allocated() = 1. Be careful.
2661 */
2662 if (unlikely(!pte_pgc_allocated(new_pte))) {
2663 /* domain_put_page() decrements page refcount. adjust it. */
2664 if (get_page(new_page, page_owner) == 0) {
2665 gdprintk(XENLOG_INFO,
2666 "%s: get_page() failed. "
2667 "gpaddr 0x%lx mfn 0x%lx "
2668 "new_gpaddr 0x%lx mfn 0x%lx\n",
2669 __func__, gpaddr, mfn,
2670 new_gpaddr, new_page_mfn);
2671 return GNTST_general_error;
2674 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 0);
2675 } else
2676 new_pte = __pte(0);
2680 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
2681 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
2682 return GNTST_general_error;
2685 pte = lookup_noalloc_domain_pte(d, gpaddr);
2686 if (pte == NULL) {
2687 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
2688 __func__, gpaddr, mfn);
2689 return GNTST_general_error;
2692 again:
2693 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
2694 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
2695 if (!pte_present(cur_pte) ||
2696 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
2697 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
2698 __func__, gpaddr, mfn, pte_val(cur_pte));
2699 return GNTST_general_error;
2702 if (new_page) {
2703 BUG_ON(new_page_mfn == INVALID_MFN);
2704 set_gpfn_from_mfn(new_page_mfn, gpfn);
2705 /* smp_mb() isn't needed because ptep_cmpxchg_rel() below
2706 has release semantics. */
2708 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
2709 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
2710 if (pte_pfn(old_pte) == mfn) {
2711 goto again;
2713 if (new_page) {
2714 BUG_ON(new_page_mfn == INVALID_MFN);
2715 set_gpfn_from_mfn(new_page_mfn, INVALID_M2P_ENTRY);
2716 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 1);
2718 goto out;
2720 if (unlikely(!pte_present(old_pte)))
2721 goto out;
2722 BUG_ON(pte_pfn(old_pte) != mfn);
2724 /* try_to_clear_PGC_allocate(d, page) is not needed. */
2725 BUG_ON(page_get_owner(page) == d &&
2726 get_gpfn_from_mfn(mfn) == gpfn);
2727 BUG_ON(pte_pgc_allocated(old_pte));
2728 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
2730 perfc_incr(replace_grant_host_mapping);
2731 return GNTST_okay;
2733 out:
2734 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
2735 "0x%lx old_pte 0x%lx\n",
2736 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
2737 return GNTST_general_error;
2740 // This code heavily depends on the struct page_info layout.
2741 // gnttab_transfer() calls steal_page() with memflags = 0.
2742 //   For grant table transfer, we must backfill the stolen gpfn with a page.
2743 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount.
2744 //   For memory exchange, we don't have to backfill because
2745 //   memory_exchange() does it itself.
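// An illustrative caller sketch (not part of the original file); the labels
// are hypothetical and error handling is elided.
#if 0
    /* gnttab_transfer() style: memflags == 0.  steal_page() backfills the
     * stolen gpfn with a fresh zeroed page, so the guest keeps a usable page
     * at that pseudo physical address. */
    if (steal_page(d, page, 0) != 0)
        goto transfer_failed;

    /* memory_exchange() style: memflags == MEMF_no_refcount.  No backfill;
     * memory_exchange() repopulates the p2m itself. */
    if (steal_page(d, page, MEMF_no_refcount) != 0)
        goto exchange_failed;
#endif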
2746 int
2747 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
2749 #if 0 /* if big endian */
2750 # error "implement big endian version of steal_page()"
2751 #endif
2752 unsigned long x, y;
2754 if (page_get_owner(page) != d) {
2755 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
2756 __func__, d, page_get_owner(page));
2757 return -1;
2760 if (!(memflags & MEMF_no_refcount)) {
2761 unsigned long gpfn;
2762 struct page_info *new;
2763 unsigned long new_mfn;
2764 int ret;
2766 new = alloc_domheap_page(d, 0);
2767 if (new == NULL) {
2768 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
2769 return -1;
2771 // zero out the page for security reasons
2772 clear_page(page_to_virt(new));
2773 // assign_domain_page_cmpxchg_rel() has release semantics
2774 // so smp_mb() isn't needed.
2776 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
2777 if (gpfn == INVALID_M2P_ENTRY) {
2778 free_domheap_page(new);
2779 return -1;
2781 new_mfn = page_to_mfn(new);
2782 set_gpfn_from_mfn(new_mfn, gpfn);
2783 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
2784 // has release semantics.
2786 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
2787 ASSIGN_writable |
2788 ASSIGN_pgc_allocated, 0);
2789 if (ret < 0) {
2790 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
2791 ret);
2792 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
2793 free_domheap_page(new);
2794 return -1;
2796 perfc_incr(steal_page_refcount);
2799 spin_lock(&d->page_alloc_lock);
2800 /* check again */
2801 if (is_xen_heap_page(page) || page_get_owner(page) != d) {
2802 goto fail;
2805 /*
2806 * We require there is just one reference (PGC_allocated). We temporarily
2807 * drop this reference now so that we can safely swizzle the owner.
2808 */
2809 y = page->count_info;
2810 do {
2811 x = y;
2813 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
2814 (1 | PGC_allocated)))) {
2815 struct domain* nd = page_get_owner(page);
2816 if (nd == NULL) {
2817 gdprintk(XENLOG_INFO, "gnttab_transfer: "
2818 "Bad page %p: ed=%p(%u), "
2819 "sd=%p,"
2820 " caf=%016lx, taf=%" PRtype_info
2821 " memflags 0x%x\n",
2822 (void *) page_to_mfn(page),
2823 d, d->domain_id,
2824 nd,
2825 x,
2826 page->u.inuse.type_info,
2827 memflags);
2828 } else {
2829 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
2830 "Bad page %p: ed=%p(%u), "
2831 "sd=%p(%u),"
2832 " caf=%016lx, taf=%" PRtype_info
2833 " memflags 0x%x\n",
2834 (void *) page_to_mfn(page),
2835 d, d->domain_id,
2836 nd, nd->domain_id,
2837 x,
2838 page->u.inuse.type_info,
2839 memflags);
2841 goto fail;
2844 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
2845 } while (unlikely(y != x));
2847 /* Swizzle the owner then reinstate the PGC_allocated reference. */
2848 page_set_owner(page, NULL);
2849 y = page->count_info;
2850 do {
2851 x = y;
2852 BUG_ON((x & (PGC_count_mask | PGC_allocated)) != PGC_allocated);
2853 y = cmpxchg(&page->count_info, x, x | 1);
2854 } while (unlikely(y != x));
2856 /* Unlink from original owner. */
2857 if ( !(memflags & MEMF_no_refcount) )
2858 d->tot_pages--;
2859 list_del(&page->list);
2861 spin_unlock(&d->page_alloc_lock);
2862 perfc_incr(steal_page);
2863 return 0;
2865 fail:
2866 spin_unlock(&d->page_alloc_lock);
2867 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%016lx, taf=%" PRtype_info,
2868 (void *)page_to_mfn(page), d, d->domain_id,
2869 page_get_owner(page), page->count_info, page->u.inuse.type_info);
2870 return -1;
2873 static void
2874 __guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2875 unsigned long mfn)
2877 set_gpfn_from_mfn(mfn, gpfn);
2878 smp_mb();
2879 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
2880 ASSIGN_writable | ASSIGN_pgc_allocated);
2881 if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ){
2882 int i, j;
2883 j = 1 << (PAGE_SHIFT-PAGE_SHIFT_4K);
2884 for(i = 0 ; i < j; i++)
2885 iommu_map_page(d, gpfn*j + i, mfn*j + i);
2889 int
2890 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2891 unsigned long mfn, unsigned int page_order)
2893 unsigned long i;
2895 for (i = 0; i < (1UL << page_order); i++) {
2896 BUG_ON(!mfn_valid(mfn));
2897 BUG_ON((mfn_to_page(mfn)->count_info & ~PGC_xen_heap) !=
2898 (PGC_allocated | 1));
2899 __guest_physmap_add_page(d, gpfn, mfn);
2900 mfn++;
2901 gpfn++;
2904 perfc_incr(guest_physmap_add_page);
2905 return 0;
2908 void
2909 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
2910 unsigned long mfn, unsigned int page_order)
2912 unsigned long i;
2914 BUG_ON(mfn == 0);//XXX
2916 for (i = 0; i < (1UL << page_order); i++)
2917 zap_domain_page_one(d, (gpfn+i) << PAGE_SHIFT, 0, mfn+i);
2919 perfc_incr(guest_physmap_remove_page);
2922 static void
2923 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
2924 volatile pte_t* ptep, pte_t old_pte,
2925 struct page_info* page)
2927 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2928 struct tlb_track_entry* entry;
2929 #endif
2931 if (shadow_mode_enabled(d))
2932 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
2934 #ifndef CONFIG_XEN_IA64_TLB_TRACK
2935 //XXX sledgehammer: this flushes the whole vTLB;
2936 //    a finer-grained range flush would suffice.
2937 domain_flush_vtlb_all(d);
2938 put_page(page);
2939 #else
2940 switch (tlb_track_search_and_remove(d->arch.tlb_track,
2941 ptep, old_pte, &entry)) {
2942 case TLB_TRACK_NOT_TRACKED:
2943 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
2944 /* This page is zapped from this domain
2945 * by memory decrease, memory exchange or dom0vp_zap_physmap.
2946 * I.e. the page is zapped either to return it to xen
2947 * (balloon driver or DMA page allocation) or because a
2948 * foreign-domain mapped page is unmapped from this domain.
2949 * In the former case the page is to be freed, so that
2950 * we can defer freeing the page and batch it.
2951 * In the latter case the page is unmapped, so that
2952 * we need to flush it. But to optimize this, we
2953 * queue the page and flush the vTLB only once;
2954 * i.e. the caller must call dfree_flush() explicitly.
2955 */
2956 domain_flush_vtlb_all(d);
2957 put_page(page);
2958 break;
2959 case TLB_TRACK_NOT_FOUND:
2960 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
2961 /* This page is zapped from this domain
2962 * by grant table page unmap.
2963 * Luckily the domain that mapped this page didn't
2964 * access it, so we don't have to flush the vTLB.
2965 * Probably the domain only did DMA.
2966 */
2967 /* do nothing */
2968 put_page(page);
2969 break;
2970 case TLB_TRACK_FOUND:
2971 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
2972 /* This page is zapped from this domain
2973 * by grant table page unmap.
2974 * Fortunately this page is accessed via only one virtual
2975 * memory address, so it is easy to flush.
2976 */
2977 domain_flush_vtlb_track_entry(d, entry);
2978 tlb_track_free_entry(d->arch.tlb_track, entry);
2979 put_page(page);
2980 break;
2981 case TLB_TRACK_MANY:
2982 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
2983 /* This page is zapped from this domain
2984 * by grant table page unmap.
2985 * Unfortunately this page is accessed via many virtual
2986 * memory addresses (or too many times via a single virtual address),
2987 * so we gave up tracking the virtual addresses.
2988 * A full vTLB flush is necessary.
2989 */
2990 domain_flush_vtlb_all(d);
2991 put_page(page);
2992 break;
2993 case TLB_TRACK_AGAIN:
2994 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
2995 BUG();
2996 break;
2998 #endif
2999 perfc_incr(domain_page_flush_and_put);
3002 int
3003 domain_page_mapped(struct domain* d, unsigned long mpaddr)
3005 volatile pte_t * pte;
3007 pte = lookup_noalloc_domain_pte(d, mpaddr);
3008 if(pte != NULL && !pte_none(*pte))
3009 return 1;
3010 return 0;
3013 /* Flush cache of domain d. */
3014 void domain_cache_flush (struct domain *d, int sync_only)
3016 struct mm_struct *mm = &d->arch.mm;
3017 volatile pgd_t *pgd = mm->pgd;
3018 unsigned long maddr;
3019 int i,j,k, l;
3020 int nbr_page = 0;
3021 void (*flush_func)(unsigned long start, unsigned long end);
3022 extern void flush_dcache_range (unsigned long, unsigned long);
3024 if (sync_only)
3025 flush_func = &flush_icache_range;
3026 else
3027 flush_func = &flush_dcache_range;
3029 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
3030 volatile pud_t *pud;
3031 if (!pgd_present(*pgd)) // acquire semantics
3032 continue;
3033 pud = pud_offset(pgd, 0);
3034 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
3035 volatile pmd_t *pmd;
3036 if (!pud_present(*pud)) // acquire semantics
3037 continue;
3038 pmd = pmd_offset(pud, 0);
3039 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
3040 volatile pte_t *pte;
3041 if (!pmd_present(*pmd)) // acquire semantics
3042 continue;
3043 pte = pte_offset_map(pmd, 0);
3044 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
3045 if (!pte_present(*pte)) // acquire semantics
3046 continue;
3047 /* Convert PTE to maddr. */
3048 maddr = __va_ul (pte_val(*pte)
3049 & _PAGE_PPN_MASK);
3050 (*flush_func)(maddr, maddr+ PAGE_SIZE);
3051 nbr_page++;
3056 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
3059 static void free_page_type(struct page_info *page, unsigned long type)
3063 static int alloc_page_type(struct page_info *page, unsigned long type)
3065 return 1;
3068 void *pgtable_quicklist_alloc(void)
3070 struct page_info *page;
3071 void *p;
3073 BUG_ON(dom_p2m == NULL);
3074 page = alloc_domheap_page(dom_p2m, 0);
3075 if (page == NULL)
3076 return NULL;
3078 p = page_to_virt(page);
3079 clear_page(p);
3080 return p;
3083 void pgtable_quicklist_free(void *pgtable_entry)
3085 struct page_info* page = virt_to_page(pgtable_entry);
3087 BUG_ON(page_get_owner(page) != dom_p2m);
3088 BUG_ON(page->count_info != (1 | PGC_allocated));
3090 put_page(page);
3093 void put_page_type(struct page_info *page)
3095 u64 nx, x, y = page->u.inuse.type_info;
3097 again:
3098 do {
3099 x = y;
3100 nx = x - 1;
3102 ASSERT((x & PGT_count_mask) != 0);
3104 /*
3105 * The page should always be validated while a reference is held. The
3106 * exception is during domain destruction, when we forcibly invalidate
3107 * page-table pages if we detect a referential loop.
3108 * See domain.c:relinquish_list().
3109 */
3110 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
3112 if ( unlikely((nx & PGT_count_mask) == 0) )
3114 /* Record TLB information for flush later. Races are harmless. */
3115 page->tlbflush_timestamp = tlbflush_current_time();
3117 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
3118 likely(nx & PGT_validated) )
3120 /*
3121 * Page-table pages must be unvalidated when count is zero. The
3122 * 'free' is safe because the refcnt is non-zero and validated
3123 * bit is clear => other ops will spin or fail.
3124 */
3125 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
3126 x & ~PGT_validated)) != x) )
3127 goto again;
3128 /* We cleared the 'valid bit' so we do the clean up. */
3129 free_page_type(page, x);
3130 /* Carry on, but with the 'valid bit' now clear. */
3131 x &= ~PGT_validated;
3132 nx &= ~PGT_validated;
3136 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
3140 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
3142 struct page_info *page = mfn_to_page(page_nr);
3144 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
3146 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
3147 return 0;
3150 return 1;
3154 int get_page_type(struct page_info *page, unsigned long type)
3156 u64 nx, x, y = page->u.inuse.type_info;
3158 ASSERT(!(type & ~PGT_type_mask));
3160 again:
3161 do {
3162 x = y;
3163 nx = x + 1;
3164 if ( unlikely((nx & PGT_count_mask) == 0) )
3166 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
3167 return 0;
3169 else if ( unlikely((x & PGT_count_mask) == 0) )
3171 if ( (x & PGT_type_mask) != type )
3173 /*
3174 * On type change we check to flush stale TLB entries. This
3175 * may be unnecessary (e.g., page was GDT/LDT) but those
3176 * circumstances should be very rare.
3177 */
3178 cpumask_t mask =
3179 page_get_owner(page)->domain_dirty_cpumask;
3180 tlbflush_filter(mask, page->tlbflush_timestamp);
3182 if ( unlikely(!cpus_empty(mask)) )
3184 perfc_incr(need_flush_tlb_flush);
3185 flush_tlb_mask(mask);
3188 /* We lose existing type, back pointer, and validity. */
3189 nx &= ~(PGT_type_mask | PGT_validated);
3190 nx |= type;
3192 /* No special validation needed for writable pages. */
3193 /* Page tables and GDT/LDT need to be scanned for validity. */
3194 if ( type == PGT_writable_page )
3195 nx |= PGT_validated;
3198 else if ( unlikely((x & PGT_type_mask) != type) )
3200 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
3201 (type != PGT_l1_page_table) )
3202 MEM_LOG("Bad type (saw %08lx != exp %08lx) "
3203 "for mfn %016lx (pfn %016lx)",
3204 x, type, page_to_mfn(page),
3205 get_gpfn_from_mfn(page_to_mfn(page)));
3206 return 0;
3208 else if ( unlikely(!(x & PGT_validated)) )
3210 /* Someone else is updating validation of this page. Wait... */
3211 while ( (y = page->u.inuse.type_info) == x )
3212 cpu_relax();
3213 goto again;
3216 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
3218 if ( unlikely(!(nx & PGT_validated)) )
3220 /* Try to validate page type; drop the new reference on failure. */
3221 if ( unlikely(!alloc_page_type(page, type)) )
3223 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08lx"
3224 ": caf=%016lx taf=%" PRtype_info,
3225 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
3226 type, page->count_info, page->u.inuse.type_info);
3227 /* No one else can get a reference. We hold the only ref. */
3228 page->u.inuse.type_info = 0;
3229 return 0;
3232 /* No one else is updating simultaneously. */
3233 __set_bit(_PGT_validated, &page->u.inuse.type_info);
3236 return 1;
3239 int page_is_conventional_ram(unsigned long mfn)
3241 return (efi_mem_type(pfn_to_paddr(mfn)) == EFI_CONVENTIONAL_MEMORY);
3245 long
3246 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3248 struct page_info *page = NULL;
3249 long rc;
3251 switch (op) {
3252 case XENMEM_add_to_physmap:
3254 struct xen_add_to_physmap xatp;
3255 unsigned long prev_mfn, mfn = 0, gpfn;
3256 struct domain *d;
3258 if (copy_from_guest(&xatp, arg, 1))
3259 return -EFAULT;
3261 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3262 if (rc)
3263 return rc;
3265 /* This hypercall is used only for VT-i domains */
3266 if (!is_hvm_domain(d)) {
3267 rcu_unlock_domain(d);
3268 return -ENOSYS;
3271 switch (xatp.space) {
3272 case XENMAPSPACE_shared_info:
3273 if (xatp.idx == 0)
3274 mfn = virt_to_mfn(d->shared_info);
3275 break;
3276 case XENMAPSPACE_grant_table:
3277 spin_lock(&d->grant_table->lock);
3279 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
3280 (xatp.idx < max_nr_grant_frames))
3281 gnttab_grow_table(d, xatp.idx + 1);
3283 if (xatp.idx < nr_grant_frames(d->grant_table))
3284 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3286 spin_unlock(&d->grant_table->lock);
3287 break;
3288 case XENMAPSPACE_gmfn: {
3289 struct xen_ia64_memmap_info memmap_info;
3290 efi_memory_desc_t md;
3291 int ret;
3293 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3294 if ( !get_page_from_pagenr(xatp.idx, d) )
3295 break;
3297 mfn = xatp.idx;
3298 page = mfn_to_page(mfn);
3300 memmap_info.efi_memmap_size = sizeof(md);
3301 memmap_info.efi_memdesc_size = sizeof(md);
3302 memmap_info.efi_memdesc_version =
3303 EFI_MEMORY_DESCRIPTOR_VERSION;
3305 md.type = EFI_CONVENTIONAL_MEMORY;
3306 md.pad = 0;
3307 md.phys_addr = xatp.gpfn << PAGE_SHIFT;
3308 md.virt_addr = 0;
3309 md.num_pages = 1UL << (PAGE_SHIFT - EFI_PAGE_SHIFT);
3310 md.attribute = EFI_MEMORY_WB;
3312 ret = __dom0vp_add_memdesc(d, &memmap_info, (char*)&md);
3313 if (ret != 0) {
3314 put_page(page);
3315 rcu_unlock_domain(d);
3316 gdprintk(XENLOG_DEBUG,
3317 "%s:%d td %d gpfn 0x%lx mfn 0x%lx ret %d\n",
3318 __func__, __LINE__,
3319 d->domain_id, xatp.gpfn, xatp.idx, ret);
3320 return ret;
3322 break;
3324 default:
3325 break;
3328 if (mfn == 0) {
3329 if ( page )
3330 put_page(page);
3331 rcu_unlock_domain(d);
3332 return -EINVAL;
3335 domain_lock(d);
3337 /* Check remapping necessity */
3338 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3339 if (mfn == prev_mfn)
3340 goto out;
3342 /* Remove previously mapped page if it was present. */
3343 if (prev_mfn && mfn_valid(prev_mfn)) {
3344 if (is_xen_heap_mfn(prev_mfn))
3345 /* Xen heap frames are simply unhooked from this phys slot. */
3346 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3347 else
3348 /* Normal domain memory is freed, to avoid leaking memory. */
3349 guest_remove_page(d, xatp.gpfn);
3352 /* Unmap from old location, if any. */
3353 gpfn = get_gpfn_from_mfn(mfn);
3354 if (gpfn != INVALID_M2P_ENTRY)
3355 guest_physmap_remove_page(d, gpfn, mfn, 0);
3357 /* Map at new location. */
3358 /* Here page->count_info = PGC_allocated | N where N >= 1 */
3359 __guest_physmap_add_page(d, xatp.gpfn, mfn);
3361 out:
3362 domain_unlock(d);
3364 if ( page )
3365 put_page(page);
3367 rcu_unlock_domain(d);
3369 break;
3372 case XENMEM_machine_memory_map:
3374 struct xen_memory_map memmap;
3375 struct xen_ia64_memmap_info memmap_info;
3376 XEN_GUEST_HANDLE(char) buffer;
3378 if (!IS_PRIV(current->domain))
3379 return -EINVAL;
3380 if (copy_from_guest(&memmap, arg, 1))
3381 return -EFAULT;
3382 if (memmap.nr_entries <
3383 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size)
3384 return -EINVAL;
3386 memmap.nr_entries =
3387 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size;
3388 memset(&memmap_info, 0, sizeof(memmap_info));
3389 memmap_info.efi_memmap_size = ia64_boot_param->efi_memmap_size;
3390 memmap_info.efi_memdesc_size = ia64_boot_param->efi_memdesc_size;
3391 memmap_info.efi_memdesc_version = ia64_boot_param->efi_memdesc_version;
3393 buffer = guest_handle_cast(memmap.buffer, char);
3394 if (copy_to_guest(buffer, (char*)&memmap_info, sizeof(memmap_info)) ||
3395 copy_to_guest_offset(buffer, sizeof(memmap_info),
3396 (char*)__va(ia64_boot_param->efi_memmap),
3397 ia64_boot_param->efi_memmap_size) ||
3398 copy_to_guest(arg, &memmap, 1))
3399 return -EFAULT;
3400 return 0;
3403 case XENMEM_get_pod_target:
3404 case XENMEM_set_pod_target: {
3405 /* XXX: PoD (populate on demand) isn't supported yet. */
3406 xen_pod_target_t target;
3407 struct domain *d;
3409 /* Support DOMID_SELF? */
3410 if ( !IS_PRIV(current->domain) )
3411 return -EINVAL;
3413 if ( copy_from_guest(&target, arg, 1) )
3414 return -EFAULT;
3416 rc = rcu_lock_target_domain_by_id(target.domid, &d);
3417 if ( rc != 0 )
3418 return rc;
3420 if ( op == XENMEM_set_pod_target )
3422 /* if -ENOSYS is returned,
3423 domain builder aborts domain creation. */
3424 /* rc = -ENOSYS; */
3427 target.tot_pages = d->tot_pages;
3428 target.pod_cache_pages = 0;
3429 target.pod_entries = 0;
3431 if ( copy_to_guest(arg, &target, 1) )
3433 rc= -EFAULT;
3434 goto pod_target_out_unlock;
3437 pod_target_out_unlock:
3438 rcu_unlock_domain(d);
3439 return rc;
3442 default:
3443 return -ENOSYS;
3446 return 0;
3449 int is_iomem_page(unsigned long mfn)
3451 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
3454 static void __xencomm_mark_dirty(struct domain *d,
3455 unsigned long addr, unsigned int len)
3457 unsigned long gpfn;
3458 unsigned long end_addr = addr + len;
3460 if (shadow_mode_enabled(d)) {
3461 for (addr &= PAGE_MASK; addr < end_addr; addr += PAGE_SIZE) {
3462 gpfn = get_gpfn_from_mfn(virt_to_mfn(addr));
3463 shadow_mark_page_dirty(d, gpfn);
3468 void xencomm_mark_dirty(unsigned long addr, unsigned int len)
3470 __xencomm_mark_dirty(current->domain, addr, len);
3473 /* stubs for populate on demand */
3474 int
3475 guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
3476 unsigned int order)
3478 gdprintk(XENLOG_WARNING, "populate on demand isn't supported yet\n");
3479 return -ENOSYS;
3482 int
3483 p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn,
3484 unsigned int order)
3486 gdprintk(XENLOG_WARNING, "populate on demand isn't supported yet\n");
3487 return 0;
3490 /*
3491 * Local variables:
3492 * mode: C
3493 * c-set-style: "BSD"
3494 * c-basic-offset: 4
3495 * tab-width: 4
3496 * indent-tabs-mode: nil
3497 * End:
3498 */