ia64/xen-unstable

xen/arch/ia64/xen/mm.c @ 19107:696351cde9a4

Allow memflags to be specified to alloc_xenheap_pages().

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author   Keir Fraser <keir.fraser@citrix.com>
date     Wed Jan 28 16:58:41 2009 +0000 (2009-01-28)
parents  2f993fde1dc6
children 4773f40d14f2
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure is accessed by
22 * any CPUs.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of vcpus which aren't current.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that reading the VHPT is done by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert a domain pseudo physical address
54 * to a machine address. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * replace_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
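 *
 *    For illustration, a minimal usage sketch (not code from this file):
 *        if (get_page(page, d)) {    // acquire: accesses to the page below
 *            ... use the page ...    //   cannot be reordered before the get
 *            put_page(page);         // release: accesses above cannot be
 *        }                           //   reordered after the put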
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * The P2M entries are shared by more than one vcpu.
101 * So they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. P2M(M2P(p)) = p and M2P(P2M(m)) = m
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here the memory barriers can be achieved with release semantics.
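 *
 *    For illustration, with the helpers used later in this file, the xchg
 *    case looks roughly like (a sketch only, not code from this file):
 *        set_gpfn_from_mfn(new_mfn, gpfn);               // new m2p entry
 *        // barrier: ptep_xchg()/ptep_cmpxchg_rel() have release semantics
 *        old_pte = ptep_xchg(mm, mpaddr, ptep, new_pte); // atomic p2m update
 *        old_mfn = pte_pfn(old_pte);
 *        // barrier
 *        set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);  // zap the old m2p entry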
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or a VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Neither tlb insert (vcpu_itc_no_srlz()) nor global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB (mTLB) atomically, so there is a race here.
127 *
128 * The vcpu->arch.{d, i}tlb.p bit is checked to handle this:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert.
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert tlb by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, while the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read, and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value is different
148 * from the value it used, it retries.
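 *
 *    A minimal sketch of the reader-side pattern (the real code does this via
 *    struct p2m_entry and vcpu_itc_no_srlz(); this is illustration only):
 *        do {
 *            old_pte = *ptep;                  // remember the p2m entry value
 *            insert tlb entry derived from old_pte;
 *        } while (pte_val(*ptep) != pte_val(old_pte));   // changed? retry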
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented, as done by
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time().
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <linux/sort.h>
179 #include <xen/guest_access.h>
180 #include <asm/page.h>
181 #include <asm/dom_fw_common.h>
182 #include <public/memory.h>
183 #include <asm/event.h>
184 #include <asm/debugger.h>
186 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
187 volatile pte_t* ptep, pte_t old_pte,
188 struct page_info* page);
190 static void __xencomm_mark_dirty(struct domain *d,
191 unsigned long addr, unsigned int len);
193 extern unsigned long ia64_iobase;
195 struct domain *dom_xen, *dom_io;
197 /*
198 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
199 * If more reserved domain ids are introduced, this might be increased.
200 */
201 #define DOMID_P2M (0x7FF8U)
202 static struct domain *dom_p2m;
204 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
205 void
206 alloc_dom_xen_and_dom_io(void)
207 {
208 /*
209 * Initialise our DOMID_XEN domain.
210 * Any Xen-heap pages that we will allow to be mapped will have
211 * their domain field set to dom_xen.
212 */
213 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
214 BUG_ON(dom_xen == NULL);
216 /*
217 * Initialise our DOMID_IO domain.
218 * This domain owns I/O pages that are within the range of the page_info
219 * array. Mappings occur at the priv of the caller.
220 */
221 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
222 BUG_ON(dom_io == NULL);
223 }
225 static int
226 mm_teardown_can_skip(struct domain* d, unsigned long offset)
227 {
228 return d->arch.mm_teardown_offset > offset;
229 }
231 static void
232 mm_teardown_update_offset(struct domain* d, unsigned long offset)
233 {
234 d->arch.mm_teardown_offset = offset;
235 }
237 static void
238 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
239 {
240 pte_t old_pte;
241 unsigned long mfn;
242 struct page_info* page;
244 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
246 // a vmx domain uses bits [58:56] to distinguish an io region from memory.
247 // see vmx_build_physmap_table() in vmx_init.c
248 if (!pte_mem(old_pte))
249 return;
251 // domain might map IO space or acpi table pages. check it.
252 mfn = pte_pfn(old_pte);
253 if (!mfn_valid(mfn))
254 return;
255 page = mfn_to_page(mfn);
256 BUG_ON(page_get_owner(page) == NULL);
258 // The struct page_info corresponding to mfn may or may not exist depending
259 // on CONFIG_VIRTUAL_FRAME_TABLE.
260 // The above check is too simplistic.
261 // The right way is to check whether this page is in an io area or an acpi page.
263 if (pte_pgc_allocated(old_pte)) {
264 BUG_ON(page_get_owner(page) != d);
265 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
266 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
267 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
268 put_page(page);
269 } else {
270 put_page(page);
271 }
272 }
274 static int
275 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
276 {
277 unsigned long i;
278 volatile pte_t* pte = pte_offset_map(pmd, offset);
280 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
281 unsigned long cur_offset = offset + (i << PAGE_SHIFT);
282 if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
283 continue;
284 if (!pte_present(*pte)) { // acquire semantics
285 mm_teardown_update_offset(d, cur_offset);
286 continue;
287 }
288 mm_teardown_update_offset(d, cur_offset);
289 mm_teardown_pte(d, pte, cur_offset);
290 if (hypercall_preempt_check())
291 return -EAGAIN;
292 }
293 return 0;
294 }
296 static int
297 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
298 {
299 unsigned long i;
300 volatile pmd_t *pmd = pmd_offset(pud, offset);
302 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
303 unsigned long cur_offset = offset + (i << PMD_SHIFT);
304 if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
305 continue;
306 if (!pmd_present(*pmd)) { // acquire semantics
307 mm_teardown_update_offset(d, cur_offset);
308 continue;
309 }
310 if (mm_teardown_pmd(d, pmd, cur_offset))
311 return -EAGAIN;
312 }
313 return 0;
314 }
316 static int
317 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
318 {
319 unsigned long i;
320 volatile pud_t *pud = pud_offset(pgd, offset);
322 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
323 unsigned long cur_offset = offset + (i << PUD_SHIFT);
324 #ifndef __PAGETABLE_PUD_FOLDED
325 if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
326 continue;
327 #endif
328 if (!pud_present(*pud)) { // acquire semantics
329 #ifndef __PAGETABLE_PUD_FOLDED
330 mm_teardown_update_offset(d, cur_offset);
331 #endif
332 continue;
333 }
334 if (mm_teardown_pud(d, pud, cur_offset))
335 return -EAGAIN;
336 }
337 return 0;
338 }
340 int
341 mm_teardown(struct domain* d)
342 {
343 struct mm_struct* mm = &d->arch.mm;
344 unsigned long i;
345 volatile pgd_t* pgd;
347 if (mm->pgd == NULL)
348 return 0;
350 pgd = pgd_offset(mm, 0);
351 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
352 unsigned long cur_offset = i << PGDIR_SHIFT;
354 if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
355 continue;
356 if (!pgd_present(*pgd)) { // acquire semantics
357 mm_teardown_update_offset(d, cur_offset);
358 continue;
359 }
360 if (mm_teardown_pgd(d, pgd, cur_offset))
361 return -EAGAIN;
362 }
364 foreign_p2m_destroy(d);
365 return 0;
366 }
368 static void
369 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
370 unsigned long offset)
371 {
372 pte_free_kernel(pte_offset_map(pmd, offset));
373 }
375 static void
376 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
377 unsigned long offset)
378 {
379 unsigned long i;
380 volatile pmd_t *pmd = pmd_offset(pud, offset);
382 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
383 if (!pmd_present(*pmd))
384 continue;
385 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
386 }
387 pmd_free(pmd_offset(pud, offset));
388 }
390 static void
391 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
392 unsigned long offset)
393 {
394 unsigned long i;
395 volatile pud_t *pud = pud_offset(pgd, offset);
397 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
398 if (!pud_present(*pud))
399 continue;
400 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
401 }
402 pud_free(pud_offset(pgd, offset));
403 }
405 static void
406 mm_p2m_teardown(struct domain* d)
407 {
408 struct mm_struct* mm = &d->arch.mm;
409 unsigned long i;
410 volatile pgd_t* pgd;
412 BUG_ON(mm->pgd == NULL);
413 pgd = pgd_offset(mm, 0);
414 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
415 if (!pgd_present(*pgd))
416 continue;
417 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
418 }
419 pgd_free(mm->pgd);
420 mm->pgd = NULL;
421 }
423 void
424 mm_final_teardown(struct domain* d)
425 {
426 if (d->arch.shadow_bitmap != NULL) {
427 xfree(d->arch.shadow_bitmap);
428 d->arch.shadow_bitmap = NULL;
429 }
430 mm_p2m_teardown(d);
431 }
433 unsigned long
434 domain_get_maximum_gpfn(struct domain *d)
435 {
436 return (d->arch.convmem_end - 1) >> PAGE_SHIFT;
437 }
439 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
440 void
441 share_xen_page_with_guest(struct page_info *page,
442 struct domain *d, int readonly)
443 {
444 if ( page_get_owner(page) == d )
445 return;
447 #if 1
448 if (readonly) {
449 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
450 }
451 #endif
453 // alloc_xenheap_pages() doesn't initialize page owner.
454 //BUG_ON(page_get_owner(page) != NULL);
456 spin_lock(&d->page_alloc_lock);
458 #ifndef __ia64__
459 /* The incremented type count pins as writable or read-only. */
460 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
461 page->u.inuse.type_info |= PGT_validated | 1;
462 #endif
464 page_set_owner(page, d);
465 wmb(); /* install valid domain ptr before updating refcnt. */
466 ASSERT(page->count_info == 0);
468 /* Only add to the allocation list if the domain isn't dying. */
469 if ( !d->is_dying )
470 {
471 page->count_info |= PGC_allocated | 1;
472 if ( unlikely(d->xenheap_pages++ == 0) )
473 get_knownalive_domain(d);
474 list_add_tail(&page->list, &d->xenpage_list);
475 }
477 // grant_table_destroy() releases these pages,
478 // but it doesn't clear their m2p entries. So stale
479 // entries might remain. Such a stale entry is cleared here.
480 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
482 spin_unlock(&d->page_alloc_lock);
483 }
485 void
486 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
487 {
488 share_xen_page_with_guest(page, dom_xen, readonly);
489 }
491 unsigned long
492 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
493 {
494 unsigned long pte;
496 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
497 if (!pte) {
498 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
499 }
501 if ((pte & _PAGE_IO) && is_hvm_domain(d))
502 return INVALID_MFN;
504 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
505 }
507 // given a domain virtual address, pte and pagesize, extract the metaphysical
508 // address, convert the pte to use a physical address and the (possibly different)
509 // Xen PAGE_SIZE, and return the modified pte. (NOTE: TLB insert should use
510 // current->arch.vhpt_pg_shift!)
511 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* itir,
512 struct p2m_entry* entry)
513 {
514 struct domain *d = current->domain;
515 ia64_itir_t _itir = {.itir = itir__};
516 u64 mask, mpaddr, pteval2;
517 u64 arflags;
518 u64 arflags2;
519 u64 maflags2;
521 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
523 // FIXME address had better be pre-validated on insert
524 mask = ~itir_mask(_itir.itir);
525 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
527 if (_itir.ps > PAGE_SHIFT)
528 _itir.ps = PAGE_SHIFT;
530 ((ia64_itir_t*)itir)->itir = _itir.itir;/* Copy the whole register. */
531 ((ia64_itir_t*)itir)->ps = _itir.ps; /* Overwrite ps part! */
533 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
534 if (_itir.ps < PAGE_SHIFT)
535 pteval2 |= mpaddr & ~PAGE_MASK & ~((1L << _itir.ps) - 1);
537 /* Check access rights. */
538 arflags = pteval & _PAGE_AR_MASK;
539 arflags2 = pteval2 & _PAGE_AR_MASK;
540 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
541 #if 0
542 dprintk(XENLOG_WARNING,
543 "%s:%d "
544 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
545 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
546 __func__, __LINE__,
547 pteval, arflags, address, itir__,
548 pteval2, arflags2, mpaddr);
549 #endif
550 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
551 }
553 /* Check memory attribute. The switch is on the *requested* memory
554 attribute. */
555 maflags2 = pteval2 & _PAGE_MA_MASK;
556 switch (pteval & _PAGE_MA_MASK) {
557 case _PAGE_MA_NAT:
558 /* NaT pages are always accepted! */
559 break;
560 case _PAGE_MA_UC:
561 case _PAGE_MA_UCE:
562 case _PAGE_MA_WC:
563 if (maflags2 == _PAGE_MA_WB) {
564 /* Don't let domains WB-map uncached addresses.
565 This can happen when domU tries to touch i/o
566 port space. Also prevents possible address
567 aliasing issues. */
568 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) {
569 u64 ucwb;
571 /*
572 * If dom0 page has both UC & WB attributes
573 * don't warn about attempted UC access.
574 */
575 ucwb = efi_mem_attribute(mpaddr, PAGE_SIZE);
576 ucwb &= EFI_MEMORY_UC | EFI_MEMORY_WB;
577 ucwb ^= EFI_MEMORY_UC | EFI_MEMORY_WB;
579 if (d != dom0 || ucwb != 0)
580 gdprintk(XENLOG_WARNING, "Warning: UC"
581 " to WB for mpaddr=%lx\n",
582 mpaddr);
583 }
584 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
585 }
586 break;
587 case _PAGE_MA_WB:
588 if (maflags2 != _PAGE_MA_WB) {
589 /* Forbid non-coherent access to coherent memory. */
590 panic_domain(NULL, "try to use WB mem attr on "
591 "UC page, mpaddr=%lx\n", mpaddr);
592 }
593 break;
594 default:
595 panic_domain(NULL, "try to use unknown mem attribute\n");
596 }
598 /* If shadow mode is enabled, virtualize dirty bit. */
599 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
600 u64 mp_page = mpaddr >> PAGE_SHIFT;
601 pteval |= _PAGE_VIRT_D;
603 /* If the page is not already dirty, don't set the dirty bit! */
604 if (mp_page < d->arch.shadow_bitmap_size * 8
605 && !test_bit(mp_page, d->arch.shadow_bitmap))
606 pteval &= ~_PAGE_D;
607 }
609 /* Ignore non-addr bits of pteval2 and force PL0->1
610 (PL3 is unaffected) */
611 return (pteval & ~(_PAGE_PPN_MASK | _PAGE_PL_MASK)) |
612 (pteval2 & _PAGE_PPN_MASK) |
613 (vcpu_pl_adjust(pteval, 7) & _PAGE_PL_MASK);
614 }
616 // given a current domain metaphysical address, return the physical address
617 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
618 struct p2m_entry* entry)
619 {
620 unsigned long pteval;
622 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
623 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
624 }
626 //XXX should !xxx_present() be used instead of !xxx_none()?
627 // pud, pmd and pte pages are zero-cleared when they are allocated.
628 // Their contents must be visible before population, so
629 // the cmpxchg must have release semantics.
630 static volatile pte_t*
631 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
632 {
633 struct mm_struct *mm = &d->arch.mm;
634 volatile pgd_t *pgd;
635 volatile pud_t *pud;
636 volatile pmd_t *pmd;
638 BUG_ON(mm->pgd == NULL);
640 pgd = pgd_offset(mm, mpaddr);
641 again_pgd:
642 if (unlikely(pgd_none(*pgd))) { // acquire semantics
643 pud_t *old_pud = NULL;
644 pud = pud_alloc_one(mm, mpaddr);
645 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
646 pud_free(pud);
647 goto again_pgd;
648 }
649 }
651 pud = pud_offset(pgd, mpaddr);
652 again_pud:
653 if (unlikely(pud_none(*pud))) { // acquire semantics
654 pmd_t* old_pmd = NULL;
655 pmd = pmd_alloc_one(mm, mpaddr);
656 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
657 pmd_free(pmd);
658 goto again_pud;
659 }
660 }
662 pmd = pmd_offset(pud, mpaddr);
663 again_pmd:
664 if (unlikely(pmd_none(*pmd))) { // acquire semantics
665 pte_t* old_pte = NULL;
666 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
667 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
668 pte_free_kernel(pte);
669 goto again_pmd;
670 }
671 }
673 return pte_offset_map(pmd, mpaddr);
674 }
676 //XXX should xxx_none() be used instead of !xxx_present()?
677 volatile pte_t*
678 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
679 {
680 struct mm_struct *mm = &d->arch.mm;
681 volatile pgd_t *pgd;
682 volatile pud_t *pud;
683 volatile pmd_t *pmd;
685 BUG_ON(mm->pgd == NULL);
686 pgd = pgd_offset(mm, mpaddr);
687 if (unlikely(!pgd_present(*pgd))) // acquire semantics
688 return NULL;
690 pud = pud_offset(pgd, mpaddr);
691 if (unlikely(!pud_present(*pud))) // acquire semantics
692 return NULL;
694 pmd = pmd_offset(pud, mpaddr);
695 if (unlikely(!pmd_present(*pmd))) // acquire semantics
696 return NULL;
698 return pte_offset_map(pmd, mpaddr);
699 }
701 static volatile pte_t*
702 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
703 {
704 struct mm_struct *mm = &d->arch.mm;
705 volatile pgd_t *pgd;
706 volatile pud_t *pud;
707 volatile pmd_t *pmd;
709 BUG_ON(mm->pgd == NULL);
710 pgd = pgd_offset(mm, mpaddr);
711 if (unlikely(pgd_none(*pgd))) // acquire semantics
712 return NULL;
714 pud = pud_offset(pgd, mpaddr);
715 if (unlikely(pud_none(*pud))) // acquire semantics
716 return NULL;
718 pmd = pmd_offset(pud, mpaddr);
719 if (unlikely(pmd_none(*pmd))) // acquire semantics
720 return NULL;
722 return pte_offset_map(pmd, mpaddr);
723 }
725 unsigned long
726 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
727 {
728 volatile pte_t *pte;
730 pte = lookup_noalloc_domain_pte(d, mpaddr);
731 if (pte == NULL)
732 return INVALID_MFN;
734 if (pte_present(*pte))
735 return (pte->pte & _PFN_MASK);
736 return INVALID_MFN;
737 }
739 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
740 struct p2m_entry* entry)
741 {
742 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
744 if (pte != NULL) {
745 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
746 if (pte_present(tmp_pte)) {
747 if (entry != NULL)
748 p2m_entry_set(entry, pte, tmp_pte);
749 return pte_val(tmp_pte);
750 } else if (is_hvm_domain(d))
751 return INVALID_MFN;
752 }
754 if (mpaddr < d->arch.convmem_end && !d->is_dying) {
755 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
756 "d %"PRId16" 0x%lx (< 0x%lx)\n",
757 current->vcpu_id, PSCB(current, iip),
758 d->domain_id, mpaddr, d->arch.convmem_end);
759 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
760 /* Log I/O port probing, but complain less loudly about it */
761 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
762 "d %"PRId16" 0x%lx\n",
763 current->vcpu_id, PSCB(current, iip), d->domain_id,
764 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
765 } else {
766 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
767 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
768 current->vcpu_id, PSCB(current, iip),
769 d->domain_id, mpaddr, d->arch.convmem_end);
770 }
772 debugger_event (XEN_IA64_DEBUG_ON_BAD_MPA);
774 if (entry != NULL)
775 p2m_entry_set(entry, NULL, __pte(0));
776 //XXX This is a workaround until emulation of memory accesses to regions
777 // where no memory or device is attached is implemented.
778 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_PRIV |
779 _PAGE_AR_RWX)));
780 }
782 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
783 #if 1
784 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
785 {
786 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
787 unsigned long imva;
789 pte &= _PAGE_PPN_MASK;
790 imva = (unsigned long) __va(pte);
791 imva |= mpaddr & ~PAGE_MASK;
792 return (void*)imva;
793 }
794 #else
795 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
796 {
797 unsigned long imva = __gpa_to_mpa(d, mpaddr);
799 return (void *)__va(imva);
800 }
801 #endif
803 unsigned long
804 paddr_to_maddr(unsigned long paddr)
805 {
806 struct vcpu *v = current;
807 struct domain *d = v->domain;
808 u64 pa;
810 pa = ____lookup_domain_mpa(d, paddr);
811 if (pa == INVALID_MFN) {
812 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
813 __func__, paddr, vcpu_regs(v)->cr_iip);
814 return 0;
815 }
816 return (pa & _PFN_MASK) | (paddr & ~PAGE_MASK);
817 }
819 /* Allocate a new page for domain and map it to the specified metaphysical
820 address. */
821 static struct page_info *
822 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
823 volatile pte_t* pte)
824 {
825 struct page_info *p;
826 unsigned long maddr;
828 BUG_ON(!pte_none(*pte));
830 p = alloc_domheap_page(d, 0);
831 if (unlikely(!p)) {
832 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
833 return(p);
834 }
836 // zero out pages for security reasons
837 clear_page(page_to_virt(p));
838 maddr = page_to_maddr (p);
839 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
840 && maddr < __get_cpu_var(vhpt_pend))) {
841 /* FIXME: how can this happen ?
842 vhpt is allocated by alloc_domheap_page. */
843 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
844 maddr);
845 }
847 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
848 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
849 // because set_pte_rel() has release semantics
850 set_pte_rel(pte,
851 pfn_pte(maddr >> PAGE_SHIFT,
852 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
853 _PAGE_PL_PRIV | _PAGE_AR_RWX)));
855 smp_mb();
856 return p;
857 }
859 struct page_info *
860 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
861 {
862 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
864 if (!pte_none(*pte))
865 return NULL;
867 return __assign_new_domain_page(d, mpaddr, pte);
868 }
870 void __init
871 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
872 {
873 volatile pte_t *pte;
875 BUG_ON(d != dom0);
876 pte = lookup_alloc_domain_pte(d, mpaddr);
877 if (pte_none(*pte)) {
878 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
879 if (p == NULL) {
880 panic("%s: can't allocate page for dom0\n", __func__);
881 }
882 }
883 }
885 static unsigned long
886 flags_to_prot (unsigned long flags)
887 {
888 unsigned long res = _PAGE_PL_PRIV | __DIRTY_BITS;
890 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
891 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
892 #ifdef CONFIG_XEN_IA64_TLB_TRACK
893 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
894 #endif
895 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
896 res |= flags & ASSIGN_io ? _PAGE_IO: 0;
898 return res;
899 }
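/*
 * For example, tracing flags_to_prot() above:
 *   flags_to_prot(ASSIGN_nocache)
 *       == _PAGE_PL_PRIV | __DIRTY_BITS | _PAGE_AR_RWX | _PAGE_MA_UC
 *   flags_to_prot(ASSIGN_writable | ASSIGN_pgc_allocated)
 *       == _PAGE_PL_PRIV | __DIRTY_BITS | _PAGE_AR_RWX | _PAGE_MA_WB |
 *          _PAGE_PGC_ALLOCATED
 * (assuming ASSIGN_writable just means the absence of ASSIGN_readonly,
 *  as in assign_domain_page() below).
 */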
901 /* map a physical address to the specified metaphysical addr */
902 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
903 // This is called by assign_domain_mmio_page(),
904 // so access to the pte is racy.
905 int
906 __assign_domain_page(struct domain *d,
907 unsigned long mpaddr, unsigned long physaddr,
908 unsigned long flags)
909 {
910 volatile pte_t *pte;
911 pte_t old_pte;
912 pte_t new_pte;
913 pte_t ret_pte;
914 unsigned long prot = flags_to_prot(flags);
916 pte = lookup_alloc_domain_pte(d, mpaddr);
918 old_pte = __pte(0);
919 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
920 again_hvm_page_io:
921 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
922 if (pte_val(ret_pte) == pte_val(old_pte)) {
923 smp_mb();
924 return 0;
925 }
926 /* in HVM guest, when VTD is enabled,
927 * P2M entry may change from _PAGE_IO type to real MMIO page
928 */
929 if(is_hvm_domain(d) && (pte_val(ret_pte) & _PAGE_IO) &&
930 !mfn_valid(physaddr >> PAGE_SHIFT)) {
931 old_pte = ret_pte;
932 goto again_hvm_page_io;
933 }
935 // dom0 tried to map the real machine's I/O region, but failed.
936 // It is very likely that dom0 won't boot correctly because
937 // it can't access I/O. So complain here.
938 if (flags & ASSIGN_nocache) {
939 int warn = 0;
941 if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT))
942 warn = 1;
943 else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) {
944 u32 type;
945 u64 attr;
947 warn = 1;
949 /*
950 * See
951 * complete_dom0_memmap()
952 * case EFI_RUNTIME_SERVICES_CODE:
953 * case EFI_RUNTIME_SERVICES_DATA:
954 * case EFI_ACPI_RECLAIM_MEMORY:
955 * case EFI_ACPI_MEMORY_NVS:
956 * case EFI_RESERVED_TYPE:
957 *
958 * Currently only EFI_RUNTIME_SERVICES_CODE is found
959 * so we suppress the warning only in the EFI_RUNTIME_SERVICES_CODE case.
960 */
961 type = efi_mem_type(physaddr);
962 attr = efi_mem_attributes(physaddr);
963 if (type == EFI_RUNTIME_SERVICES_CODE &&
964 (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB))
965 warn = 0;
966 }
967 if (warn)
968 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
969 "\talready assigned pte_val 0x%016lx\n"
970 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
971 __func__, __LINE__,
972 d, d->domain_id, pte_val(ret_pte),
973 mpaddr, physaddr, flags);
974 }
976 return -EAGAIN;
977 }
979 /* get_page() and map a physical address to the specified metaphysical addr */
980 void
981 assign_domain_page(struct domain *d,
982 unsigned long mpaddr, unsigned long physaddr)
983 {
984 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
986 BUG_ON((physaddr & _PAGE_PPN_MASK) != physaddr);
987 BUG_ON(page->count_info != (PGC_allocated | 1));
988 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
989 // because __assign_domain_page() uses set_pte_rel() which has
990 // release semantics, smp_mb() isn't needed.
991 (void)__assign_domain_page(d, mpaddr, physaddr,
992 ASSIGN_writable | ASSIGN_pgc_allocated);
993 }
995 static void
996 ioports_get_mmio_addr(const struct io_space *space,
997 unsigned long fp, unsigned long lp,
998 unsigned long *mmio_start, unsigned long *mmio_end)
999 {
1000 if (space->sparse) {
1001 *mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
1002 *mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
1003 } else {
1004 *mmio_start = fp & PAGE_MASK;
1005 *mmio_end = PAGE_ALIGN(lp);
1009 static unsigned long
1010 ioports_get_mmio_base(const struct io_space *space, struct domain *d)
1012 if (VMX_DOMAIN(d->vcpu[0]))
1013 return LEGACY_IO_START;
1015 if (space == &io_space[0] && d != dom0)
1016 return IO_PORTS_PADDR;
1018 return __pa(space->mmio_base);
1021 /*
1022 * Input:
1023 * fgp: first guest port
1024 * fmp: first machine port
1025 * lmp: last machine port
1026 */
1027 int
1028 ioports_permit_access(struct domain *d, unsigned int fgp,
1029 unsigned int fmp, unsigned int lmp)
1031 struct io_space *space;
1032 unsigned long mmio_start, mach_start, mach_end;
1033 int ret;
1035 if (IO_SPACE_NR(fmp) >= num_io_spaces) {
1036 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fmp, lmp);
1037 return -EFAULT;
1040 /*
1041 * The ioport_cap rangeset tracks the I/O port address including
1042 * the port space ID. This means port space IDs need to match
1043 * between Xen and dom0. This is also a requirement because
1044 * the hypercall to pass these port ranges only uses a u32.
1046 * NB - non-dom0 driver domains may only have a subset of the
1047 * I/O port spaces and thus will number port spaces differently.
1048 * This is ok, they don't make use of this interface.
1049 */
1050 ret = rangeset_add_range(d->arch.ioport_caps, fmp, lmp);
1051 if (ret != 0)
1052 return ret;
1054 space = &io_space[IO_SPACE_NR(fmp)];
1056 /* Legacy I/O on dom0 is already setup */
1057 if (d == dom0 && space == &io_space[0])
1058 return 0;
1060 fmp = IO_SPACE_PORT(fmp);
1061 lmp = IO_SPACE_PORT(lmp);
1063 ioports_get_mmio_addr(space, fmp, lmp, &mach_start, &mach_end);
1065 /*
1066 * The "machine first port" is not necessarily identity mapped
1067 * to the guest first port. At least for the legacy range.
1068 */
1069 mach_start = mach_start | __pa(space->mmio_base);
1070 mach_end = mach_end | __pa(space->mmio_base);
1072 mmio_start = IO_SPACE_SPARSE_ENCODING(fgp) & PAGE_MASK;
1073 mmio_start |= ioports_get_mmio_base(space, d);
1075 while (mach_start < mach_end) {
1076 (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache);
1077 mmio_start += PAGE_SIZE;
1078 mach_start += PAGE_SIZE;
1081 return 0;
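/*
 * Usage sketch (illustrative only, not a call made in this file): to grant a
 * guest identity-mapped access to the legacy COM1 range in I/O space 0, a
 * caller would do something like
 *
 *     ioports_permit_access(d, 0x3f8, 0x3f8, 0x3ff);
 *
 * i.e. fgp == fmp when no guest/machine port remapping is wanted.
 */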
1084 static int
1085 ioports_has_allowed(struct domain *d, unsigned int fp, unsigned int lp)
1087 for (; fp < lp; fp++)
1088 if (rangeset_contains_singleton(d->arch.ioport_caps, fp))
1089 return 1;
1091 return 0;
1094 int
1095 ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
1097 int ret;
1098 struct mm_struct *mm = &d->arch.mm;
1099 unsigned long mmio_start, mmio_end, mmio_base;
1100 unsigned int fp_base, lp_base;
1101 struct io_space *space;
1103 if (IO_SPACE_NR(fp) >= num_io_spaces) {
1104 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
1105 return -EFAULT;
1108 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
1109 if (ret != 0)
1110 return ret;
1112 space = &io_space[IO_SPACE_NR(fp)];
1113 fp_base = IO_SPACE_PORT(fp);
1114 lp_base = IO_SPACE_PORT(lp);
1116 ioports_get_mmio_addr(space, fp_base, lp_base, &mmio_start, &mmio_end);
1118 mmio_base = ioports_get_mmio_base(space, d);
1120 for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
1121 unsigned int port, range;
1122 unsigned long mpaddr;
1123 volatile pte_t *pte;
1124 pte_t old_pte;
1126 if (space->sparse) {
1127 port = IO_SPACE_SPARSE_DECODING(mmio_start);
1128 range = IO_SPACE_SPARSE_PORTS_PER_PAGE - 1;
1129 } else {
1130 port = mmio_start;
1131 range = PAGE_SIZE - 1;
1134 port |= IO_SPACE_BASE(IO_SPACE_NR(fp));
1136 if (port < fp || port + range > lp) {
1137 /* Maybe this covers an allowed port. */
1138 if (ioports_has_allowed(d, port, port + range))
1139 continue;
1142 mpaddr = mmio_start | mmio_base;
1143 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1144 BUG_ON(pte == NULL);
1145 BUG_ON(pte_none(*pte));
1147 /* clear pte */
1148 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1150 domain_flush_vtlb_all(d);
1151 return 0;
1154 static void
1155 assign_domain_same_page(struct domain *d,
1156 unsigned long mpaddr, unsigned long size,
1157 unsigned long flags)
1159 //XXX optimization
1160 unsigned long end = PAGE_ALIGN(mpaddr + size);
1161 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
1162 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
1166 int
1167 efi_mmio(unsigned long physaddr, unsigned long size)
1169 void *efi_map_start, *efi_map_end;
1170 u64 efi_desc_size;
1171 void* p;
1173 efi_map_start = __va(ia64_boot_param->efi_memmap);
1174 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1175 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1177 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1178 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1179 unsigned long start = md->phys_addr;
1180 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1182 if (start <= physaddr && physaddr < end) {
1183 if ((physaddr + size) > end) {
1184 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1185 __func__, physaddr, size);
1186 return 0;
1189 // for io space
1190 if (md->type == EFI_MEMORY_MAPPED_IO ||
1191 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1192 return 1;
1195 // for runtime
1196 // see efi_enter_virtual_mode(void)
1197 // in linux/arch/ia64/kernel/efi.c
1198 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1199 !(md->attribute & EFI_MEMORY_WB)) {
1200 return 1;
1203 return 0;
1206 if (physaddr < start) {
1207 break;
1211 return 1;
1214 unsigned long
1215 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1216 unsigned long phys_addr, unsigned long size,
1217 unsigned long flags)
1219 unsigned long addr = mpaddr & PAGE_MASK;
1220 unsigned long end = PAGE_ALIGN(mpaddr + size);
1222 if (size == 0) {
1223 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1224 __func__, d, mpaddr, size);
1226 if (!efi_mmio(phys_addr, size)) {
1227 #ifndef NDEBUG
1228 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1229 __func__, d, mpaddr, size);
1230 #endif
1231 return -EINVAL;
1234 for (phys_addr &= PAGE_MASK; addr < end;
1235 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1236 __assign_domain_page(d, addr, phys_addr, flags);
1239 return mpaddr;
1242 unsigned long
1243 assign_domain_mach_page(struct domain *d,
1244 unsigned long mpaddr, unsigned long size,
1245 unsigned long flags)
1247 BUG_ON(flags & ASSIGN_pgc_allocated);
1248 assign_domain_same_page(d, mpaddr, size, flags);
1249 return mpaddr;
1252 static void
1253 adjust_page_count_info(struct page_info* page)
1255 struct domain* d = page_get_owner(page);
1256 BUG_ON((page->count_info & PGC_count_mask) < 1);
1257 if (d != NULL) {
1258 int ret = get_page(page, d);
1259 BUG_ON(ret == 0);
1260 } else {
1261 u64 x, nx, y;
1263 y = *((u64*)&page->count_info);
1264 do {
1265 x = y;
1266 nx = x + 1;
1268 BUG_ON((x >> 32) != 0);
1269 BUG_ON((nx & PGC_count_mask) != 2);
1270 y = cmpxchg((u64*)&page->count_info, x, nx);
1271 } while (unlikely(y != x));
1275 static void
1276 domain_put_page(struct domain* d, unsigned long mpaddr,
1277 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1279 unsigned long mfn = pte_pfn(old_pte);
1280 struct page_info* page = mfn_to_page(mfn);
1282 if (pte_pgc_allocated(old_pte)) {
1283 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1284 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1285 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1286 } else {
1287 BUG();
1290 if (likely(clear_PGC_allocate)) {
1291 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1292 BUG();
1293 /* put_page() is done by domain_page_flush_and_put() */
1294 } else {
1295 // In this case, the page reference count mustn't be touched.
1296 // domain_page_flush_and_put() decrements it, so we increment
1297 // it in advance. This path is the slow path.
1298 //
1299 // guest_remove_page(): owner = d, count_info = 1
1300 // memory_exchange(): owner = NULL, count_info = 1
1301 // XENMEM_add_to_physmap: owner = d, count_info >= 1
1302 adjust_page_count_info(page);
1305 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1308 // The caller must get_page(mfn_to_page(mfn)) before the call.
1309 // The caller must call set_gpfn_from_mfn() before the call if necessary;
1310 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1311 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1312 // flags: ASSIGN_xxx
1313 static void
1314 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1315 unsigned long mfn, unsigned long flags)
1317 struct mm_struct *mm = &d->arch.mm;
1318 volatile pte_t* pte;
1319 pte_t old_pte;
1320 pte_t npte;
1321 unsigned long prot = flags_to_prot(flags);
1323 pte = lookup_alloc_domain_pte(d, mpaddr);
1325 // update pte
1326 npte = pfn_pte(mfn, __pgprot(prot));
1327 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1328 if (pte_mem(old_pte)) {
1329 unsigned long old_mfn = pte_pfn(old_pte);
1331 // The mfn == old_mfn case can happen when a domain maps a granted page
1332 // twice at the same pseudo-physical address.
1333 // It's nonsense, but allowed.
1334 // __gnttab_map_grant_ref()
1335 // => create_host_mapping()
1336 // => assign_domain_page_replace()
1337 if (mfn != old_mfn) {
1338 domain_put_page(d, mpaddr, pte, old_pte, 1);
1341 perfc_incr(assign_domain_page_replace);
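/*
 * Illustrative caller sketch for the protocol described above (a sketch only;
 * real callers include __dom0vp_add_physmap() below and the grant-table
 * mapping path mentioned above):
 *
 *     if (get_page(mfn_to_page(mfn), owner) == 0)   // pin the page first
 *         return -EINVAL;
 *     set_gpfn_from_mfn(mfn, gpfn);                 // m2p entry, if needed
 *     smp_mb();                                     // visible before the xchg
 *     assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
 */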
1344 // The caller must get_page(new_page) beforehand.
1345 // Only steal_page() calls this function.
1346 static int
1347 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1348 struct page_info* old_page,
1349 struct page_info* new_page,
1350 unsigned long flags, int clear_PGC_allocate)
1352 struct mm_struct *mm = &d->arch.mm;
1353 volatile pte_t* pte;
1354 unsigned long old_mfn;
1355 unsigned long old_prot;
1356 pte_t old_pte;
1357 unsigned long new_mfn;
1358 unsigned long new_prot;
1359 pte_t new_pte;
1360 pte_t ret_pte;
1362 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1363 pte = lookup_alloc_domain_pte(d, mpaddr);
1365 again:
1366 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1367 old_mfn = page_to_mfn(old_page);
1368 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1369 if (!pte_present(old_pte)) {
1370 gdprintk(XENLOG_INFO,
1371 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1372 __func__, pte_val(old_pte), old_prot, old_mfn);
1373 return -EINVAL;
1376 new_prot = flags_to_prot(flags);
1377 new_mfn = page_to_mfn(new_page);
1378 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1380 // update pte
1381 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1382 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1383 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1384 goto again;
1387 gdprintk(XENLOG_INFO,
1388 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1389 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1390 __func__,
1391 pte_val(old_pte), old_prot, old_mfn,
1392 pte_val(ret_pte), pte_pfn(ret_pte));
1393 return -EINVAL;
1396 BUG_ON(!pte_mem(old_pte));
1397 BUG_ON(!pte_pgc_allocated(old_pte));
1398 BUG_ON(page_get_owner(old_page) != d);
1399 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1400 BUG_ON(old_mfn == new_mfn);
1402 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1403 if (likely(clear_PGC_allocate)) {
1404 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1405 BUG();
1406 } else {
1407 int ret;
1408 // adjust count_info for domain_page_flush_and_put().
1409 // This is the slow path.
1410 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1411 BUG_ON(d == NULL);
1412 ret = get_page(old_page, d);
1413 BUG_ON(ret == 0);
1416 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1417 perfc_incr(assign_domain_pge_cmpxchg_rel);
1418 return 0;
1421 static void
1422 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1423 int clear_PGC_allocate, unsigned long mfn)
1425 struct mm_struct *mm = &d->arch.mm;
1426 volatile pte_t *pte;
1427 pte_t old_pte;
1428 struct page_info *page;
1430 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1431 if (pte == NULL)
1432 return;
1433 if (pte_none(*pte))
1434 return;
1436 if (mfn == INVALID_MFN) {
1437 // clear pte
1438 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1439 if(!pte_mem(old_pte))
1440 return;
1441 mfn = pte_pfn(old_pte);
1442 } else {
1443 unsigned long old_arflags;
1444 pte_t new_pte;
1445 pte_t ret_pte;
1447 again:
1448 // memory_exchange() calls guest_physmap_remove_page() with
1449 // a stolen page, i.e. page owner == NULL.
1450 BUG_ON(mfn_valid(mfn) &&
1451 page_get_owner(mfn_to_page(mfn)) != d &&
1452 page_get_owner(mfn_to_page(mfn)) != NULL);
1453 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1454 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1455 new_pte = __pte(0);
1457 // update pte
1458 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1459 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1460 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1461 goto again;
1464 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1465 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1466 __func__,
1467 pte_val(old_pte), old_arflags, mfn,
1468 pte_val(ret_pte), pte_pfn(ret_pte));
1469 return;
1471 BUG_ON(mfn != pte_pfn(ret_pte));
1474 perfc_incr(zap_domain_page_one);
1475 if(!mfn_valid(mfn))
1476 return;
1478 if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ){
1479 int i, j;
1480 j = 1 << (PAGE_SHIFT-PAGE_SHIFT_4K);
1481 for(i = 0 ; i < j; i++)
1482 iommu_unmap_page(d, (mpaddr>>PAGE_SHIFT)*j + i);
1485 page = mfn_to_page(mfn);
1486 BUG_ON((page->count_info & PGC_count_mask) == 0);
1488 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1489 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1492 int
1493 deassign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1494 unsigned long phys_addr, unsigned long size )
1496 unsigned long addr = mpaddr & PAGE_MASK;
1497 unsigned long end = PAGE_ALIGN(mpaddr + size);
1499 if (size == 0) {
1500 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1501 __func__, d, mpaddr, size);
1503 if (!efi_mmio(phys_addr, size)) {
1504 #ifndef NDEBUG
1505 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1506 __func__, d, mpaddr, size);
1507 #endif
1508 return -EINVAL;
1511 for (; addr < end; addr += PAGE_SIZE )
1512 zap_domain_page_one(d, addr, 0, INVALID_MFN);
1513 return 0;
1516 unsigned long
1517 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1518 unsigned int extent_order)
1520 if (extent_order != 0) {
1521 //XXX
1522 return -ENOSYS;
1525 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1526 perfc_incr(dom0vp_zap_physmap);
1527 return 0;
1530 static unsigned long
1531 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1532 unsigned long mfn_or_gmfn,
1533 unsigned long flags, domid_t domid, int is_gmfn)
1535 int error = -EINVAL;
1536 struct domain* rd;
1537 unsigned long mfn;
1539 /* Not allowed by a domain. */
1540 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1541 return -EINVAL;
1543 rd = rcu_lock_domain_by_id(domid);
1544 if (unlikely(rd == NULL)) {
1545 switch (domid) {
1546 case DOMID_XEN:
1547 rd = dom_xen;
1548 break;
1549 case DOMID_IO:
1550 rd = dom_io;
1551 break;
1552 default:
1553 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1554 "gpfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1555 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1556 return -ESRCH;
1558 BUG_ON(rd == NULL);
1559 rcu_lock_domain(rd);
1562 if (unlikely(rd == d))
1563 goto out1;
1564 /*
1565 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1566 * It can be considered that their p2m conversion is p==m.
1567 */
1568 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1569 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1570 else
1571 mfn = mfn_or_gmfn;
1572 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1573 goto out1;
1575 error = 0;
1576 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1577 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1578 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1579 // don't update the m2p table because this page belongs to rd, not d.
1580 perfc_incr(dom0vp_add_physmap);
1581 out1:
1582 rcu_unlock_domain(rd);
1583 return error;
1586 unsigned long
1587 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1588 unsigned long flags, domid_t domid)
1590 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1593 unsigned long
1594 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1595 unsigned long gmfn, unsigned long flags,
1596 domid_t domid)
1598 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1601 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1602 #define P2M_PFN_ROUNDUP(x) (((x) + PTRS_PER_PTE - 1) & \
1603 ~(PTRS_PER_PTE - 1))
1604 #define P2M_PFN_ROUNDDOWN(x) ((x) & ~(PTRS_PER_PTE - 1))
1605 #define P2M_NUM_PFN(x) (((x) + PTRS_PER_PTE - 1) / PTRS_PER_PTE)
1606 #define MD_END(md) ((md)->phys_addr + \
1607 ((md)->num_pages << EFI_PAGE_SHIFT))
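/*
 * For example, with 16KB pages PTRS_PER_PTE is 2048 (both values assumed
 * here for illustration only), so:
 *   P2M_PFN_ROUNDUP(3000)   == 4096
 *   P2M_PFN_ROUNDDOWN(3000) == 2048
 *   P2M_NUM_PFN(3000)       == 2    pte pages to cover 3000 pfns
 */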
1608 static struct page_info* p2m_pte_zero_page = NULL;
1610 /* This must be called before dom0 p2m table allocation */
1611 void __init
1612 expose_p2m_init(void)
1614 pte_t* pte;
1616 /*
1617 * Initialise our DOMID_P2M domain.
1618 * This domain owns m2p table pages.
1619 */
1620 dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
1621 BUG_ON(dom_p2m == NULL);
1622 dom_p2m->max_pages = ~0U;
1624 pte = pte_alloc_one_kernel(NULL, 0);
1625 BUG_ON(pte == NULL);
1626 smp_mb();// make contents of the page visible.
1627 p2m_pte_zero_page = virt_to_page(pte);
1630 // allocate pgd, pmd of dest_dom if necessary
1631 static int
1632 allocate_pgd_pmd(struct domain* dest_dom, unsigned long dest_gpfn,
1633 struct domain* src_dom,
1634 unsigned long src_gpfn, unsigned long num_src_gpfn)
1636 unsigned long i = 0;
1638 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1639 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1641 while (i < num_src_gpfn) {
1642 volatile pte_t* src_pte;
1643 volatile pte_t* dest_pte;
1645 src_pte = lookup_noalloc_domain_pte(src_dom,
1646 (src_gpfn + i) << PAGE_SHIFT);
1647 if (src_pte == NULL) {
1648 i++;
1649 continue;
1652 dest_pte = lookup_alloc_domain_pte(dest_dom,
1653 (dest_gpfn << PAGE_SHIFT) +
1654 i * sizeof(pte_t));
1655 if (dest_pte == NULL) {
1656 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n",
1657 __func__);
1658 return -ENOMEM;
1661 // skip to next pte page
1662 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1664 return 0;
1667 static int
1668 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1670 int ret = get_page(page, dom_p2m);
1671 BUG_ON(ret != 1);
1672 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1673 ASSIGN_readonly);
1676 // expose pte page
1677 static int
1678 expose_p2m_range(struct domain* dest_dom, unsigned long dest_gpfn,
1679 struct domain* src_dom,
1680 unsigned long src_gpfn, unsigned long num_src_gpfn)
1682 unsigned long i = 0;
1684 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1685 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1687 while (i < num_src_gpfn) {
1688 volatile pte_t* pte;
1690 pte = lookup_noalloc_domain_pte(src_dom, (src_gpfn + i) << PAGE_SHIFT);
1691 if (pte == NULL) {
1692 i++;
1693 continue;
1696 if (expose_p2m_page(dest_dom,
1697 (dest_gpfn << PAGE_SHIFT) + i * sizeof(pte_t),
1698 virt_to_page(pte)) < 0) {
1699 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1700 return -EAGAIN;
1703 // skip to next pte page
1704 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1706 return 0;
1709 // expose p2m_pte_zero_page
1710 static int
1711 expose_zero_page(struct domain* dest_dom, unsigned long dest_gpfn,
1712 unsigned long num_src_gpfn)
1714 unsigned long i;
1716 for (i = 0; i < P2M_NUM_PFN(num_src_gpfn); i++) {
1717 volatile pte_t* pte;
1718 pte = lookup_noalloc_domain_pte(dest_dom,
1719 (dest_gpfn + i) << PAGE_SHIFT);
1720 if (pte == NULL || pte_present(*pte))
1721 continue;
1723 if (expose_p2m_page(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1724 p2m_pte_zero_page) < 0) {
1725 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n",
1726 __func__);
1727 return -EAGAIN;
1730 return 0;
1733 static int
1734 expose_p2m(struct domain* dest_dom, unsigned long dest_gpfn,
1735 struct domain* src_dom,
1736 unsigned long src_gpfn, unsigned long num_src_gpfn)
1738 if (allocate_pgd_pmd(dest_dom, dest_gpfn,
1739 src_dom, src_gpfn, num_src_gpfn))
1740 return -ENOMEM;
1742 if (expose_p2m_range(dest_dom, dest_gpfn,
1743 src_dom, src_gpfn, num_src_gpfn))
1744 return -EAGAIN;
1746 if (expose_zero_page(dest_dom, dest_gpfn, num_src_gpfn))
1747 return -EAGAIN;
1749 return 0;
1752 static void
1753 unexpose_p2m(struct domain* dest_dom,
1754 unsigned long dest_gpfn, unsigned long num_dest_gpfn)
1756 unsigned long i;
1758 for (i = 0; i < num_dest_gpfn; i++) {
1759 zap_domain_page_one(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1760 0, INVALID_MFN);
1764 // It is possible to optimize the loop, but this isn't performance critical.
1765 unsigned long
1766 dom0vp_expose_p2m(struct domain* d,
1767 unsigned long conv_start_gpfn,
1768 unsigned long assign_start_gpfn,
1769 unsigned long expose_size, unsigned long granule_pfn)
1771 unsigned long ret;
1772 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1774 if ((expose_size % PAGE_SIZE) != 0 ||
1775 (granule_pfn % PTRS_PER_PTE) != 0 ||
1776 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1777 (conv_start_gpfn % granule_pfn) != 0 ||
1778 (assign_start_gpfn % granule_pfn) != 0 ||
1779 (expose_num_pfn % granule_pfn) != 0) {
1780 gdprintk(XENLOG_INFO,
1781 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1782 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1783 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1784 return -EINVAL;
1787 if (granule_pfn != PTRS_PER_PTE) {
1788 gdprintk(XENLOG_INFO,
1789 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1790 __func__, granule_pfn, PTRS_PER_PTE);
1791 return -ENOSYS;
1793 ret = expose_p2m(d, assign_start_gpfn,
1794 d, conv_start_gpfn, expose_num_pfn);
1795 return ret;
1798 static int
1799 memmap_info_copy_from_guest(struct xen_ia64_memmap_info* memmap_info,
1800 char** memmap_p,
1801 XEN_GUEST_HANDLE(char) buffer)
1803 char *memmap;
1804 char *p;
1805 char *memmap_end;
1806 efi_memory_desc_t *md;
1807 unsigned long start;
1808 unsigned long end;
1809 efi_memory_desc_t *prev_md;
1811 if (copy_from_guest((char*)memmap_info, buffer, sizeof(*memmap_info)))
1812 return -EFAULT;
1813 if (memmap_info->efi_memdesc_size < sizeof(efi_memory_desc_t) ||
1814 memmap_info->efi_memmap_size < memmap_info->efi_memdesc_size ||
1815 (memmap_info->efi_memmap_size % memmap_info->efi_memdesc_size) != 0)
1816 return -EINVAL;
1818 memmap = _xmalloc(memmap_info->efi_memmap_size,
1819 __alignof__(efi_memory_desc_t));
1820 if (memmap == NULL)
1821 return -ENOMEM;
1822 if (copy_from_guest_offset(memmap, buffer, sizeof(*memmap_info),
1823 memmap_info->efi_memmap_size)) {
1824 xfree(memmap);
1825 return -EFAULT;
1828 /* integrity check & simplify */
1829 sort(memmap, memmap_info->efi_memmap_size / memmap_info->efi_memdesc_size,
1830 memmap_info->efi_memdesc_size, efi_mdt_cmp, NULL);
1832 /* alignment & overlap check */
1833 prev_md = NULL;
1834 p = memmap;
1835 memmap_end = memmap + memmap_info->efi_memmap_size;
1836 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1837 md = (efi_memory_desc_t*)p;
1838 start = md->phys_addr;
1840 if (start & ((1UL << EFI_PAGE_SHIFT) - 1) || md->num_pages == 0) {
1841 xfree(memmap);
1842 return -EINVAL;
1845 if (prev_md != NULL) {
1846 unsigned long prev_end = MD_END(prev_md);
1847 if (prev_end > start) {
1848 xfree(memmap);
1849 return -EINVAL;
1853 prev_md = (efi_memory_desc_t *)p;
1856 /* coalesce */
1857 prev_md = NULL;
1858 p = memmap;
1859 while (p < memmap_end) {
1860 md = (efi_memory_desc_t*)p;
1861 start = md->phys_addr;
1862 end = MD_END(md);
1864 start = P2M_PFN_ROUNDDOWN(start >> PAGE_SHIFT) << PAGE_SHIFT;
1865 end = P2M_PFN_ROUNDUP(end >> PAGE_SHIFT) << PAGE_SHIFT;
1866 md->phys_addr = start;
1867 md->num_pages = (end - start) >> EFI_PAGE_SHIFT;
1869 if (prev_md != NULL) {
1870 unsigned long prev_end = MD_END(prev_md);
1871 if (prev_end >= start) {
1872 size_t left;
1873 end = max(prev_end, end);
1874 prev_md->num_pages = (end - prev_md->phys_addr) >> EFI_PAGE_SHIFT;
1876 left = memmap_end - p;
1877 if (left > memmap_info->efi_memdesc_size) {
1878 left -= memmap_info->efi_memdesc_size;
1879 memmove(p, p + memmap_info->efi_memdesc_size, left);
1882 memmap_info->efi_memmap_size -= memmap_info->efi_memdesc_size;
1883 memmap_end -= memmap_info->efi_memdesc_size;
1884 continue;
1888 prev_md = md;
1889 p += memmap_info->efi_memdesc_size;
1892 if (copy_to_guest(buffer, (char*)memmap_info, sizeof(*memmap_info)) ||
1893 copy_to_guest_offset(buffer, sizeof(*memmap_info),
1894 (char*)memmap, memmap_info->efi_memmap_size)) {
1895 xfree(memmap);
1896 return -EFAULT;
1899 *memmap_p = memmap;
1900 return 0;
1903 static int
1904 foreign_p2m_allocate_pte(struct domain* d,
1905 const struct xen_ia64_memmap_info* memmap_info,
1906 const void* memmap)
1908 const void* memmap_end = memmap + memmap_info->efi_memmap_size;
1909 const void* p;
1911 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1912 const efi_memory_desc_t* md = p;
1913 unsigned long start = md->phys_addr;
1914 unsigned long end = MD_END(md);
1915 unsigned long gpaddr;
1917 for (gpaddr = start; gpaddr < end; gpaddr += PAGE_SIZE) {
1918 if (lookup_alloc_domain_pte(d, gpaddr) == NULL) {
1919 return -ENOMEM;
1924 return 0;
1927 struct foreign_p2m_region {
1928 unsigned long gpfn;
1929 unsigned long num_gpfn;
1930 };
1932 struct foreign_p2m_entry {
1933 struct list_head list;
1934 int busy;
1936 /* src domain */
1937 struct domain* src_dom;
1939 /* region into which foreign p2m table is mapped */
1940 unsigned long gpfn;
1941 unsigned long num_gpfn;
1942 unsigned int num_region;
1943 struct foreign_p2m_region region[0];
1944 };
1946 /* caller must increment the reference count of src_dom */
1947 static int
1948 foreign_p2m_alloc(struct foreign_p2m* foreign_p2m,
1949 unsigned long dest_gpfn, struct domain* src_dom,
1950 struct xen_ia64_memmap_info* memmap_info, void* memmap,
1951 struct foreign_p2m_entry** entryp)
1953 void* memmap_end = memmap + memmap_info->efi_memmap_size;
1954 efi_memory_desc_t* md;
1955 unsigned long dest_gpfn_end;
1956 unsigned long src_gpfn;
1957 unsigned long src_gpfn_end;
1959 unsigned int num_region;
1960 struct foreign_p2m_entry* entry;
1961 struct foreign_p2m_entry* prev;
1962 struct foreign_p2m_entry* pos;
1964 num_region = (memmap_end - memmap) / memmap_info->efi_memdesc_size;
1966 md = memmap;
1967 src_gpfn = P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
1969 md = memmap + (num_region - 1) * memmap_info->efi_memdesc_size;
1970 src_gpfn_end = MD_END(md) >> PAGE_SHIFT;
1971 if (src_gpfn_end >
1972 P2M_PFN_ROUNDUP(src_dom->arch.convmem_end >> PAGE_SHIFT))
1973 return -EINVAL;
1975 src_gpfn_end = P2M_PFN_ROUNDUP(src_gpfn_end);
1976 dest_gpfn_end = dest_gpfn + P2M_NUM_PFN(src_gpfn_end - src_gpfn);
1977 entry = _xmalloc(sizeof(*entry) + num_region * sizeof(entry->region[0]),
1978 __alignof__(*entry));
1979 if (entry == NULL)
1980 return -ENOMEM;
1982 entry->busy = 1;
1983 entry->gpfn = dest_gpfn;
1984 entry->num_gpfn = dest_gpfn_end - dest_gpfn;
1985 entry->src_dom = src_dom;
1986 entry->num_region = 0;
1987 memset(entry->region, 0, sizeof(entry->region[0]) * num_region);
1988 prev = NULL;
1990 spin_lock(&foreign_p2m->lock);
1991 if (list_empty(&foreign_p2m->head))
1992 prev = (struct foreign_p2m_entry*)&foreign_p2m->head;
1994 list_for_each_entry(pos, &foreign_p2m->head, list) {
1995 if (pos->gpfn + pos->num_gpfn < dest_gpfn) {
1996 prev = pos;
1997 continue;
2000 if (dest_gpfn_end < pos->gpfn) {
2001 if (prev != NULL && prev->gpfn + prev->num_gpfn > dest_gpfn)
2002 prev = NULL;/* overlap */
2003 break;
2006 /* overlap */
2007 prev = NULL;
2008 break;
2010 if (prev != NULL) {
2011 list_add(&entry->list, &prev->list);
2012 spin_unlock(&foreign_p2m->lock);
2013 *entryp = entry;
2014 return 0;
2016 spin_unlock(&foreign_p2m->lock);
2017 xfree(entry);
2018 return -EBUSY;
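
/*
 * Illustrative sketch (not part of mm.c): the invariant that
 * foreign_p2m_alloc() above maintains.  Entries form a list of
 * [gpfn, gpfn + num_gpfn) intervals sorted by gpfn; a new interval is
 * accepted only if it lies strictly between its would-be neighbours,
 * so touching or overlapping regions are rejected with -EBUSY.  The
 * sk_* names are local to the sketch.
 */
#include <stdint.h>

struct sk_region { uint64_t gpfn; uint64_t num_gpfn; };

/* Returns 1 when [gpfn, gpfn + num) fits between prev and next
 * (either neighbour may be NULL at the ends of the list). */
static int sk_fits_between(const struct sk_region *prev,
                           const struct sk_region *next,
                           uint64_t gpfn, uint64_t num)
{
    if (prev != NULL && prev->gpfn + prev->num_gpfn >= gpfn)
        return 0;                       /* collides with predecessor */
    if (next != NULL && gpfn + num >= next->gpfn)
        return 0;                       /* collides with successor */
    return 1;
}
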
2021 static void
2022 foreign_p2m_unexpose(struct domain* dest_dom, struct foreign_p2m_entry* entry)
2024 unsigned int i;
2026 BUG_ON(!entry->busy);
2027 for (i = 0; i < entry->num_region; i++)
2028 unexpose_p2m(dest_dom,
2029 entry->region[i].gpfn, entry->region[i].num_gpfn);
2032 static void
2033 foreign_p2m_unbusy(struct foreign_p2m* foreign_p2m,
2034 struct foreign_p2m_entry* entry)
2036 spin_lock(&foreign_p2m->lock);
2037 BUG_ON(!entry->busy);
2038 entry->busy = 0;
2039 spin_unlock(&foreign_p2m->lock);
2042 static void
2043 foreign_p2m_free(struct foreign_p2m* foreign_p2m,
2044 struct foreign_p2m_entry* entry)
2046 spin_lock(&foreign_p2m->lock);
2047 BUG_ON(!entry->busy);
2048 list_del(&entry->list);
2049 spin_unlock(&foreign_p2m->lock);
2051 put_domain(entry->src_dom);
2052 xfree(entry);
2055 void
2056 foreign_p2m_init(struct domain* d)
2058 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
2059 INIT_LIST_HEAD(&foreign_p2m->head);
2060 spin_lock_init(&foreign_p2m->lock);
2063 void
2064 foreign_p2m_destroy(struct domain* d)
2066 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
2067 struct foreign_p2m_entry* entry;
2068 struct foreign_p2m_entry* n;
2070 spin_lock(&foreign_p2m->lock);
2071 list_for_each_entry_safe(entry, n, &foreign_p2m->head, list) {
2072 /* mm_teardown() cleared p2m table already */
2073 /* foreign_p2m_unexpose(d, entry);*/
2074 list_del(&entry->list);
2075 put_domain(entry->src_dom);
2076 xfree(entry);
2078 spin_unlock(&foreign_p2m->lock);
2081 unsigned long
2082 dom0vp_expose_foreign_p2m(struct domain* dest_dom,
2083 unsigned long dest_gpfn,
2084 domid_t domid,
2085 XEN_GUEST_HANDLE(char) buffer,
2086 unsigned long flags)
2088 unsigned long ret = 0;
2089 struct domain* src_dom;
2090 struct xen_ia64_memmap_info memmap_info;
2091 char* memmap;
2092 void* memmap_end;
2093 void* p;
2095 struct foreign_p2m_entry* entry;
2097 ret = memmap_info_copy_from_guest(&memmap_info, &memmap, buffer);
2098 if (ret != 0)
2099 return ret;
2101 dest_dom = rcu_lock_domain(dest_dom);
2102 if (dest_dom == NULL) {
2103 ret = -EINVAL;
2104 goto out;
2106 #if 1
2107 // Exposing a domain's p2m to the domain itself isn't allowed;
2108 // otherwise the domain could never be destroyed because
2109 // nothing would decrement its reference count.
2110 if (domid == dest_dom->domain_id) {
2111 ret = -EINVAL;
2112 goto out;
2114 #endif
2116 src_dom = get_domain_by_id(domid);
2117 if (src_dom == NULL) {
2118 ret = -EINVAL;
2119 goto out_unlock;
2122 if (flags & IA64_DOM0VP_EFP_ALLOC_PTE) {
2123 ret = foreign_p2m_allocate_pte(src_dom, &memmap_info, memmap);
2124 if (ret != 0)
2125 goto out_unlock;
2128 ret = foreign_p2m_alloc(&dest_dom->arch.foreign_p2m, dest_gpfn,
2129 src_dom, &memmap_info, memmap, &entry);
2130 if (ret != 0)
2131 goto out_unlock;
2133 memmap_end = memmap + memmap_info.efi_memmap_size;
2134 for (p = memmap; p < memmap_end; p += memmap_info.efi_memdesc_size) {
2135 efi_memory_desc_t* md = p;
2136 unsigned long src_gpfn =
2137 P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
2138 unsigned long src_gpfn_end =
2139 P2M_PFN_ROUNDUP(MD_END(md) >> PAGE_SHIFT);
2140 unsigned long num_src_gpfn = src_gpfn_end - src_gpfn;
2142 ret = expose_p2m(dest_dom, dest_gpfn + src_gpfn / PTRS_PER_PTE,
2143 src_dom, src_gpfn, num_src_gpfn);
2144 if (ret != 0)
2145 break;
2147 entry->region[entry->num_region].gpfn =
2148 dest_gpfn + src_gpfn / PTRS_PER_PTE;
2149 entry->region[entry->num_region].num_gpfn = P2M_NUM_PFN(num_src_gpfn);
2150 entry->num_region++;
2153 if (ret == 0) {
2154 foreign_p2m_unbusy(&dest_dom->arch.foreign_p2m, entry);
2155 } else {
2156 foreign_p2m_unexpose(dest_dom, entry);
2157 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2160 out_unlock:
2161 rcu_unlock_domain(dest_dom);
2162 out:
2163 xfree(memmap);
2164 return ret;
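
/*
 * Illustrative sketch (not part of mm.c): the dest_gpfn arithmetic used by
 * dom0vp_expose_foreign_p2m() above.  One page of the source p2m table
 * holds PTRS_PER_PTE entries, i.e. it describes PTRS_PER_PTE source pfns,
 * so source pfn S is found in the p2m page exposed at
 * dest_gpfn + S / PTRS_PER_PTE.  With an example PTRS_PER_PTE of 2048
 * (16KB pages, 8-byte ptes), source pfn 0x12345 lands in p2m page
 * dest_gpfn + 36.  The sk_* name is local to the sketch.
 */
#include <stdint.h>

static uint64_t sk_p2m_page_of(uint64_t dest_gpfn, uint64_t src_pfn,
                               uint64_t ptrs_per_pte)
{
    return dest_gpfn + src_pfn / ptrs_per_pte;
}
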
2167 unsigned long
2168 dom0vp_unexpose_foreign_p2m(struct domain* dest_dom,
2169 unsigned long dest_gpfn,
2170 domid_t domid)
2172 int ret = -ENOENT;
2173 struct foreign_p2m* foreign_p2m = &dest_dom->arch.foreign_p2m;
2174 struct foreign_p2m_entry* entry;
2176 dest_dom = rcu_lock_domain(dest_dom);
2177 if (dest_dom == NULL)
2178 return ret;
2179 spin_lock(&foreign_p2m->lock);
2180 list_for_each_entry(entry, &foreign_p2m->head, list) {
2181 if (entry->gpfn < dest_gpfn)
2182 continue;
2183 if (dest_gpfn < entry->gpfn)
2184 break;
2186 if (domid == entry->src_dom->domain_id)
2187 ret = 0;
2188 else
2189 ret = -EINVAL;
2190 break;
2192 if (ret == 0) {
2193 if (entry->busy == 0)
2194 entry->busy = 1;
2195 else
2196 ret = -EBUSY;
2198 spin_unlock(&foreign_p2m->lock);
2200 if (ret == 0) {
2201 foreign_p2m_unexpose(dest_dom, entry);
2202 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2204 rcu_unlock_domain(dest_dom);
2205 return ret;
2208 /* this lock should only be for memmap_info; domain_lock() is abused here */
2209 static void
2210 memmap_lock(struct domain *d)
2212 domain_lock(d);
2215 static void
2216 memmap_unlock(struct domain *d)
2218 domain_unlock(d);
2221 /* copy memory range to domain pseudo physical address space */
2222 static int
2223 __memmap_copy_to(struct domain *d, unsigned long dest_gpfn,
2224 void *src, unsigned long num_pages)
2226 BUG_ON(((unsigned long)src & ~PAGE_MASK) != 0);
2228 while (num_pages > 0) {
2229 unsigned long mfn;
2230 struct page_info *page;
2231 void *virt;
2233 mfn = gmfn_to_mfn_foreign(d, dest_gpfn);
2234 if (mfn == 0 || mfn == INVALID_MFN)
2235 return -EFAULT;
2236 page = mfn_to_page(mfn);
2237 if (get_page(page, d) == 0)
2238 return -EFAULT;
2239 virt = mfn_to_virt(mfn);
2240 copy_page(virt, src);
2241 __xencomm_mark_dirty(d, (unsigned long)virt, PAGE_SIZE);
2242 put_page(page);
2244 src += PAGE_SIZE;
2245 dest_gpfn++;
2246 num_pages--;
2249 return 0;
2252 /* copy memory range from domain pseudo physical address space */
2253 static int
2254 __memmap_copy_from(void *dest, struct domain *d, unsigned long src_gpfn,
2255 unsigned long num_pages)
2257 BUG_ON(((unsigned long)dest & ~PAGE_MASK) != 0);
2259 while (num_pages > 0) {
2260 unsigned long mfn;
2261 struct page_info *page;
2263 mfn = gmfn_to_mfn_foreign(d, src_gpfn);
2264 if (mfn == 0 || mfn == INVALID_MFN)
2265 return -EFAULT;
2266 page = mfn_to_page(mfn);
2267 if (get_page(page, d) == 0)
2268 return -EFAULT;
2269 copy_page(dest, mfn_to_virt(mfn));
2270 put_page(page);
2272 dest += PAGE_SIZE;
2273 src_gpfn++;
2274 num_pages--;
2277 return 0;
2280 /* This function unlocks and re-locks memmap_lock.
2281  * The caller must free (*page, *order) even in the error case,
2282  * after checking that *page != NULL.
2283  */
2284 static int
2285 memmap_copy_from(struct domain *d,
2286 struct page_info **page, unsigned long *order)
2288 unsigned long num_pages;
2289 struct xen_ia64_memmap_info *memmap_info;
2290 unsigned long memmap_info_pfn;
2292 num_pages = d->shared_info->arch.memmap_info_num_pages;
2293 memmap_unlock(d);
2295 again:
2296 *order = get_order(num_pages << PAGE_SHIFT);
2297 *page = alloc_domheap_pages(NULL, *order, 0);
2298 if (*page == NULL)
2299 return -ENOMEM;
2300 memmap_info = page_to_virt(*page);
2302 memmap_lock(d);
2303 if (d->shared_info->arch.memmap_info_num_pages != num_pages) {
2304 num_pages = d->shared_info->arch.memmap_info_num_pages;
2305 memmap_unlock(d);
2306 free_domheap_pages(*page, *order);
2307 goto again;
2309 memmap_info_pfn = d->shared_info->arch.memmap_info_pfn;
2311 /* copy into a local buffer to make the pages virtually contiguous */
2312 return __memmap_copy_from(memmap_info, d, memmap_info_pfn, num_pages);
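
/*
 * Illustrative sketch (simplified, user-space analogue; not part of mm.c):
 * the allocate-then-revalidate pattern of memmap_copy_from() above.  The
 * lock is dropped around the (possibly blocking) allocation; after
 * re-acquiring it, the size is re-checked and the allocation is retried
 * if the guest grew its memmap in the meantime.  The sk_* names are local
 * to the sketch.
 */
#include <pthread.h>
#include <stdlib.h>

struct sk_state {
    pthread_mutex_t lock;
    size_t num_pages;                   /* may grow while the lock is dropped */
};

/* Called and returns with s->lock held; NULL on allocation failure. */
static void *sk_alloc_snapshot(struct sk_state *s, size_t page_size,
                               size_t *out_pages)
{
    size_t n = s->num_pages;
    void *buf;

    for (;;) {
        pthread_mutex_unlock(&s->lock);
        buf = malloc(n * page_size);
        pthread_mutex_lock(&s->lock);
        if (buf == NULL)
            return NULL;
        if (s->num_pages == n)
            break;                      /* snapshot size is still valid */
        free(buf);
        n = s->num_pages;               /* grew meanwhile: retry */
    }
    *out_pages = n;
    return buf;
}
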
2315 static int
2316 memdesc_can_expand(const struct xen_ia64_memmap_info *memmap_info,
2317 unsigned long num_pages)
2319 /* Is there room for one more md? */
2320 if ((num_pages << PAGE_SHIFT) <
2321 (sizeof(*memmap_info) + memmap_info->efi_memmap_size +
2322 memmap_info->efi_memdesc_size))
2323 return 0;
2325 return 1;
2328 static int
2329 memdesc_can_collapse(const efi_memory_desc_t *lhs,
2330 const efi_memory_desc_t *rhs)
2332 return (lhs->type == rhs->type && lhs->attribute == rhs->attribute);
2335 static int
2336 __dom0vp_add_memdesc_one(struct xen_ia64_memmap_info *memmap_info,
2337 unsigned long num_pages,
2338 const efi_memory_desc_t *md)
2340 void* const memmap_end = (void*)memmap_info->memdesc +
2341 memmap_info->efi_memmap_size;
2342 void *p;
2343 efi_memory_desc_t *tmp_md;
2344 efi_memory_desc_t *s_md;
2345 efi_memory_desc_t *e_md;
2346 u64 phys_addr;
2347 u64 phys_addr_end;
2349 /* fast path. appending to the last entry */
2350 tmp_md = (efi_memory_desc_t*)(memmap_end - memmap_info->efi_memdesc_size);
2351 if (MD_END(tmp_md) < md->phys_addr) {
2352 /* append one */
2353 if (!memdesc_can_expand(memmap_info, num_pages))
2354 return -ENOMEM;
2356 memcpy(memmap_end, md, memmap_info->efi_memdesc_size);
2357 memmap_info->efi_memmap_size += memmap_info->efi_memdesc_size;
2358 return 0;
2360 /* fast path. expand the last entry */
2361 if (tmp_md->phys_addr <= md->phys_addr) {
2362 if (!memdesc_can_collapse(tmp_md, md))
2363 return -EINVAL;
2365 phys_addr_end = max(MD_END(tmp_md), MD_END(md));
2366 tmp_md->num_pages =
2367 (phys_addr_end - tmp_md->phys_addr) >> EFI_PAGE_SHIFT;
2368 return 0;
2371 /* slow path */
2372 s_md = NULL;
2373 e_md = NULL;
2374 for (p = memmap_info->memdesc;
2375 p < memmap_end;
2376 p += memmap_info->efi_memdesc_size) {
2377 tmp_md = p;
2379 if (MD_END(tmp_md) < md->phys_addr)
2380 continue;
2382 if (MD_END(md) < tmp_md->phys_addr) {
2383 if (s_md == NULL) {
2384 void *next_md = p + memmap_info->efi_memdesc_size;
2385 size_t left_size = memmap_end - (void*)tmp_md;
2387 /* found hole. just insert md here*/
2388 if (!memdesc_can_expand(memmap_info, num_pages))
2389 return -ENOMEM;
2391 memmove(next_md, tmp_md, left_size);
2392 memcpy(tmp_md, md, memmap_info->efi_memdesc_size);
2393 memmap_info->efi_memmap_size += memmap_info->efi_memdesc_size;
2394 return 0;
2396 break;
2399 if (s_md == NULL)
2400 s_md = tmp_md;
2401 e_md = tmp_md;
2403 if (!memdesc_can_collapse(tmp_md, md))
2404 return -EINVAL;
2406 BUG_ON(s_md == NULL || e_md == NULL);
2408 /* collapse into one */
2409 phys_addr = min(md->phys_addr, s_md->phys_addr);
2410 phys_addr_end = max(MD_END(md), MD_END(e_md));
2411 s_md->phys_addr = phys_addr;
2412 s_md->num_pages = (phys_addr_end - phys_addr) >> EFI_PAGE_SHIFT;
2413 if (s_md != e_md) {
2414 void *next_s_md = (void*)s_md + memmap_info->efi_memdesc_size;
2415 void *next_e_md = (void*)e_md + memmap_info->efi_memdesc_size;
2416 size_t left_size = memmap_end - (void*)next_e_md;
2418 memmap_info->efi_memmap_size -= (void*)e_md - (void*)s_md;
2419 if (left_size > 0)
2420 memmove(next_s_md, next_e_md, left_size);
2423 return 0;
2426 /*
2427 * d->arch.convmem_end is mostly read only and sometimes increased.
2428  * It is protected by memmap_lock.
2430  * d->arch.convmem_end is also referenced by the guest (self p2m exposure).
2431  * d->shared_info.arch.memmap_info_xxx and memmap_info are
2432  * referenced by the tools stack (save/dump-core/foreign p2m exposure).
2434  * reader side (a sketch of this loop follows __dom0vp_add_memdesc() below):
2435 * - get d->arch.convmem_end (via XENMEM_maximum_gpfn)
2436 * - issue get_memmap hypercall to get memmap
2437 * In VMM
2438 * - lock memmap_lock
2439 * - copy memmap from target guest
2440 * - unlock memmap_lock
2441 * - copy memmap into tools stack address space.
2442 * - check d->shared_info.memmap_info_num_pages. try again if necessary
2443 * - get d->arch.convmem_end. try again if changed.
2445 * writer side:
2446 * - lock memmap_lock
2447 * - increase d->arch.convmem_end at first if necessary
2448 * - unlock memmap_lock
2449 * - allocate memory
2450 * In fact page allocation isn't blocking, so unlock/lock isn't necessary.
2451 * - lock memmap_lock
2452 * - update memmap_info
2453 * - unlock memmap_lock
2454 */
2455 static int
2456 __dom0vp_add_memdesc(struct domain *targ_d,
2457 const struct xen_ia64_memmap_info *u_memmap_info,
2458 const char *u_memmap)
2460 int ret = 0;
2461 const void* const u_memmap_end = u_memmap + u_memmap_info->efi_memmap_size;
2462 const efi_memory_desc_t *md;
2464 unsigned long md_end_max;
2465 unsigned long num_pages;
2466 unsigned long order;
2467 unsigned long memmap_info_pfn;
2469 struct page_info *page = NULL;
2470 struct xen_ia64_memmap_info *memmap_info;
2471 size_t unused_size;
2473 const void *p;
2475 /* update d->arch.convmem_end */
2476 md_end_max = 0;
2477 for (p = u_memmap; p < u_memmap_end;
2478 p += u_memmap_info->efi_memdesc_size) {
2479 md = p;
2480 if (MD_END(md) > md_end_max)
2481 md_end_max = MD_END(md);
2483 memmap_lock(targ_d);
2484 /* convmem_end is also protected by memmap_lock */
2485 if (md_end_max > targ_d->arch.convmem_end)
2486 targ_d->arch.convmem_end = md_end_max;
2488 /* memmap_copy_from() unlocks and re-locks memmap_lock */
2489 ret = memmap_copy_from(targ_d, &page, &order);
2490 if (ret != 0)
2491 goto out;
2492 memmap_info = page_to_virt(page);
2493 num_pages = targ_d->shared_info->arch.memmap_info_num_pages;
2494 memmap_info_pfn = targ_d->shared_info->arch.memmap_info_pfn;
2496 if (memmap_info->efi_memdesc_size != u_memmap_info->efi_memdesc_size ||
2497 memmap_info->efi_memdesc_version !=
2498 u_memmap_info->efi_memdesc_version) {
2499 ret = -EINVAL;
2500 goto out;
2503 /* update memdesc */
2504 for (p = u_memmap;
2505 p < u_memmap_end;
2506 p += u_memmap_info->efi_memdesc_size) {
2507 md = p;
2508 ret = __dom0vp_add_memdesc_one(memmap_info, num_pages, md);
2509 if (ret != 0)
2510 goto out;
2513 /* zero out the unused region to avoid leaking hypervisor data */
2514 unused_size = (num_pages << PAGE_SHIFT) -
2515 (sizeof(*memmap_info) + memmap_info->efi_memmap_size);
2516 if (unused_size > 0)
2517 memset((void*)memmap_info->memdesc + memmap_info->efi_memmap_size,
2518 0, unused_size);
2520 /* copy back into domain. */
2521 ret = __memmap_copy_to(targ_d, memmap_info_pfn, memmap_info, num_pages);
2523 out:
2524 memmap_unlock(targ_d);
2526 if (page != NULL)
2527 free_domheap_pages(page, order);
2528 return ret;
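
/*
 * Illustrative sketch (not part of mm.c): the reader-side retry loop
 * described in the comment above __dom0vp_add_memdesc().  sk_get_max_gpfn()
 * and sk_get_memmap() are hypothetical stand-ins for the XENMEM_maximum_gpfn
 * query and the get-memmap hypercall issued by the tools stack; the VMM
 * copies the memmap under memmap_lock, and the reader retries whenever
 * convmem_end changed while it was reading.
 */
#include <stdint.h>
#include <stddef.h>

uint64_t sk_get_max_gpfn(int domid);                  /* hypothetical helper */
int sk_get_memmap(int domid, void *buf, size_t len);  /* hypothetical helper */

static int sk_read_memmap_consistent(int domid, void *buf, size_t len)
{
    uint64_t before, after;
    int ret;

    do {
        before = sk_get_max_gpfn(domid);
        ret = sk_get_memmap(domid, buf, len);
        if (ret != 0)
            return ret;
        after = sk_get_max_gpfn(domid);
    } while (before != after);          /* convmem_end changed: read again */

    return 0;
}
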
2531 unsigned long
2532 dom0vp_get_memmap(domid_t domid, XEN_GUEST_HANDLE(char) buffer)
2534 unsigned long ret = 0;
2535 struct domain *targ_d;
2537 struct page_info *page = NULL;
2538 unsigned long order;
2540 struct xen_ia64_memmap_info *memmap_info;
2541 unsigned long num_pages;
2543 ret = rcu_lock_target_domain_by_id(domid, &targ_d);
2544 if (ret != 0)
2545 return ret;
2547 memmap_lock(targ_d);
2549 ret = memmap_copy_from(targ_d, &page, &order);
2550 if (ret != 0)
2551 goto unlock_out;
2553 memmap_info = page_to_virt(page);
2554 num_pages = targ_d->shared_info->arch.memmap_info_num_pages;
2555 if ((num_pages << PAGE_SHIFT) - sizeof(*memmap_info) <
2556 memmap_info->efi_memmap_size) {
2557 ret = -EFAULT;
2558 goto unlock_out;
2560 memmap_unlock(targ_d);
2561 rcu_unlock_domain(targ_d);
2563 if (copy_to_guest(buffer, (char*)memmap_info, sizeof(*memmap_info)) ||
2564 copy_to_guest_offset(buffer, sizeof(*memmap_info),
2565 (char*)memmap_info->memdesc,
2566 memmap_info->efi_memmap_size))
2567 ret = -EFAULT;
2569 out:
2570 if (page != NULL)
2571 free_domheap_pages(page, order);
2572 return ret;
2574 unlock_out:
2575 memmap_unlock(targ_d);
2576 rcu_unlock_domain(targ_d);
2577 goto out;
2579 #endif
2581 // grant table host mapping
2582 // mpaddr: host_addr: pseudo physical address
2583 // mfn: frame: machine page frame
2584 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
2585 int
2586 create_grant_host_mapping(unsigned long gpaddr, unsigned long mfn,
2587 unsigned int flags, unsigned int cache_flags)
2589 struct domain* d = current->domain;
2590 struct page_info* page;
2591 int ret;
2593 if ((flags & (GNTMAP_device_map |
2594 GNTMAP_application_map | GNTMAP_contains_pte)) ||
2595 (cache_flags)) {
2596 gdprintk(XENLOG_INFO, "%s: flags 0x%x cache_flags 0x%x\n",
2597 __func__, flags, cache_flags);
2598 return GNTST_general_error;
2601 BUG_ON(!mfn_valid(mfn));
2602 page = mfn_to_page(mfn);
2603 ret = get_page(page, page_get_owner(page));
2604 BUG_ON(ret == 0);
2605 assign_domain_page_replace(d, gpaddr, mfn,
2606 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2607 ASSIGN_tlb_track |
2608 #endif
2609 ((flags & GNTMAP_readonly) ?
2610 ASSIGN_readonly : ASSIGN_writable));
2611 perfc_incr(create_grant_host_mapping);
2612 return GNTST_okay;
2615 // grant table host unmapping
2616 int
2617 replace_grant_host_mapping(unsigned long gpaddr,
2618 unsigned long mfn, unsigned long new_gpaddr, unsigned int flags)
2620 struct domain* d = current->domain;
2621 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
2622 volatile pte_t* pte;
2623 unsigned long cur_arflags;
2624 pte_t cur_pte;
2625 pte_t new_pte = __pte(0);
2626 pte_t old_pte;
2627 struct page_info* page = mfn_to_page(mfn);
2628 struct page_info* new_page = NULL;
2629 volatile pte_t* new_page_pte = NULL;
2630 unsigned long new_page_mfn = INVALID_MFN;
2632 if (new_gpaddr) {
2633 new_page_pte = lookup_noalloc_domain_pte_none(d, new_gpaddr);
2634 if (likely(new_page_pte != NULL)) {
2635 new_pte = ptep_get_and_clear(&d->arch.mm,
2636 new_gpaddr, new_page_pte);
2637 if (likely(pte_present(new_pte))) {
2638 struct domain* page_owner;
2640 new_page_mfn = pte_pfn(new_pte);
2641 new_page = mfn_to_page(new_page_mfn);
2642 page_owner = page_get_owner(new_page);
2643 if (unlikely(page_owner == NULL)) {
2644 gdprintk(XENLOG_INFO,
2645 "%s: page_owner == NULL "
2646 "gpaddr 0x%lx mfn 0x%lx "
2647 "new_gpaddr 0x%lx mfn 0x%lx\n",
2648 __func__, gpaddr, mfn, new_gpaddr, new_page_mfn);
2649 new_page = NULL; /* prevent domain_put_page() */
2650 return GNTST_general_error;
2653 /*
2654  * domain_put_page(clear_PGC_allocated = 0)
2655  * doesn't decrement the refcount of a page with
2656  * pte_pgc_allocated() = 1. Be careful.
2657 */
2658 if (unlikely(!pte_pgc_allocated(new_pte))) {
2659 /* domain_put_page() decrements page refcount. adjust it. */
2660 if (unlikely(!get_page(new_page, page_owner))) {
2661 gdprintk(XENLOG_INFO,
2662 "%s: get_page() failed. "
2663 "gpaddr 0x%lx mfn 0x%lx "
2664 "new_gpaddr 0x%lx mfn 0x%lx\n",
2665 __func__, gpaddr, mfn,
2666 new_gpaddr, new_page_mfn);
2667 return GNTST_general_error;
2670 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 0);
2671 } else
2672 new_pte = __pte(0);
2676 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
2677 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
2678 return GNTST_general_error;
2681 pte = lookup_noalloc_domain_pte(d, gpaddr);
2682 if (pte == NULL) {
2683 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
2684 __func__, gpaddr, mfn);
2685 return GNTST_general_error;
2688 again:
2689 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
2690 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
2691 if (!pte_present(cur_pte) ||
2692 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
2693 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
2694 __func__, gpaddr, mfn, pte_val(cur_pte));
2695 return GNTST_general_error;
2698 if (new_page) {
2699 BUG_ON(new_page_mfn == INVALID_MFN);
2700 set_gpfn_from_mfn(new_page_mfn, gpfn);
2701 /* smp_mb() isn't needed because ptep_cmpxchg_rel()
2702    has release semantics. */
2704 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
2705 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
2706 if (pte_pfn(old_pte) == mfn) {
2707 goto again;
2709 if (new_page) {
2710 BUG_ON(new_page_mfn == INVALID_MFN);
2711 set_gpfn_from_mfn(new_page_mfn, INVALID_M2P_ENTRY);
2712 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 1);
2714 goto out;
2716 if (unlikely(!pte_present(old_pte)))
2717 goto out;
2718 BUG_ON(pte_pfn(old_pte) != mfn);
2720 /* try_to_clear_PGC_allocate(d, page) is not needed. */
2721 BUG_ON(page_get_owner(page) == d &&
2722 get_gpfn_from_mfn(mfn) == gpfn);
2723 BUG_ON(pte_pgc_allocated(old_pte));
2724 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
2726 perfc_incr(replace_grant_host_mapping);
2727 return GNTST_okay;
2729 out:
2730 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
2731 "0x%lx old_pte 0x%lx\n",
2732 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
2733 return GNTST_general_error;
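
/*
 * Illustrative sketch (simplified, user-space; not part of mm.c): the
 * compare-and-exchange retry used around ptep_cmpxchg_rel() in
 * replace_grant_host_mapping() above.  The expected pte is rebuilt from
 * the current access-rights bits, and the update is retried only while
 * the entry still points at the expected frame.  C11 atomics stand in
 * for the ia64 cmpxchg; the sk_* names and the pfn mask are example
 * values local to the sketch.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>

#define SK_PFN_MASK 0x0003fffffffff000ULL   /* example pte layout only */

static bool sk_replace_pte(_Atomic uint64_t *pte, uint64_t old_pfn_bits,
                           uint64_t new_val)
{
    for (;;) {
        uint64_t cur = atomic_load(pte);
        uint64_t expected = (cur & ~SK_PFN_MASK) | old_pfn_bits;

        /* release ordering mirrors ptep_cmpxchg_rel() */
        if (atomic_compare_exchange_strong_explicit(pte, &expected, new_val,
                                                    memory_order_release,
                                                    memory_order_relaxed))
            return true;
        if ((expected & SK_PFN_MASK) != old_pfn_bits)
            return false;               /* frame was replaced: give up */
        /* only the flag bits changed: rebuild expected and retry */
    }
}
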
2736 // heavily depends on the struct page layout.
2737 // gnttab_transfer() calls steal_page() with memflags = 0
2738 // For grant table transfer, we must fill the page.
2739 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount
2740 // For memory exchange, we don't have to fill the page because
2741 // memory_exchange() does it.
2742 int
2743 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
2745 #if 0 /* if big endian */
2746 # error "implement big endian version of steal_page()"
2747 #endif
2748 u32 _d, _nd;
2749 u64 x, nx, y;
2751 if (page_get_owner(page) != d) {
2752 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
2753 __func__, d, page_get_owner(page));
2754 return -1;
2757 if (!(memflags & MEMF_no_refcount)) {
2758 unsigned long gpfn;
2759 struct page_info *new;
2760 unsigned long new_mfn;
2761 int ret;
2763 new = alloc_domheap_page(d, 0);
2764 if (new == NULL) {
2765 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
2766 return -1;
2768 // zero out pages for security reasons
2769 clear_page(page_to_virt(new));
2770 // assign_domain_page_cmpxchg_rel() has release semantics
2771 // so smp_mb() isn't needed.
2773 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
2774 if (gpfn == INVALID_M2P_ENTRY) {
2775 free_domheap_page(new);
2776 return -1;
2778 new_mfn = page_to_mfn(new);
2779 set_gpfn_from_mfn(new_mfn, gpfn);
2780 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
2781 // has release semantics.
2783 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
2784 ASSIGN_writable |
2785 ASSIGN_pgc_allocated, 0);
2786 if (ret < 0) {
2787 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
2788 ret);
2789 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
2790 free_domheap_page(new);
2791 return -1;
2793 perfc_incr(steal_page_refcount);
2796 spin_lock(&d->page_alloc_lock);
2798 /*
2799 * The tricky bit: atomically release ownership while there is just one
2800 * benign reference to the page (PGC_allocated). If that reference
2801 * disappears then the deallocation routine will safely spin.
2802 */
2803 _d = pickle_domptr(d);
2804 y = *((u64*)&page->count_info);
2805 do {
2806 x = y;
2807 nx = x & 0xffffffff;
2808 // page->count_info: untouched
2809 // page->u.inused._domain = 0;
2810 _nd = x >> 32;
2812 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
2813 (1 | PGC_allocated))) ||
2814 unlikely(_nd != _d)) {
2815 struct domain* nd = unpickle_domptr(_nd);
2816 if (nd == NULL) {
2817 gdprintk(XENLOG_INFO, "gnttab_transfer: "
2818 "Bad page %p: ed=%p(%u) 0x%x, "
2819 "sd=%p 0x%x,"
2820 " caf=%016lx, taf=%" PRtype_info
2821 " memflags 0x%x\n",
2822 (void *) page_to_mfn(page),
2823 d, d->domain_id, _d,
2824 nd, _nd,
2825 x,
2826 page->u.inuse.type_info,
2827 memflags);
2828 } else {
2829 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
2830 "Bad page %p: ed=%p(%u) 0x%x, "
2831 "sd=%p(%u) 0x%x,"
2832 " caf=%016lx, taf=%" PRtype_info
2833 " memflags 0x%x\n",
2834 (void *) page_to_mfn(page),
2835 d, d->domain_id, _d,
2836 nd, nd->domain_id, _nd,
2837 x,
2838 page->u.inuse.type_info,
2839 memflags);
2841 spin_unlock(&d->page_alloc_lock);
2842 return -1;
2845 y = cmpxchg((u64*)&page->count_info, x, nx);
2846 } while (unlikely(y != x));
2848 /*
2849 * Unlink from 'd'. At least one reference remains (now anonymous), so
2850  * no one else is spinning to try to delete this page from 'd'.
2851 */
2852 if ( !(memflags & MEMF_no_refcount) )
2853 d->tot_pages--;
2854 list_del(&page->list);
2856 spin_unlock(&d->page_alloc_lock);
2857 perfc_incr(steal_page);
2858 return 0;
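
/*
 * Illustrative sketch (simplified, user-space; not part of mm.c): the
 * "tricky bit" of steal_page() above.  The reference count and the pickled
 * owner are packed into one 64-bit word (count in the low half, owner in
 * the high half, as in the little-endian layout the code assumes), so
 * ownership can be dropped atomically only while exactly one allocated
 * reference is held.  The sk_* names and bit values are examples local to
 * the sketch.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>

#define SK_COUNT_MASK 0x00ffffffu           /* example values only */
#define SK_ALLOCATED  0x80000000u

static bool sk_release_ownership(_Atomic uint64_t *packed, uint32_t owner)
{
    uint64_t x = atomic_load(packed);

    for (;;) {
        uint32_t count_info = (uint32_t)x;
        uint32_t cur_owner  = (uint32_t)(x >> 32);

        /* require our ownership and exactly one PGC_allocated-style ref */
        if (cur_owner != owner ||
            (count_info & (SK_COUNT_MASK | SK_ALLOCATED)) !=
            (1 | SK_ALLOCATED))
            return false;

        /* keep the count (low half), clear the owner (high half) */
        if (atomic_compare_exchange_strong(packed, &x, x & 0xffffffffu))
            return true;
        /* x was refreshed by the failed exchange: re-check and retry */
    }
}
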
2861 static void
2862 __guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2863 unsigned long mfn)
2865 set_gpfn_from_mfn(mfn, gpfn);
2866 smp_mb();
2867 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
2868 ASSIGN_writable | ASSIGN_pgc_allocated);
2869 if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ){
2870 int i, j;
2871 j = 1 << (PAGE_SHIFT-PAGE_SHIFT_4K);
2872 for(i = 0 ; i < j; i++)
2873 iommu_map_page(d, gpfn*j + i, mfn*j + i);
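
/*
 * Illustrative sketch (not part of mm.c): the IOMMU granularity conversion
 * in __guest_physmap_add_page() above.  The IOMMU maps 4KB frames, so one
 * Xen page is entered as 1 << (PAGE_SHIFT - PAGE_SHIFT_4K) consecutive 4KB
 * mappings; e.g. with a PAGE_SHIFT of 14 (16KB pages, an example value),
 * guest frame gpfn becomes IOMMU frames gpfn*4 .. gpfn*4+3.  The sk_* names
 * are local to the sketch.
 */
#include <stdint.h>

static void sk_map_xen_page_as_4k(uint64_t gpfn, uint64_t mfn,
                                  unsigned int page_shift,
                                  void (*map_4k)(uint64_t g, uint64_t m))
{
    unsigned int i, n = 1u << (page_shift - 12);    /* 4KB sub-frames per page */

    for (i = 0; i < n; i++)
        map_4k(gpfn * n + i, mfn * n + i);
}
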
2877 int
2878 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2879 unsigned long mfn, unsigned int page_order)
2881 unsigned long i;
2883 for (i = 0; i < (1UL << page_order); i++) {
2884 BUG_ON(!mfn_valid(mfn));
2885 BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
2886 __guest_physmap_add_page(d, gpfn, mfn);
2887 mfn++;
2888 gpfn++;
2891 perfc_incr(guest_physmap_add_page);
2892 return 0;
2895 void
2896 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
2897 unsigned long mfn, unsigned int page_order)
2899 unsigned long i;
2901 BUG_ON(mfn == 0); //XXX
2903 for (i = 0; i < (1UL << page_order); i++)
2904 zap_domain_page_one(d, (gpfn+i) << PAGE_SHIFT, 0, mfn+i);
2906 perfc_incr(guest_physmap_remove_page);
2909 static void
2910 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
2911 volatile pte_t* ptep, pte_t old_pte,
2912 struct page_info* page)
2914 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2915 struct tlb_track_entry* entry;
2916 #endif
2918 if (shadow_mode_enabled(d))
2919 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
2921 #ifndef CONFIG_XEN_IA64_TLB_TRACK
2922 //XXX sledgehammer.
2923 // flush finer range.
2924 domain_flush_vtlb_all(d);
2925 put_page(page);
2926 #else
2927 switch (tlb_track_search_and_remove(d->arch.tlb_track,
2928 ptep, old_pte, &entry)) {
2929 case TLB_TRACK_NOT_TRACKED:
2930 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
2931 /* This page is zapped from this domain
2932 * by memory decrease or exchange or dom0vp_zap_physmap.
2933 * I.e. the page is zapped for returning this page to xen
2934 * (balloon driver or DMA page allocation) or
2935 * foreign domain mapped page is unmapped from the domain.
2936  * In the former case the page is to be freed, so
2937  * freeing can be deferred and batched.
2938  * In the latter case the page is unmapped, so
2939  * it needs to be flushed. To optimize this, the
2940  * page is queued and the vTLB is flushed only once.
2941  * I.e. the caller must call dfree_flush() explicitly.
2942 */
2943 domain_flush_vtlb_all(d);
2944 put_page(page);
2945 break;
2946 case TLB_TRACK_NOT_FOUND:
2947 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
2948 /* This page is zapped from this domain
2949 * by grant table page unmap.
2950 * Luckily the domain that mapped this page didn't
2951  * access it, so we don't have to flush the vTLB.
2952  * Probably the domain only did DMA.
2953 */
2954 /* do nothing */
2955 put_page(page);
2956 break;
2957 case TLB_TRACK_FOUND:
2958 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
2959 /* This page is zapped from this domain
2960 * by grant table page unmap.
2961  * Fortunately this page is accessed via only one virtual
2962  * memory address, so it is easy to flush.
2963 */
2964 domain_flush_vtlb_track_entry(d, entry);
2965 tlb_track_free_entry(d->arch.tlb_track, entry);
2966 put_page(page);
2967 break;
2968 case TLB_TRACK_MANY:
2969 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
2970 /* This page is zapped from this domain
2971 * by grant table page unmap.
2972  * Unfortunately this page is accessed via many virtual
2973  * memory addresses (or too many times via a single virtual address),
2974  * so we gave up tracking the virtual addresses.
2975  * A full vTLB flush is necessary.
2976 */
2977 domain_flush_vtlb_all(d);
2978 put_page(page);
2979 break;
2980 case TLB_TRACK_AGAIN:
2981 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
2982 BUG();
2983 break;
2985 #endif
2986 perfc_incr(domain_page_flush_and_put);
2989 int
2990 domain_page_mapped(struct domain* d, unsigned long mpaddr)
2992 volatile pte_t * pte;
2994 pte = lookup_noalloc_domain_pte(d, mpaddr);
2995 if(pte != NULL && !pte_none(*pte))
2996 return 1;
2997 return 0;
3000 /* Flush cache of domain d. */
3001 void domain_cache_flush (struct domain *d, int sync_only)
3003 struct mm_struct *mm = &d->arch.mm;
3004 volatile pgd_t *pgd = mm->pgd;
3005 unsigned long maddr;
3006 int i,j,k, l;
3007 int nbr_page = 0;
3008 void (*flush_func)(unsigned long start, unsigned long end);
3009 extern void flush_dcache_range (unsigned long, unsigned long);
3011 if (sync_only)
3012 flush_func = &flush_icache_range;
3013 else
3014 flush_func = &flush_dcache_range;
3016 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
3017 volatile pud_t *pud;
3018 if (!pgd_present(*pgd)) // acquire semantics
3019 continue;
3020 pud = pud_offset(pgd, 0);
3021 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
3022 volatile pmd_t *pmd;
3023 if (!pud_present(*pud)) // acquire semantics
3024 continue;
3025 pmd = pmd_offset(pud, 0);
3026 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
3027 volatile pte_t *pte;
3028 if (!pmd_present(*pmd)) // acquire semantics
3029 continue;
3030 pte = pte_offset_map(pmd, 0);
3031 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
3032 if (!pte_present(*pte)) // acquire semantics
3033 continue;
3034 /* Convert PTE to maddr. */
3035 maddr = __va_ul (pte_val(*pte)
3036 & _PAGE_PPN_MASK);
3037 (*flush_func)(maddr, maddr+ PAGE_SIZE);
3038 nbr_page++;
3043 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
3046 #ifdef VERBOSE
3047 #define MEM_LOG(_f, _a...) \
3048 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
3049 current->domain->domain_id , __LINE__ , ## _a )
3050 #else
3051 #define MEM_LOG(_f, _a...) ((void)0)
3052 #endif
3054 static void free_page_type(struct page_info *page, u32 type)
3058 static int alloc_page_type(struct page_info *page, u32 type)
3060 return 1;
3063 static int opt_p2m_xenheap;
3064 boolean_param("p2m_xenheap", opt_p2m_xenheap);
3066 void *pgtable_quicklist_alloc(void)
3068 void *p;
3070 BUG_ON(dom_p2m == NULL);
3071 if (!opt_p2m_xenheap) {
3072 struct page_info *page = alloc_domheap_page(dom_p2m, 0);
3073 if (page == NULL)
3074 return NULL;
3075 p = page_to_virt(page);
3076 clear_page(p);
3077 return p;
3079 p = alloc_xenheap_page();
3080 if (p) {
3081 clear_page(p);
3082 /*
3083 * This page should be read only. At this moment, the third
3084 * argument doesn't make sense. It should be 1 when supported.
3085 */
3086 share_xen_page_with_guest(virt_to_page(p), dom_p2m, 0);
3088 return p;
3091 void pgtable_quicklist_free(void *pgtable_entry)
3093 struct page_info* page = virt_to_page(pgtable_entry);
3095 BUG_ON(page_get_owner(page) != dom_p2m);
3096 BUG_ON(page->count_info != (1 | PGC_allocated));
3098 put_page(page);
3099 if (opt_p2m_xenheap)
3100 free_xenheap_page(pgtable_entry);
3103 void put_page_type(struct page_info *page)
3105 u64 nx, x, y = page->u.inuse.type_info;
3107 again:
3108 do {
3109 x = y;
3110 nx = x - 1;
3112 ASSERT((x & PGT_count_mask) != 0);
3114 /*
3115 * The page should always be validated while a reference is held. The
3116 * exception is during domain destruction, when we forcibly invalidate
3117 * page-table pages if we detect a referential loop.
3118 * See domain.c:relinquish_list().
3119 */
3120 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
3122 if ( unlikely((nx & PGT_count_mask) == 0) )
3124 /* Record TLB information for flush later. Races are harmless. */
3125 page->tlbflush_timestamp = tlbflush_current_time();
3127 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
3128 likely(nx & PGT_validated) )
3130 /*
3131 * Page-table pages must be unvalidated when count is zero. The
3132 * 'free' is safe because the refcnt is non-zero and validated
3133 * bit is clear => other ops will spin or fail.
3134 */
3135 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
3136 x & ~PGT_validated)) != x) )
3137 goto again;
3138 /* We cleared the 'valid bit' so we do the clean up. */
3139 free_page_type(page, x);
3140 /* Carry on, but with the 'valid bit' now clear. */
3141 x &= ~PGT_validated;
3142 nx &= ~PGT_validated;
3146 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
3150 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
3152 struct page_info *page = mfn_to_page(page_nr);
3154 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
3156 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
3157 return 0;
3160 return 1;
3164 int get_page_type(struct page_info *page, u32 type)
3166 u64 nx, x, y = page->u.inuse.type_info;
3168 ASSERT(!(type & ~PGT_type_mask));
3170 again:
3171 do {
3172 x = y;
3173 nx = x + 1;
3174 if ( unlikely((nx & PGT_count_mask) == 0) )
3176 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
3177 return 0;
3179 else if ( unlikely((x & PGT_count_mask) == 0) )
3181 if ( (x & PGT_type_mask) != type )
3183 /*
3184 * On type change we check to flush stale TLB entries. This
3185 * may be unnecessary (e.g., page was GDT/LDT) but those
3186 * circumstances should be very rare.
3187 */
3188 cpumask_t mask =
3189 page_get_owner(page)->domain_dirty_cpumask;
3190 tlbflush_filter(mask, page->tlbflush_timestamp);
3192 if ( unlikely(!cpus_empty(mask)) )
3194 perfc_incr(need_flush_tlb_flush);
3195 flush_tlb_mask(mask);
3198 /* We lose existing type, back pointer, and validity. */
3199 nx &= ~(PGT_type_mask | PGT_validated);
3200 nx |= type;
3202 /* No special validation needed for writable pages. */
3203 /* Page tables and GDT/LDT need to be scanned for validity. */
3204 if ( type == PGT_writable_page )
3205 nx |= PGT_validated;
3208 else if ( unlikely((x & PGT_type_mask) != type) )
3210 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
3211 (type != PGT_l1_page_table) )
3212 MEM_LOG("Bad type (saw %08lx != exp %08x) "
3213 "for mfn %016lx (pfn %016lx)",
3214 x, type, page_to_mfn(page),
3215 get_gpfn_from_mfn(page_to_mfn(page)));
3216 return 0;
3218 else if ( unlikely(!(x & PGT_validated)) )
3220 /* Someone else is updating validation of this page. Wait... */
3221 while ( (y = page->u.inuse.type_info) == x )
3222 cpu_relax();
3223 goto again;
3226 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
3228 if ( unlikely(!(nx & PGT_validated)) )
3230 /* Try to validate page type; drop the new reference on failure. */
3231 if ( unlikely(!alloc_page_type(page, type)) )
3233 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
3234 ": caf=%08x taf=%" PRtype_info,
3235 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
3236 type, page->count_info, page->u.inuse.type_info);
3237 /* No one else can get a reference. We hold the only ref. */
3238 page->u.inuse.type_info = 0;
3239 return 0;
3242 /* No one else is updating simultaneously. */
3243 __set_bit(_PGT_validated, &page->u.inuse.type_info);
3246 return 1;
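
/*
 * Illustrative sketch (simplified, user-space; not part of mm.c): the
 * acquire/release pairing of the type-count updates above.  Taking a type
 * reference uses an acquire cmpxchg (cmpxchg_acq in get_page_type()) so
 * later accesses to the page cannot be reordered before the reference is
 * taken; dropping it uses a release cmpxchg (cmpxchg_rel in put_page_type())
 * so earlier accesses complete first.  C11 atomics stand in for the ia64
 * primitives; the sk_* names are local to the sketch.
 */
#include <stdatomic.h>
#include <stdint.h>

static void sk_get_type_ref(_Atomic uint64_t *type_info)
{
    uint64_t x = atomic_load_explicit(type_info, memory_order_relaxed);

    while (!atomic_compare_exchange_weak_explicit(type_info, &x, x + 1,
                                                  memory_order_acquire,
                                                  memory_order_relaxed))
        ;                               /* x is reloaded on failure */
}

static void sk_put_type_ref(_Atomic uint64_t *type_info)
{
    uint64_t x = atomic_load_explicit(type_info, memory_order_relaxed);

    while (!atomic_compare_exchange_weak_explicit(type_info, &x, x - 1,
                                                  memory_order_release,
                                                  memory_order_relaxed))
        ;
}
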
3249 int page_is_conventional_ram(unsigned long mfn)
3251 return (efi_mem_type(pfn_to_paddr(mfn)) == EFI_CONVENTIONAL_MEMORY);
3255 long
3256 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3258 struct page_info *page = NULL;
3259 long rc;
3261 switch (op) {
3262 case XENMEM_add_to_physmap:
3264 struct xen_add_to_physmap xatp;
3265 unsigned long prev_mfn, mfn = 0, gpfn;
3266 struct domain *d;
3268 if (copy_from_guest(&xatp, arg, 1))
3269 return -EFAULT;
3271 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3272 if (rc)
3273 return rc;
3275 /* This hypercall is used for VT-i domain only */
3276 if (!is_hvm_domain(d)) {
3277 rcu_unlock_domain(d);
3278 return -ENOSYS;
3281 switch (xatp.space) {
3282 case XENMAPSPACE_shared_info:
3283 if (xatp.idx == 0)
3284 mfn = virt_to_mfn(d->shared_info);
3285 break;
3286 case XENMAPSPACE_grant_table:
3287 spin_lock(&d->grant_table->lock);
3289 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
3290 (xatp.idx < max_nr_grant_frames))
3291 gnttab_grow_table(d, xatp.idx + 1);
3293 if (xatp.idx < nr_grant_frames(d->grant_table))
3294 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3296 spin_unlock(&d->grant_table->lock);
3297 break;
3298 case XENMAPSPACE_gmfn: {
3299 struct xen_ia64_memmap_info memmap_info;
3300 efi_memory_desc_t md;
3301 int ret;
3303 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3304 if ( !get_page_from_pagenr(xatp.idx, d) )
3305 break;
3307 mfn = xatp.idx;
3308 page = mfn_to_page(mfn);
3310 memmap_info.efi_memmap_size = sizeof(md);
3311 memmap_info.efi_memdesc_size = sizeof(md);
3312 memmap_info.efi_memdesc_version =
3313 EFI_MEMORY_DESCRIPTOR_VERSION;
3315 md.type = EFI_CONVENTIONAL_MEMORY;
3316 md.pad = 0;
3317 md.phys_addr = xatp.gpfn << PAGE_SHIFT;
3318 md.virt_addr = 0;
3319 md.num_pages = 1UL << (PAGE_SHIFT - EFI_PAGE_SHIFT);
3320 md.attribute = EFI_MEMORY_WB;
3322 ret = __dom0vp_add_memdesc(d, &memmap_info, (char*)&md);
3323 if (ret != 0) {
3324 put_page(page);
3325 rcu_unlock_domain(d);
3326 gdprintk(XENLOG_DEBUG,
3327 "%s:%d td %d gpfn 0x%lx mfn 0x%lx ret %d\n",
3328 __func__, __LINE__,
3329 d->domain_id, xatp.gpfn, xatp.idx, ret);
3330 return ret;
3332 break;
3334 default:
3335 break;
3338 if (mfn == 0) {
3339 if ( page )
3340 put_page(page);
3341 rcu_unlock_domain(d);
3342 return -EINVAL;
3345 domain_lock(d);
3347 /* Check remapping necessity */
3348 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3349 if (mfn == prev_mfn)
3350 goto out;
3352 /* Remove previously mapped page if it was present. */
3353 if (prev_mfn && mfn_valid(prev_mfn)) {
3354 if (is_xen_heap_mfn(prev_mfn))
3355 /* Xen heap frames are simply unhooked from this phys slot. */
3356 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3357 else
3358 /* Normal domain memory is freed, to avoid leaking memory. */
3359 guest_remove_page(d, xatp.gpfn);
3362 /* Unmap from old location, if any. */
3363 gpfn = get_gpfn_from_mfn(mfn);
3364 if (gpfn != INVALID_M2P_ENTRY)
3365 guest_physmap_remove_page(d, gpfn, mfn, 0);
3367 /* Map at new location. */
3368 /* Here page->count_info = PGC_allocated | N where N >= 1*/
3369 __guest_physmap_add_page(d, xatp.gpfn, mfn);
3371 out:
3372 domain_unlock(d);
3374 if ( page )
3375 put_page(page);
3377 rcu_unlock_domain(d);
3379 break;
3382 case XENMEM_machine_memory_map:
3384 struct xen_memory_map memmap;
3385 struct xen_ia64_memmap_info memmap_info;
3386 XEN_GUEST_HANDLE(char) buffer;
3388 if (!IS_PRIV(current->domain))
3389 return -EINVAL;
3390 if (copy_from_guest(&memmap, arg, 1))
3391 return -EFAULT;
3392 if (memmap.nr_entries <
3393 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size)
3394 return -EINVAL;
3396 memmap.nr_entries =
3397 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size;
3398 memset(&memmap_info, 0, sizeof(memmap_info));
3399 memmap_info.efi_memmap_size = ia64_boot_param->efi_memmap_size;
3400 memmap_info.efi_memdesc_size = ia64_boot_param->efi_memdesc_size;
3401 memmap_info.efi_memdesc_version = ia64_boot_param->efi_memdesc_version;
3403 buffer = guest_handle_cast(memmap.buffer, char);
3404 if (copy_to_guest(buffer, (char*)&memmap_info, sizeof(memmap_info)) ||
3405 copy_to_guest_offset(buffer, sizeof(memmap_info),
3406 (char*)__va(ia64_boot_param->efi_memmap),
3407 ia64_boot_param->efi_memmap_size) ||
3408 copy_to_guest(arg, &memmap, 1))
3409 return -EFAULT;
3410 return 0;
3413 case XENMEM_get_pod_target:
3414 case XENMEM_set_pod_target: {
3415 /* XXX: PoD (populate on demand) isn't supported yet. */
3416 xen_pod_target_t target;
3417 struct domain *d;
3419 /* Support DOMID_SELF? */
3420 if ( !IS_PRIV(current->domain) )
3421 return -EINVAL;
3423 if ( copy_from_guest(&target, arg, 1) )
3424 return -EFAULT;
3426 rc = rcu_lock_target_domain_by_id(target.domid, &d);
3427 if ( rc != 0 )
3428 return rc;
3430 if ( op == XENMEM_set_pod_target )
3432 /* if -ENOSYS is returned,
3433 domain builder aborts domain creation. */
3434 /* rc = -ENOSYS; */
3437 target.tot_pages = d->tot_pages;
3438 target.pod_cache_pages = 0;
3439 target.pod_entries = 0;
3441 if ( copy_to_guest(arg, &target, 1) )
3443 rc= -EFAULT;
3444 goto pod_target_out_unlock;
3447 pod_target_out_unlock:
3448 rcu_unlock_domain(d);
3449 return rc;
3452 default:
3453 return -ENOSYS;
3456 return 0;
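
/*
 * Illustrative sketch (not part of mm.c): how a privileged guest would walk
 * the buffer filled by the XENMEM_machine_memory_map handler above.  The
 * buffer is a xen_ia64_memmap_info header followed directly by the raw EFI
 * memmap; descriptors must be stepped by efi_memdesc_size, which may be
 * larger than the descriptor structure itself.  The sk_* structures below
 * are stand-ins for the real headers, with field types assumed.
 */
#include <stdint.h>

struct sk_memmap_info {                 /* stand-in for xen_ia64_memmap_info */
    uint64_t efi_memmap_size;
    uint64_t efi_memdesc_size;
    uint32_t efi_memdesc_version;
    /* raw EFI memory descriptors follow */
};

struct sk_memdesc {                     /* the efi_memory_desc_t fields used here */
    uint32_t type;
    uint32_t pad;
    uint64_t phys_addr;
    uint64_t virt_addr;
    uint64_t num_pages;
    uint64_t attribute;
};

static void sk_walk_memmap(const void *buf,
                           void (*visit)(const struct sk_memdesc *md))
{
    const struct sk_memmap_info *info = buf;
    const char *p = (const char *)buf + sizeof(*info);
    const char *end = p + info->efi_memmap_size;

    for (; p < end; p += info->efi_memdesc_size)
        visit((const struct sk_memdesc *)p);
}
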
3459 int is_iomem_page(unsigned long mfn)
3461 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
3464 static void __xencomm_mark_dirty(struct domain *d,
3465 unsigned long addr, unsigned int len)
3467 unsigned long gpfn;
3468 unsigned long end_addr = addr + len;
3470 if (shadow_mode_enabled(d)) {
3471 for (addr &= PAGE_MASK; addr < end_addr; addr += PAGE_SIZE) {
3472 gpfn = get_gpfn_from_mfn(virt_to_mfn(addr));
3473 shadow_mark_page_dirty(d, gpfn);
3478 void xencomm_mark_dirty(unsigned long addr, unsigned int len)
3480 __xencomm_mark_dirty(current->domain, addr, len);
3483 /* stubs for populate on demand */
3484 int
3485 guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
3486 unsigned int order)
3488 gdprintk(XENLOG_WARNING, "populate on demand isn't supported yet\n");
3489 return -ENOSYS;
3492 int
3493 p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn,
3494 unsigned int order)
3496 gdprintk(XENLOG_WARNING, "populate on demand isn't supported yet\n");
3497 return 1;
3500 /*
3501 * Local variables:
3502 * mode: C
3503 * c-set-style: "BSD"
3504 * c-basic-offset: 4
3505 * tab-width: 4
3506 * indent-tabs-mode: nil
3507 * End:
3508 */