ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 18503:325904748847

[IA64] A small fix in mm.c. Use PAGE_MASK, not ~PAGE_MASK.

~PAGE_MASK was wrongly used to get a page-aligned address.
Use PAGE_MASK, not ~PAGE_MASK.
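
For reference: with the usual definition PAGE_MASK == ~(PAGE_SIZE - 1), a
minimal illustrative sketch of the difference (not part of the patch itself):

    aligned = addr & PAGE_MASK;   /* page-aligned address: offset bits cleared */
    offset  = addr & ~PAGE_MASK;  /* only the offset within the page */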

Signed-off-by: Anthony xu <anthony.xu@intel.com>
author Isaku Yamahata <yamahata@valinux.co.jp>
date Thu Sep 25 11:47:53 2008 +0900 (2008-09-25)
parents 4ddd63b4be9b
children 788ed94f8fe4
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure is accessed by
22 * any CPUs.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of a vcpu which isn't current.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that reading the VHPT is done by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert domain pseudo physical address
54 * to machine address. This is per domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * replace_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * The P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is inverse of the P2M table.
107 * I.e. M2P(P2M(p)) = p and P2M(M2P(m)) = m
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here the memory barrier can be achieved with release semantics.
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the
124 * tlb cache globally. A tlb insert (vcpu_itc_no_srlz()) or a global tlb
125 * purge (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically, so there is a race here.
127 *
128 * Here the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert.
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value read from the p2m entry and inserts the tlb entry,
147 * then reads the p2m entry again. If the new p2m entry value is different
148 * from the value that was used, it retries.
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented, as is done by
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time().
162 *
163 */
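/*
 * Illustrative sketch only (not part of the original file): the update
 * sequence described above, written with the primitives used elsewhere in
 * this file.
 *
 *     set_gpfn_from_mfn(new_mfn, gpfn);                 // set the new M2P entry
 *     // memory barrier
 *     old_pte = ptep_xchg(mm, mpaddr, ptep, new_pte);   // atomic P2M update
 *     // memory barrier (release semantics)
 *     set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY);  // zap the old M2P entry
 */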
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <linux/sort.h>
179 #include <xen/guest_access.h>
180 #include <asm/page.h>
181 #include <asm/dom_fw_common.h>
182 #include <public/memory.h>
183 #include <asm/event.h>
184 #include <asm/debugger.h>
186 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
187 volatile pte_t* ptep, pte_t old_pte,
188 struct page_info* page);
190 extern unsigned long ia64_iobase;
192 struct domain *dom_xen, *dom_io;
194 /*
195 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
196 * If more reserved domain ids are introduced, this might be increased.
197 */
198 #define DOMID_P2M (0x7FF8U)
199 static struct domain *dom_p2m;
201 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
202 void
203 alloc_dom_xen_and_dom_io(void)
204 {
205 /*
206 * Initialise our DOMID_XEN domain.
207 * Any Xen-heap pages that we will allow to be mapped will have
208 * their domain field set to dom_xen.
209 */
210 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
211 BUG_ON(dom_xen == NULL);
213 /*
214 * Initialise our DOMID_IO domain.
215 * This domain owns I/O pages that are within the range of the page_info
216 * array. Mappings occur at the priv of the caller.
217 */
218 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
219 BUG_ON(dom_io == NULL);
220 }
222 static int
223 mm_teardown_can_skip(struct domain* d, unsigned long offset)
224 {
225 return d->arch.mm_teardown_offset > offset;
226 }
228 static void
229 mm_teardown_update_offset(struct domain* d, unsigned long offset)
230 {
231 d->arch.mm_teardown_offset = offset;
232 }
234 static void
235 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
236 {
237 pte_t old_pte;
238 unsigned long mfn;
239 struct page_info* page;
241 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
243 // vmx domains use bits [58:56] to distinguish an io region from memory.
244 // see vmx_build_physmap_table() in vmx_init.c
245 if (!pte_mem(old_pte))
246 return;
248 // domain might map IO space or acpi table pages. check it.
249 mfn = pte_pfn(old_pte);
250 if (!mfn_valid(mfn))
251 return;
252 page = mfn_to_page(mfn);
253 BUG_ON(page_get_owner(page) == NULL);
255 // struct page_info corresponding to mfn may exist or not depending
256 // on CONFIG_VIRTUAL_FRAME_TABLE.
257 // The above check is too simplistic.
258 // The right way is to check whether this page belongs to an io area or to acpi pages.
260 if (pte_pgc_allocated(old_pte)) {
261 BUG_ON(page_get_owner(page) != d);
262 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
263 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
264 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
265 put_page(page);
266 } else {
267 put_page(page);
268 }
269 }
271 static int
272 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
273 {
274 unsigned long i;
275 volatile pte_t* pte = pte_offset_map(pmd, offset);
277 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
278 unsigned long cur_offset = offset + (i << PAGE_SHIFT);
279 if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
280 continue;
281 if (!pte_present(*pte)) { // acquire semantics
282 mm_teardown_update_offset(d, cur_offset);
283 continue;
284 }
285 mm_teardown_update_offset(d, cur_offset);
286 mm_teardown_pte(d, pte, cur_offset);
287 if (hypercall_preempt_check())
288 return -EAGAIN;
289 }
290 return 0;
291 }
293 static int
294 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
295 {
296 unsigned long i;
297 volatile pmd_t *pmd = pmd_offset(pud, offset);
299 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
300 unsigned long cur_offset = offset + (i << PMD_SHIFT);
301 if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
302 continue;
303 if (!pmd_present(*pmd)) { // acquire semantics
304 mm_teardown_update_offset(d, cur_offset);
305 continue;
306 }
307 if (mm_teardown_pmd(d, pmd, cur_offset))
308 return -EAGAIN;
309 }
310 return 0;
311 }
313 static int
314 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
315 {
316 unsigned long i;
317 volatile pud_t *pud = pud_offset(pgd, offset);
319 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
320 unsigned long cur_offset = offset + (i << PUD_SHIFT);
321 #ifndef __PAGETABLE_PUD_FOLDED
322 if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
323 continue;
324 #endif
325 if (!pud_present(*pud)) { // acquire semantics
326 #ifndef __PAGETABLE_PUD_FOLDED
327 mm_teardown_update_offset(d, cur_offset);
328 #endif
329 continue;
330 }
331 if (mm_teardown_pud(d, pud, cur_offset))
332 return -EAGAIN;
333 }
334 return 0;
335 }
337 int
338 mm_teardown(struct domain* d)
339 {
340 struct mm_struct* mm = &d->arch.mm;
341 unsigned long i;
342 volatile pgd_t* pgd;
344 if (mm->pgd == NULL)
345 return 0;
347 pgd = pgd_offset(mm, 0);
348 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
349 unsigned long cur_offset = i << PGDIR_SHIFT;
351 if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
352 continue;
353 if (!pgd_present(*pgd)) { // acquire semantics
354 mm_teardown_update_offset(d, cur_offset);
355 continue;
356 }
357 if (mm_teardown_pgd(d, pgd, cur_offset))
358 return -EAGAIN;
359 }
361 foreign_p2m_destroy(d);
362 return 0;
363 }
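/*
 * Illustrative sketch only (not part of the original file): mm_teardown() is
 * restartable. A caller is expected to do something like the following and
 * retry later; progress is remembered in d->arch.mm_teardown_offset:
 *
 *     if (mm_teardown(d) == -EAGAIN)
 *         return -EAGAIN;   // preempted, call mm_teardown(d) again later
 */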
365 static void
366 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
367 unsigned long offset)
368 {
369 pte_free_kernel(pte_offset_map(pmd, offset));
370 }
372 static void
373 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
374 unsigned long offset)
375 {
376 unsigned long i;
377 volatile pmd_t *pmd = pmd_offset(pud, offset);
379 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
380 if (!pmd_present(*pmd))
381 continue;
382 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
383 }
384 pmd_free(pmd_offset(pud, offset));
385 }
387 static void
388 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
389 unsigned long offset)
390 {
391 unsigned long i;
392 volatile pud_t *pud = pud_offset(pgd, offset);
394 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
395 if (!pud_present(*pud))
396 continue;
397 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
398 }
399 pud_free(pud_offset(pgd, offset));
400 }
402 static void
403 mm_p2m_teardown(struct domain* d)
404 {
405 struct mm_struct* mm = &d->arch.mm;
406 unsigned long i;
407 volatile pgd_t* pgd;
409 BUG_ON(mm->pgd == NULL);
410 pgd = pgd_offset(mm, 0);
411 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
412 if (!pgd_present(*pgd))
413 continue;
414 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
415 }
416 pgd_free(mm->pgd);
417 mm->pgd = NULL;
418 }
420 void
421 mm_final_teardown(struct domain* d)
422 {
423 if (d->arch.shadow_bitmap != NULL) {
424 xfree(d->arch.shadow_bitmap);
425 d->arch.shadow_bitmap = NULL;
426 }
427 mm_p2m_teardown(d);
428 }
430 unsigned long
431 domain_get_maximum_gpfn(struct domain *d)
432 {
433 return (d->arch.convmem_end - 1) >> PAGE_SHIFT;
434 }
436 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
437 void
438 share_xen_page_with_guest(struct page_info *page,
439 struct domain *d, int readonly)
440 {
441 if ( page_get_owner(page) == d )
442 return;
444 #if 1
445 if (readonly) {
446 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
447 }
448 #endif
450 // alloc_xenheap_pages() doesn't initialize page owner.
451 //BUG_ON(page_get_owner(page) != NULL);
453 spin_lock(&d->page_alloc_lock);
455 #ifndef __ia64__
456 /* The incremented type count pins as writable or read-only. */
457 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
458 page->u.inuse.type_info |= PGT_validated | 1;
459 #endif
461 page_set_owner(page, d);
462 wmb(); /* install valid domain ptr before updating refcnt. */
463 ASSERT(page->count_info == 0);
465 /* Only add to the allocation list if the domain isn't dying. */
466 if ( !d->is_dying )
467 {
468 page->count_info |= PGC_allocated | 1;
469 if ( unlikely(d->xenheap_pages++ == 0) )
470 get_knownalive_domain(d);
471 list_add_tail(&page->list, &d->xenpage_list);
472 }
474 // grant_table_destroy() releases these pages,
475 // but it doesn't clear their m2p entries, so stale
476 // entries might remain. Such a stale entry is cleared here.
477 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
479 spin_unlock(&d->page_alloc_lock);
480 }
482 void
483 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
484 {
485 share_xen_page_with_guest(page, dom_xen, readonly);
486 }
488 unsigned long
489 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
490 {
491 unsigned long pte;
493 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
494 if (!pte) {
495 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
496 }
498 if ((pte & _PAGE_IO) && is_hvm_domain(d))
499 return INVALID_MFN;
501 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
502 }
504 // given a domain virtual address, pte and pagesize, extract the metaphysical
505 // address, convert the pte to a physical address for the (possibly different)
506 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
507 // current->arch.vhpt_pg_shift!)
508 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* itir,
509 struct p2m_entry* entry)
510 {
511 struct domain *d = current->domain;
512 ia64_itir_t _itir = {.itir = itir__};
513 u64 mask, mpaddr, pteval2;
514 u64 arflags;
515 u64 arflags2;
516 u64 maflags2;
518 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
520 // FIXME address had better be pre-validated on insert
521 mask = ~itir_mask(_itir.itir);
522 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
524 if (_itir.ps > PAGE_SHIFT)
525 _itir.ps = PAGE_SHIFT;
527 ((ia64_itir_t*)itir)->itir = _itir.itir;/* Copy the whole register. */
528 ((ia64_itir_t*)itir)->ps = _itir.ps; /* Overwrite ps part! */
530 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
531 if (_itir.ps < PAGE_SHIFT)
532 pteval2 |= mpaddr & ~PAGE_MASK & ~((1L << _itir.ps) - 1);
534 /* Check access rights. */
535 arflags = pteval & _PAGE_AR_MASK;
536 arflags2 = pteval2 & _PAGE_AR_MASK;
537 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
538 #if 0
539 dprintk(XENLOG_WARNING,
540 "%s:%d "
541 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
542 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
543 __func__, __LINE__,
544 pteval, arflags, address, itir__,
545 pteval2, arflags2, mpaddr);
546 #endif
547 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
548 }
550 /* Check memory attribute. The switch is on the *requested* memory
551 attribute. */
552 maflags2 = pteval2 & _PAGE_MA_MASK;
553 switch (pteval & _PAGE_MA_MASK) {
554 case _PAGE_MA_NAT:
555 /* NaT pages are always accepted! */
556 break;
557 case _PAGE_MA_UC:
558 case _PAGE_MA_UCE:
559 case _PAGE_MA_WC:
560 if (maflags2 == _PAGE_MA_WB) {
561 /* Don't let domains WB-map uncached addresses.
562 This can happen when domU tries to touch i/o
563 port space. Also prevents possible address
564 aliasing issues. */
565 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) {
566 u64 ucwb;
568 /*
569 * If dom0 page has both UC & WB attributes
570 * don't warn about attempted UC access.
571 */
572 ucwb = efi_mem_attribute(mpaddr, PAGE_SIZE);
573 ucwb &= EFI_MEMORY_UC | EFI_MEMORY_WB;
574 ucwb ^= EFI_MEMORY_UC | EFI_MEMORY_WB;
576 if (d != dom0 || ucwb != 0)
577 gdprintk(XENLOG_WARNING, "Warning: UC"
578 " to WB for mpaddr=%lx\n",
579 mpaddr);
580 }
581 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
582 }
583 break;
584 case _PAGE_MA_WB:
585 if (maflags2 != _PAGE_MA_WB) {
586 /* Forbid non-coherent access to coherent memory. */
587 panic_domain(NULL, "try to use WB mem attr on "
588 "UC page, mpaddr=%lx\n", mpaddr);
589 }
590 break;
591 default:
592 panic_domain(NULL, "try to use unknown mem attribute\n");
593 }
595 /* If shadow mode is enabled, virtualize dirty bit. */
596 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
597 u64 mp_page = mpaddr >> PAGE_SHIFT;
598 pteval |= _PAGE_VIRT_D;
600 /* If the page is not already dirty, don't set the dirty bit! */
601 if (mp_page < d->arch.shadow_bitmap_size * 8
602 && !test_bit(mp_page, d->arch.shadow_bitmap))
603 pteval &= ~_PAGE_D;
604 }
606 /* Ignore non-addr bits of pteval2 and force PL0->1
607 (PL3 is unaffected) */
608 return (pteval & ~(_PAGE_PPN_MASK | _PAGE_PL_MASK)) |
609 (pteval2 & _PAGE_PPN_MASK) |
610 (vcpu_pl_adjust(pteval, 7) & _PAGE_PL_MASK);
611 }
613 // given a current domain metaphysical address, return the physical address
614 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
615 struct p2m_entry* entry)
616 {
617 unsigned long pteval;
619 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
620 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
621 }
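/*
 * Worked example (sketch, assuming the default 16KB PAGE_SIZE): for
 * mpaddr = 0x4000123 the result is (pteval & _PAGE_PPN_MASK) | 0x123,
 * i.e. the machine frame taken from the p2m entry plus the offset
 * within the page (mpaddr & ~PAGE_MASK).
 */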
623 //XXX !xxx_present() should be used instead of !xxx_none()?
624 // pud, pmd and pte pages are zero-cleared when they are allocated.
625 // Their contents must be visible before population, so the
626 // cmpxchg must have release semantics.
627 static volatile pte_t*
628 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
629 {
630 struct mm_struct *mm = &d->arch.mm;
631 volatile pgd_t *pgd;
632 volatile pud_t *pud;
633 volatile pmd_t *pmd;
635 BUG_ON(mm->pgd == NULL);
637 pgd = pgd_offset(mm, mpaddr);
638 again_pgd:
639 if (unlikely(pgd_none(*pgd))) { // acquire semantics
640 pud_t *old_pud = NULL;
641 pud = pud_alloc_one(mm, mpaddr);
642 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
643 pud_free(pud);
644 goto again_pgd;
645 }
646 }
648 pud = pud_offset(pgd, mpaddr);
649 again_pud:
650 if (unlikely(pud_none(*pud))) { // acquire semantics
651 pmd_t* old_pmd = NULL;
652 pmd = pmd_alloc_one(mm, mpaddr);
653 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
654 pmd_free(pmd);
655 goto again_pud;
656 }
657 }
659 pmd = pmd_offset(pud, mpaddr);
660 again_pmd:
661 if (unlikely(pmd_none(*pmd))) { // acquire semantics
662 pte_t* old_pte = NULL;
663 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
664 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
665 pte_free_kernel(pte);
666 goto again_pmd;
667 }
668 }
670 return pte_offset_map(pmd, mpaddr);
671 }
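/*
 * Usage sketch (illustrative, not part of the original file): callers first
 * walk/populate the tree and then publish the entry with a store that has
 * release semantics, e.g.
 *
 *     volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
 *     if (pte_none(*pte))
 *         set_pte_rel(pte, pfn_pte(mfn, __pgprot(prot)));
 *
 * __assign_new_domain_page() below follows this pattern, and
 * __assign_domain_page() uses ptep_cmpxchg_rel() instead to tolerate races.
 */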
673 //XXX xxx_none() should be used instead of !xxx_present()?
674 volatile pte_t*
675 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
676 {
677 struct mm_struct *mm = &d->arch.mm;
678 volatile pgd_t *pgd;
679 volatile pud_t *pud;
680 volatile pmd_t *pmd;
682 BUG_ON(mm->pgd == NULL);
683 pgd = pgd_offset(mm, mpaddr);
684 if (unlikely(!pgd_present(*pgd))) // acquire semantics
685 return NULL;
687 pud = pud_offset(pgd, mpaddr);
688 if (unlikely(!pud_present(*pud))) // acquire semantics
689 return NULL;
691 pmd = pmd_offset(pud, mpaddr);
692 if (unlikely(!pmd_present(*pmd))) // acquire semantics
693 return NULL;
695 return pte_offset_map(pmd, mpaddr);
696 }
698 static volatile pte_t*
699 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
700 {
701 struct mm_struct *mm = &d->arch.mm;
702 volatile pgd_t *pgd;
703 volatile pud_t *pud;
704 volatile pmd_t *pmd;
706 BUG_ON(mm->pgd == NULL);
707 pgd = pgd_offset(mm, mpaddr);
708 if (unlikely(pgd_none(*pgd))) // acquire semantics
709 return NULL;
711 pud = pud_offset(pgd, mpaddr);
712 if (unlikely(pud_none(*pud))) // acquire semantics
713 return NULL;
715 pmd = pmd_offset(pud, mpaddr);
716 if (unlikely(pmd_none(*pmd))) // acquire semantics
717 return NULL;
719 return pte_offset_map(pmd, mpaddr);
720 }
722 unsigned long
723 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
724 {
725 volatile pte_t *pte;
727 pte = lookup_noalloc_domain_pte(d, mpaddr);
728 if (pte == NULL)
729 return INVALID_MFN;
731 if (pte_present(*pte))
732 return (pte->pte & _PFN_MASK);
733 return INVALID_MFN;
734 }
736 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
737 struct p2m_entry* entry)
738 {
739 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
741 if (pte != NULL) {
742 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
743 if (pte_present(tmp_pte)) {
744 if (entry != NULL)
745 p2m_entry_set(entry, pte, tmp_pte);
746 return pte_val(tmp_pte);
747 } else if (is_hvm_domain(d))
748 return INVALID_MFN;
749 }
751 if (mpaddr < d->arch.convmem_end && !d->is_dying) {
752 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
753 "d %"PRId16" 0x%lx (< 0x%lx)\n",
754 current->vcpu_id, PSCB(current, iip),
755 d->domain_id, mpaddr, d->arch.convmem_end);
756 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
757 /* Log I/O port probing, but complain less loudly about it */
758 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
759 "d %"PRId16" 0x%lx\n",
760 current->vcpu_id, PSCB(current, iip), d->domain_id,
761 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
762 } else {
763 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
764 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
765 current->vcpu_id, PSCB(current, iip),
766 d->domain_id, mpaddr, d->arch.convmem_end);
767 }
769 debugger_event (XEN_IA64_DEBUG_ON_BAD_MPA);
771 if (entry != NULL)
772 p2m_entry_set(entry, NULL, __pte(0));
773 //XXX This is a workaround until emulation of memory accesses to a region
774 // where no memory or device is attached is implemented.
775 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_PRIV |
776 _PAGE_AR_RWX)));
777 }
779 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
780 #if 1
781 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
782 {
783 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
784 unsigned long imva;
786 pte &= _PAGE_PPN_MASK;
787 imva = (unsigned long) __va(pte);
788 imva |= mpaddr & ~PAGE_MASK;
789 return (void*)imva;
790 }
791 #else
792 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
793 {
794 unsigned long imva = __gpa_to_mpa(d, mpaddr);
796 return (void *)__va(imva);
797 }
798 #endif
800 unsigned long
801 paddr_to_maddr(unsigned long paddr)
802 {
803 struct vcpu *v = current;
804 struct domain *d = v->domain;
805 u64 pa;
807 pa = ____lookup_domain_mpa(d, paddr);
808 if (pa == INVALID_MFN) {
809 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
810 __func__, paddr, vcpu_regs(v)->cr_iip);
811 return 0;
812 }
813 return (pa & _PFN_MASK) | (paddr & ~PAGE_MASK);
814 }
816 /* Allocate a new page for domain and map it to the specified metaphysical
817 address. */
818 static struct page_info *
819 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
820 volatile pte_t* pte)
821 {
822 struct page_info *p;
823 unsigned long maddr;
825 BUG_ON(!pte_none(*pte));
827 p = alloc_domheap_page(d, 0);
828 if (unlikely(!p)) {
829 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
830 return(p);
831 }
833 // zero out pages for security reasons
834 clear_page(page_to_virt(p));
835 maddr = page_to_maddr (p);
836 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
837 && maddr < __get_cpu_var(vhpt_pend))) {
838 /* FIXME: how can this happen ?
839 vhpt is allocated by alloc_domheap_page. */
840 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
841 maddr);
842 }
844 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
845 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
846 // because set_pte_rel() has release semantics
847 set_pte_rel(pte,
848 pfn_pte(maddr >> PAGE_SHIFT,
849 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
850 _PAGE_PL_PRIV | _PAGE_AR_RWX)));
852 smp_mb();
853 return p;
854 }
856 struct page_info *
857 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
858 {
859 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
861 if (!pte_none(*pte))
862 return NULL;
864 return __assign_new_domain_page(d, mpaddr, pte);
865 }
867 void __init
868 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
869 {
870 volatile pte_t *pte;
872 BUG_ON(d != dom0);
873 pte = lookup_alloc_domain_pte(d, mpaddr);
874 if (pte_none(*pte)) {
875 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
876 if (p == NULL) {
877 panic("%s: can't allocate page for dom0\n", __func__);
878 }
879 }
880 }
882 static unsigned long
883 flags_to_prot (unsigned long flags)
884 {
885 unsigned long res = _PAGE_PL_PRIV | __DIRTY_BITS;
887 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
888 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
889 #ifdef CONFIG_XEN_IA64_TLB_TRACK
890 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
891 #endif
892 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
893 res |= flags & ASSIGN_io ? _PAGE_IO: 0;
895 return res;
896 }
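/*
 * For example (sketch, assuming ASSIGN_writable carries no bit checked here):
 * flags_to_prot(ASSIGN_readonly) yields
 *     _PAGE_PL_PRIV | __DIRTY_BITS | _PAGE_AR_R | _PAGE_MA_WB
 * while flags_to_prot(ASSIGN_writable | ASSIGN_nocache) yields
 *     _PAGE_PL_PRIV | __DIRTY_BITS | _PAGE_AR_RWX | _PAGE_MA_UC
 */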
898 /* map a physical address to the specified metaphysical addr */
899 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
900 // This is called by assign_domain_mmio_page().
901 // So access to the pte is racy.
902 int
903 __assign_domain_page(struct domain *d,
904 unsigned long mpaddr, unsigned long physaddr,
905 unsigned long flags)
906 {
907 volatile pte_t *pte;
908 pte_t old_pte;
909 pte_t new_pte;
910 pte_t ret_pte;
911 unsigned long prot = flags_to_prot(flags);
913 pte = lookup_alloc_domain_pte(d, mpaddr);
915 old_pte = __pte(0);
916 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
917 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
918 if (pte_val(ret_pte) == pte_val(old_pte)) {
919 smp_mb();
920 return 0;
921 }
923 // dom0 tried to map the real machine's I/O region, but failed.
924 // It is very likely that dom0 won't boot correctly because
925 // it can't access I/O. So complain here.
926 if (flags & ASSIGN_nocache) {
927 int warn = 0;
929 if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT))
930 warn = 1;
931 else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) {
932 u32 type;
933 u64 attr;
935 warn = 1;
937 /*
938 * See
939 * complete_dom0_memmap()
940 * case EFI_RUNTIME_SERVICES_CODE:
941 * case EFI_RUNTIME_SERVICES_DATA:
942 * case EFI_ACPI_RECLAIM_MEMORY:
943 * case EFI_ACPI_MEMORY_NVS:
944 * case EFI_RESERVED_TYPE:
945 *
946 * Currently only EFI_RUNTIME_SERVICES_CODE is found
947 * so that we suppress only EFI_RUNTIME_SERVICES_CODE case.
948 */
949 type = efi_mem_type(physaddr);
950 attr = efi_mem_attributes(physaddr);
951 if (type == EFI_RUNTIME_SERVICES_CODE &&
952 (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB))
953 warn = 0;
954 }
955 if (warn)
956 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
957 "\talready assigned pte_val 0x%016lx\n"
958 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
959 __func__, __LINE__,
960 d, d->domain_id, pte_val(ret_pte),
961 mpaddr, physaddr, flags);
962 }
964 return -EAGAIN;
965 }
967 /* get_page() and map a physical address to the specified metaphysical addr */
968 void
969 assign_domain_page(struct domain *d,
970 unsigned long mpaddr, unsigned long physaddr)
971 {
972 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
974 BUG_ON((physaddr & _PAGE_PPN_MASK) != physaddr);
975 BUG_ON(page->count_info != (PGC_allocated | 1));
976 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
977 // because __assign_domain_page() uses set_pte_rel() which has
978 // release semantics, smp_mb() isn't needed.
979 (void)__assign_domain_page(d, mpaddr, physaddr,
980 ASSIGN_writable | ASSIGN_pgc_allocated);
981 }
983 int
984 ioports_permit_access(struct domain *d, unsigned int fp, unsigned int lp)
985 {
986 struct io_space *space;
987 unsigned long mmio_start, mmio_end, mach_start;
988 int ret;
990 if (IO_SPACE_NR(fp) >= num_io_spaces) {
991 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
992 return -EFAULT;
993 }
995 /*
996 * The ioport_cap rangeset tracks the I/O port address including
997 * the port space ID. This means port space IDs need to match
998 * between Xen and dom0. This is also a requirement because
999 * the hypercall to pass these port ranges only uses a u32.
1001 * NB - non-dom0 driver domains may only have a subset of the
1002 * I/O port spaces and thus will number port spaces differently.
1003 * This is ok, they don't make use of this interface.
1004 */
1005 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
1006 if (ret != 0)
1007 return ret;
1009 space = &io_space[IO_SPACE_NR(fp)];
1011 /* Legacy I/O on dom0 is already setup */
1012 if (d == dom0 && space == &io_space[0])
1013 return 0;
1015 fp = IO_SPACE_PORT(fp);
1016 lp = IO_SPACE_PORT(lp);
1018 if (space->sparse) {
1019 mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
1020 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
1021 } else {
1022 mmio_start = fp & PAGE_MASK;
1023 mmio_end = PAGE_ALIGN(lp);
1026 /*
1027 * The "machine first port" is not necessarily identity mapped
1028 * to the guest first port. At least for the legacy range.
1029 */
1030 mach_start = mmio_start | __pa(space->mmio_base);
1032 if (space == &io_space[0]) {
1033 mmio_start |= IO_PORTS_PADDR;
1034 mmio_end |= IO_PORTS_PADDR;
1035 } else {
1036 mmio_start |= __pa(space->mmio_base);
1037 mmio_end |= __pa(space->mmio_base);
1040 while (mmio_start <= mmio_end) {
1041 (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache);
1042 mmio_start += PAGE_SIZE;
1043 mach_start += PAGE_SIZE;
1046 return 0;
1049 static int
1050 ioports_has_allowed(struct domain *d, unsigned int fp, unsigned int lp)
1052 for (; fp < lp; fp++)
1053 if (rangeset_contains_singleton(d->arch.ioport_caps, fp))
1054 return 1;
1056 return 0;
1059 int
1060 ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
1062 int ret;
1063 struct mm_struct *mm = &d->arch.mm;
1064 unsigned long mmio_start, mmio_end, mmio_base;
1065 unsigned int fp_base, lp_base;
1066 struct io_space *space;
1068 if (IO_SPACE_NR(fp) >= num_io_spaces) {
1069 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
1070 return -EFAULT;
1073 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
1074 if (ret != 0)
1075 return ret;
1077 space = &io_space[IO_SPACE_NR(fp)];
1078 fp_base = IO_SPACE_PORT(fp);
1079 lp_base = IO_SPACE_PORT(lp);
1081 if (space->sparse) {
1082 mmio_start = IO_SPACE_SPARSE_ENCODING(fp_base) & PAGE_MASK;
1083 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp_base));
1084 } else {
1085 mmio_start = fp_base & PAGE_MASK;
1086 mmio_end = PAGE_ALIGN(lp_base);
1089 if (space == &io_space[0] && d != dom0)
1090 mmio_base = IO_PORTS_PADDR;
1091 else
1092 mmio_base = __pa(space->mmio_base);
1094 for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
1095 unsigned int port, range;
1096 unsigned long mpaddr;
1097 volatile pte_t *pte;
1098 pte_t old_pte;
1100 if (space->sparse) {
1101 port = IO_SPACE_SPARSE_DECODING(mmio_start);
1102 range = IO_SPACE_SPARSE_PORTS_PER_PAGE - 1;
1103 } else {
1104 port = mmio_start;
1105 range = PAGE_SIZE - 1;
1108 port |= IO_SPACE_BASE(IO_SPACE_NR(fp));
1110 if (port < fp || port + range > lp) {
1111 /* Maybe this covers an allowed port. */
1112 if (ioports_has_allowed(d, port, port + range))
1113 continue;
1116 mpaddr = mmio_start | mmio_base;
1117 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1118 BUG_ON(pte == NULL);
1119 BUG_ON(pte_none(*pte));
1121 /* clear pte */
1122 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1124 domain_flush_vtlb_all(d);
1125 return 0;
1128 static void
1129 assign_domain_same_page(struct domain *d,
1130 unsigned long mpaddr, unsigned long size,
1131 unsigned long flags)
1133 //XXX optimization
1134 unsigned long end = PAGE_ALIGN(mpaddr + size);
1135 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
1136 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
1140 int
1141 efi_mmio(unsigned long physaddr, unsigned long size)
1143 void *efi_map_start, *efi_map_end;
1144 u64 efi_desc_size;
1145 void* p;
1147 efi_map_start = __va(ia64_boot_param->efi_memmap);
1148 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1149 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1151 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1152 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1153 unsigned long start = md->phys_addr;
1154 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1156 if (start <= physaddr && physaddr < end) {
1157 if ((physaddr + size) > end) {
1158 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1159 __func__, physaddr, size);
1160 return 0;
1163 // for io space
1164 if (md->type == EFI_MEMORY_MAPPED_IO ||
1165 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1166 return 1;
1169 // for runtime
1170 // see efi_enter_virtual_mode(void)
1171 // in linux/arch/ia64/kernel/efi.c
1172 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1173 !(md->attribute & EFI_MEMORY_WB)) {
1174 return 1;
1177 return 0;
1180 if (physaddr < start) {
1181 break;
1185 return 1;
1188 unsigned long
1189 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1190 unsigned long phys_addr, unsigned long size,
1191 unsigned long flags)
1193 unsigned long addr = mpaddr & PAGE_MASK;
1194 unsigned long end = PAGE_ALIGN(mpaddr + size);
1196 if (size == 0) {
1197 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1198 __func__, d, mpaddr, size);
1200 if (!efi_mmio(phys_addr, size)) {
1201 #ifndef NDEBUG
1202 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1203 __func__, d, mpaddr, size);
1204 #endif
1205 return -EINVAL;
1208 for (phys_addr &= PAGE_MASK; addr < end;
1209 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1210 __assign_domain_page(d, addr, phys_addr, flags);
1213 return mpaddr;
1216 unsigned long
1217 assign_domain_mach_page(struct domain *d,
1218 unsigned long mpaddr, unsigned long size,
1219 unsigned long flags)
1221 BUG_ON(flags & ASSIGN_pgc_allocated);
1222 assign_domain_same_page(d, mpaddr, size, flags);
1223 return mpaddr;
1226 static void
1227 adjust_page_count_info(struct page_info* page)
1229 struct domain* d = page_get_owner(page);
1230 BUG_ON((page->count_info & PGC_count_mask) < 1);
1231 if (d != NULL) {
1232 int ret = get_page(page, d);
1233 BUG_ON(ret == 0);
1234 } else {
1235 u64 x, nx, y;
1237 y = *((u64*)&page->count_info);
1238 do {
1239 x = y;
1240 nx = x + 1;
1242 BUG_ON((x >> 32) != 0);
1243 BUG_ON((nx & PGC_count_mask) != 2);
1244 y = cmpxchg((u64*)&page->count_info, x, nx);
1245 } while (unlikely(y != x));
1249 static void
1250 domain_put_page(struct domain* d, unsigned long mpaddr,
1251 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1253 unsigned long mfn = pte_pfn(old_pte);
1254 struct page_info* page = mfn_to_page(mfn);
1256 if (pte_pgc_allocated(old_pte)) {
1257 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1258 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1259 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1260 } else {
1261 BUG();
1264 if (likely(clear_PGC_allocate)) {
1265 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1266 BUG();
1267 /* put_page() is done by domain_page_flush_and_put() */
1268 } else {
1269 // In this case, the page reference count mustn't be touched.
1270 // domain_page_flush_and_put() decrements it, so we increment
1271 // it in advance. This is the slow path.
1272 //
1273 // guest_remove_page(): owner = d, count_info = 1
1274 // memory_exchange(): owner = NULL, count_info = 1
1275 // XENMEM_add_to_physmap: owner = d, count_info >= 1
1276 adjust_page_count_info(page);
1279 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1282 // The caller must get_page(mfn_to_page(mfn)) before calling.
1283 // The caller must call set_gpfn_from_mfn() beforehand if necessary,
1284 // because the set_gpfn_from_mfn() result must be visible before the pte xchg;
1285 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1286 // flags: ASSIGN_xxx
1287 static void
1288 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1289 unsigned long mfn, unsigned long flags)
1291 struct mm_struct *mm = &d->arch.mm;
1292 volatile pte_t* pte;
1293 pte_t old_pte;
1294 pte_t npte;
1295 unsigned long prot = flags_to_prot(flags);
1297 pte = lookup_alloc_domain_pte(d, mpaddr);
1299 // update pte
1300 npte = pfn_pte(mfn, __pgprot(prot));
1301 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1302 if (pte_mem(old_pte)) {
1303 unsigned long old_mfn = pte_pfn(old_pte);
1305 // The mfn == old_mfn case can happen when a domain maps a granted page
1306 // twice with the same pseudo physical address.
1307 // It's nonsense, but allowed.
1308 // __gnttab_map_grant_ref()
1309 // => create_host_mapping()
1310 // => assign_domain_page_replace()
1311 if (mfn != old_mfn) {
1312 domain_put_page(d, mpaddr, pte, old_pte, 1);
1315 perfc_incr(assign_domain_page_replace);
1318 // The caller must get_page(new_page) beforehand.
1319 // Only steal_page() calls this function.
1320 static int
1321 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1322 struct page_info* old_page,
1323 struct page_info* new_page,
1324 unsigned long flags, int clear_PGC_allocate)
1326 struct mm_struct *mm = &d->arch.mm;
1327 volatile pte_t* pte;
1328 unsigned long old_mfn;
1329 unsigned long old_prot;
1330 pte_t old_pte;
1331 unsigned long new_mfn;
1332 unsigned long new_prot;
1333 pte_t new_pte;
1334 pte_t ret_pte;
1336 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1337 pte = lookup_alloc_domain_pte(d, mpaddr);
1339 again:
1340 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1341 old_mfn = page_to_mfn(old_page);
1342 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1343 if (!pte_present(old_pte)) {
1344 gdprintk(XENLOG_INFO,
1345 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1346 __func__, pte_val(old_pte), old_prot, old_mfn);
1347 return -EINVAL;
1350 new_prot = flags_to_prot(flags);
1351 new_mfn = page_to_mfn(new_page);
1352 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1354 // update pte
1355 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1356 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1357 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1358 goto again;
1361 gdprintk(XENLOG_INFO,
1362 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1363 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1364 __func__,
1365 pte_val(old_pte), old_prot, old_mfn,
1366 pte_val(ret_pte), pte_pfn(ret_pte));
1367 return -EINVAL;
1370 BUG_ON(!pte_mem(old_pte));
1371 BUG_ON(!pte_pgc_allocated(old_pte));
1372 BUG_ON(page_get_owner(old_page) != d);
1373 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1374 BUG_ON(old_mfn == new_mfn);
1376 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1377 if (likely(clear_PGC_allocate)) {
1378 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1379 BUG();
1380 } else {
1381 int ret;
1382 // adjust count_info for domain_page_flush_and_put().
1383 // This is the slow path.
1384 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1385 BUG_ON(d == NULL);
1386 ret = get_page(old_page, d);
1387 BUG_ON(ret == 0);
1390 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1391 perfc_incr(assign_domain_pge_cmpxchg_rel);
1392 return 0;
1395 static void
1396 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1397 int clear_PGC_allocate, unsigned long mfn)
1399 struct mm_struct *mm = &d->arch.mm;
1400 volatile pte_t *pte;
1401 pte_t old_pte;
1402 struct page_info *page;
1404 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1405 if (pte == NULL)
1406 return;
1407 if (pte_none(*pte))
1408 return;
1410 if (mfn == INVALID_MFN) {
1411 // clear pte
1412 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1413 mfn = pte_pfn(old_pte);
1414 } else {
1415 unsigned long old_arflags;
1416 pte_t new_pte;
1417 pte_t ret_pte;
1419 again:
1420 // memory_exchange() calls guest_physmap_remove_page() with
1421 // a stolen page, i.e. page owner == NULL.
1422 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1423 page_get_owner(mfn_to_page(mfn)) != NULL);
1424 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1425 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1426 new_pte = __pte(0);
1428 // update pte
1429 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1430 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1431 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1432 goto again;
1435 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1436 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1437 __func__,
1438 pte_val(old_pte), old_arflags, mfn,
1439 pte_val(ret_pte), pte_pfn(ret_pte));
1440 return;
1442 BUG_ON(mfn != pte_pfn(ret_pte));
1445 page = mfn_to_page(mfn);
1446 BUG_ON((page->count_info & PGC_count_mask) == 0);
1448 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1449 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1450 perfc_incr(zap_domain_page_one);
1453 unsigned long
1454 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1455 unsigned int extent_order)
1457 if (extent_order != 0) {
1458 //XXX
1459 return -ENOSYS;
1462 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1463 perfc_incr(dom0vp_zap_physmap);
1464 return 0;
1467 static unsigned long
1468 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1469 unsigned long mfn_or_gmfn,
1470 unsigned long flags, domid_t domid, int is_gmfn)
1472 int error = -EINVAL;
1473 struct domain* rd;
1474 unsigned long mfn;
1476 /* Not allowed by a domain. */
1477 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1478 return -EINVAL;
1480 rd = rcu_lock_domain_by_id(domid);
1481 if (unlikely(rd == NULL)) {
1482 switch (domid) {
1483 case DOMID_XEN:
1484 rd = dom_xen;
1485 break;
1486 case DOMID_IO:
1487 rd = dom_io;
1488 break;
1489 default:
1490 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1491 "gpfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1492 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1493 return -ESRCH;
1495 BUG_ON(rd == NULL);
1496 rcu_lock_domain(rd);
1499 if (unlikely(rd == d))
1500 goto out1;
1501 /*
1502 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1503 * It can be considered that their p2m conversion is p==m.
1504 */
1505 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1506 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1507 else
1508 mfn = mfn_or_gmfn;
1509 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1510 goto out1;
1512 error = 0;
1513 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1514 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1515 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1516 //don't update p2m table because this page belongs to rd, not d.
1517 perfc_incr(dom0vp_add_physmap);
1518 out1:
1519 rcu_unlock_domain(rd);
1520 return error;
1523 unsigned long
1524 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1525 unsigned long flags, domid_t domid)
1527 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1530 unsigned long
1531 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1532 unsigned long gmfn, unsigned long flags,
1533 domid_t domid)
1535 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1538 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1539 #define P2M_PFN_ROUNDUP(x) (((x) + PTRS_PER_PTE - 1) & \
1540 ~(PTRS_PER_PTE - 1))
1541 #define P2M_PFN_ROUNDDOWN(x) ((x) & ~(PTRS_PER_PTE - 1))
1542 #define P2M_NUM_PFN(x) (((x) + PTRS_PER_PTE - 1) / PTRS_PER_PTE)
1543 #define MD_END(md) ((md)->phys_addr + \
1544 ((md)->num_pages << EFI_PAGE_SHIFT))
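/*
 * Worked example (sketch, assuming the default 16KB page size, i.e.
 * PTRS_PER_PTE == 2048): P2M_PFN_ROUNDUP(1) == 2048,
 * P2M_PFN_ROUNDDOWN(4095) == 2048 and P2M_NUM_PFN(4096) == 2.
 */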
1545 static struct page_info* p2m_pte_zero_page = NULL;
1547 /* This must be called before dom0 p2m table allocation */
1548 void __init
1549 expose_p2m_init(void)
1551 pte_t* pte;
1553 /*
1554 * Initialise our DOMID_P2M domain.
1555 * This domain owns m2p table pages.
1556 */
1557 dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
1558 BUG_ON(dom_p2m == NULL);
1559 dom_p2m->max_pages = ~0U;
1561 pte = pte_alloc_one_kernel(NULL, 0);
1562 BUG_ON(pte == NULL);
1563 smp_mb();// make contents of the page visible.
1564 p2m_pte_zero_page = virt_to_page(pte);
1567 // allocate pgd, pmd of dest_dom if necessary
1568 static int
1569 allocate_pgd_pmd(struct domain* dest_dom, unsigned long dest_gpfn,
1570 struct domain* src_dom,
1571 unsigned long src_gpfn, unsigned long num_src_gpfn)
1573 unsigned long i = 0;
1575 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1576 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1578 while (i < num_src_gpfn) {
1579 volatile pte_t* src_pte;
1580 volatile pte_t* dest_pte;
1582 src_pte = lookup_noalloc_domain_pte(src_dom,
1583 (src_gpfn + i) << PAGE_SHIFT);
1584 if (src_pte == NULL) {
1585 i++;
1586 continue;
1589 dest_pte = lookup_alloc_domain_pte(dest_dom,
1590 (dest_gpfn << PAGE_SHIFT) +
1591 i * sizeof(pte_t));
1592 if (dest_pte == NULL) {
1593 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n",
1594 __func__);
1595 return -ENOMEM;
1598 // skip to next pte page
1599 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1601 return 0;
1604 static int
1605 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1607 int ret = get_page(page, dom_p2m);
1608 BUG_ON(ret != 1);
1609 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1610 ASSIGN_readonly);
1613 // expose pte page
1614 static int
1615 expose_p2m_range(struct domain* dest_dom, unsigned long dest_gpfn,
1616 struct domain* src_dom,
1617 unsigned long src_gpfn, unsigned long num_src_gpfn)
1619 unsigned long i = 0;
1621 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1622 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1624 while (i < num_src_gpfn) {
1625 volatile pte_t* pte;
1627 pte = lookup_noalloc_domain_pte(src_dom, (src_gpfn + i) << PAGE_SHIFT);
1628 if (pte == NULL) {
1629 i++;
1630 continue;
1633 if (expose_p2m_page(dest_dom,
1634 (dest_gpfn << PAGE_SHIFT) + i * sizeof(pte_t),
1635 virt_to_page(pte)) < 0) {
1636 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1637 return -EAGAIN;
1640 // skip to next pte page
1641 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1643 return 0;
1646 // expose p2m_pte_zero_page
1647 static int
1648 expose_zero_page(struct domain* dest_dom, unsigned long dest_gpfn,
1649 unsigned long num_src_gpfn)
1651 unsigned long i;
1653 for (i = 0; i < P2M_NUM_PFN(num_src_gpfn); i++) {
1654 volatile pte_t* pte;
1655 pte = lookup_noalloc_domain_pte(dest_dom,
1656 (dest_gpfn + i) << PAGE_SHIFT);
1657 if (pte == NULL || pte_present(*pte))
1658 continue;
1660 if (expose_p2m_page(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1661 p2m_pte_zero_page) < 0) {
1662 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n",
1663 __func__);
1664 return -EAGAIN;
1667 return 0;
1670 static int
1671 expose_p2m(struct domain* dest_dom, unsigned long dest_gpfn,
1672 struct domain* src_dom,
1673 unsigned long src_gpfn, unsigned long num_src_gpfn)
1675 if (allocate_pgd_pmd(dest_dom, dest_gpfn,
1676 src_dom, src_gpfn, num_src_gpfn))
1677 return -ENOMEM;
1679 if (expose_p2m_range(dest_dom, dest_gpfn,
1680 src_dom, src_gpfn, num_src_gpfn))
1681 return -EAGAIN;
1683 if (expose_zero_page(dest_dom, dest_gpfn, num_src_gpfn))
1684 return -EAGAIN;
1686 return 0;
1689 static void
1690 unexpose_p2m(struct domain* dest_dom,
1691 unsigned long dest_gpfn, unsigned long num_dest_gpfn)
1693 unsigned long i;
1695 for (i = 0; i < num_dest_gpfn; i++) {
1696 zap_domain_page_one(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1697 0, INVALID_MFN);
1701 // It is possible to optimize this loop, but it isn't performance critical.
1702 unsigned long
1703 dom0vp_expose_p2m(struct domain* d,
1704 unsigned long conv_start_gpfn,
1705 unsigned long assign_start_gpfn,
1706 unsigned long expose_size, unsigned long granule_pfn)
1708 unsigned long ret;
1709 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1711 if ((expose_size % PAGE_SIZE) != 0 ||
1712 (granule_pfn % PTRS_PER_PTE) != 0 ||
1713 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1714 (conv_start_gpfn % granule_pfn) != 0 ||
1715 (assign_start_gpfn % granule_pfn) != 0 ||
1716 (expose_num_pfn % granule_pfn) != 0) {
1717 gdprintk(XENLOG_INFO,
1718 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1719 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1720 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1721 return -EINVAL;
1724 if (granule_pfn != PTRS_PER_PTE) {
1725 gdprintk(XENLOG_INFO,
1726 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1727 __func__, granule_pfn, PTRS_PER_PTE);
1728 return -ENOSYS;
1730 ret = expose_p2m(d, assign_start_gpfn,
1731 d, conv_start_gpfn, expose_num_pfn);
1732 return ret;
1735 static int
1736 memmap_info_copy_from_guest(struct xen_ia64_memmap_info* memmap_info,
1737 char** memmap_p,
1738 XEN_GUEST_HANDLE(char) buffer)
1740 char *memmap;
1741 char *p;
1742 char *memmap_end;
1743 efi_memory_desc_t *md;
1744 unsigned long start;
1745 unsigned long end;
1746 efi_memory_desc_t *prev_md;
1748 if (copy_from_guest((char*)memmap_info, buffer, sizeof(*memmap_info)))
1749 return -EFAULT;
1750 if (memmap_info->efi_memdesc_size < sizeof(efi_memory_desc_t) ||
1751 memmap_info->efi_memmap_size < memmap_info->efi_memdesc_size ||
1752 (memmap_info->efi_memmap_size % memmap_info->efi_memdesc_size) != 0)
1753 return -EINVAL;
1755 memmap = _xmalloc(memmap_info->efi_memmap_size,
1756 __alignof__(efi_memory_desc_t));
1757 if (memmap == NULL)
1758 return -ENOMEM;
1759 if (copy_from_guest_offset(memmap, buffer, sizeof(*memmap_info),
1760 memmap_info->efi_memmap_size)) {
1761 xfree(memmap);
1762 return -EFAULT;
1765 /* integrity check & simplification */
1766 sort(memmap, memmap_info->efi_memmap_size / memmap_info->efi_memdesc_size,
1767 memmap_info->efi_memdesc_size, efi_mdt_cmp, NULL);
1769 /* alignment & overlap check */
1770 prev_md = NULL;
1771 p = memmap;
1772 memmap_end = memmap + memmap_info->efi_memmap_size;
1773 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1774 md = (efi_memory_desc_t*)p;
1775 start = md->phys_addr;
1777 if (start & ((1UL << EFI_PAGE_SHIFT) - 1) || md->num_pages == 0) {
1778 xfree(memmap);
1779 return -EINVAL;
1782 if (prev_md != NULL) {
1783 unsigned long prev_end = MD_END(prev_md);
1784 if (prev_end > start) {
1785 xfree(memmap);
1786 return -EINVAL;
1790 prev_md = (efi_memory_desc_t *)p;
1793 /* coalesce */
1794 prev_md = NULL;
1795 p = memmap;
1796 while (p < memmap_end) {
1797 md = (efi_memory_desc_t*)p;
1798 start = md->phys_addr;
1799 end = MD_END(md);
1801 start = P2M_PFN_ROUNDDOWN(start >> PAGE_SHIFT) << PAGE_SHIFT;
1802 end = P2M_PFN_ROUNDUP(end >> PAGE_SHIFT) << PAGE_SHIFT;
1803 md->phys_addr = start;
1804 md->num_pages = (end - start) >> EFI_PAGE_SHIFT;
1806 if (prev_md != NULL) {
1807 unsigned long prev_end = MD_END(prev_md);
1808 if (prev_end >= start) {
1809 size_t left;
1810 end = max(prev_end, end);
1811 prev_md->num_pages = (end - prev_md->phys_addr) >> EFI_PAGE_SHIFT;
1813 left = memmap_end - p;
1814 if (left > memmap_info->efi_memdesc_size) {
1815 left -= memmap_info->efi_memdesc_size;
1816 memmove(p, p + memmap_info->efi_memdesc_size, left);
1819 memmap_info->efi_memmap_size -= memmap_info->efi_memdesc_size;
1820 memmap_end -= memmap_info->efi_memdesc_size;
1821 continue;
1825 prev_md = md;
1826 p += memmap_info->efi_memdesc_size;
1829 if (copy_to_guest(buffer, (char*)memmap_info, sizeof(*memmap_info)) ||
1830 copy_to_guest_offset(buffer, sizeof(*memmap_info),
1831 (char*)memmap, memmap_info->efi_memmap_size)) {
1832 xfree(memmap);
1833 return -EFAULT;
1836 *memmap_p = memmap;
1837 return 0;
1840 static int
1841 foreign_p2m_allocate_pte(struct domain* d,
1842 const struct xen_ia64_memmap_info* memmap_info,
1843 const void* memmap)
1845 const void* memmap_end = memmap + memmap_info->efi_memmap_size;
1846 const void* p;
1848 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1849 const efi_memory_desc_t* md = p;
1850 unsigned long start = md->phys_addr;
1851 unsigned long end = MD_END(md);
1852 unsigned long gpaddr;
1854 for (gpaddr = start; gpaddr < end; gpaddr += PAGE_SIZE) {
1855 if (lookup_alloc_domain_pte(d, gpaddr) == NULL) {
1856 return -ENOMEM;
1861 return 0;
1864 struct foreign_p2m_region {
1865 unsigned long gpfn;
1866 unsigned long num_gpfn;
1867 };
1869 struct foreign_p2m_entry {
1870 struct list_head list;
1871 int busy;
1873 /* src domain */
1874 struct domain* src_dom;
1876 /* region into which foreign p2m table is mapped */
1877 unsigned long gpfn;
1878 unsigned long num_gpfn;
1879 unsigned int num_region;
1880 struct foreign_p2m_region region[0];
1881 };
1883 /* caller must increment the reference count of src_dom */
1884 static int
1885 foreign_p2m_alloc(struct foreign_p2m* foreign_p2m,
1886 unsigned long dest_gpfn, struct domain* src_dom,
1887 struct xen_ia64_memmap_info* memmap_info, void* memmap,
1888 struct foreign_p2m_entry** entryp)
1890 void* memmap_end = memmap + memmap_info->efi_memmap_size;
1891 efi_memory_desc_t* md;
1892 unsigned long dest_gpfn_end;
1893 unsigned long src_gpfn;
1894 unsigned long src_gpfn_end;
1896 unsigned int num_region;
1897 struct foreign_p2m_entry* entry;
1898 struct foreign_p2m_entry* prev;
1899 struct foreign_p2m_entry* pos;
1901 num_region = (memmap_end - memmap) / memmap_info->efi_memdesc_size;
1903 md = memmap;
1904 src_gpfn = P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
1906 md = memmap + (num_region - 1) * memmap_info->efi_memdesc_size;
1907 src_gpfn_end = MD_END(md) >> PAGE_SHIFT;
1908 if (src_gpfn_end >
1909 P2M_PFN_ROUNDUP(src_dom->arch.convmem_end >> PAGE_SHIFT))
1910 return -EINVAL;
1912 src_gpfn_end = P2M_PFN_ROUNDUP(src_gpfn_end);
1913 dest_gpfn_end = dest_gpfn + P2M_NUM_PFN(src_gpfn_end - src_gpfn);
1914 entry = _xmalloc(sizeof(*entry) + num_region * sizeof(entry->region[0]),
1915 __alignof__(*entry));
1916 if (entry == NULL)
1917 return -ENOMEM;
1919 entry->busy = 1;
1920 entry->gpfn = dest_gpfn;
1921 entry->num_gpfn = dest_gpfn_end - dest_gpfn;
1922 entry->src_dom = src_dom;
1923 entry->num_region = 0;
1924 memset(entry->region, 0, sizeof(entry->region[0]) * num_region);
1925 prev = NULL;
1927 spin_lock(&foreign_p2m->lock);
1928 if (list_empty(&foreign_p2m->head))
1929 prev = (struct foreign_p2m_entry*)&foreign_p2m->head;
1931 list_for_each_entry(pos, &foreign_p2m->head, list) {
1932 if (pos->gpfn + pos->num_gpfn < dest_gpfn) {
1933 prev = pos;
1934 continue;
1937 if (dest_gpfn_end < pos->gpfn) {
1938 if (prev != NULL && prev->gpfn + prev->num_gpfn > dest_gpfn)
1939 prev = NULL;/* overlap */
1940 break;
1943 /* overlap */
1944 prev = NULL;
1945 break;
1947 if (prev != NULL) {
1948 list_add(&entry->list, &prev->list);
1949 spin_unlock(&foreign_p2m->lock);
1950 *entryp = entry;
1951 return 0;
1953 spin_unlock(&foreign_p2m->lock);
1954 xfree(entry);
1955 return -EBUSY;
1958 static void
1959 foreign_p2m_unexpose(struct domain* dest_dom, struct foreign_p2m_entry* entry)
1961 unsigned int i;
1963 BUG_ON(!entry->busy);
1964 for (i = 0; i < entry->num_region; i++)
1965 unexpose_p2m(dest_dom,
1966 entry->region[i].gpfn, entry->region[i].num_gpfn);
1969 static void
1970 foreign_p2m_unbusy(struct foreign_p2m* foreign_p2m,
1971 struct foreign_p2m_entry* entry)
1973 spin_lock(&foreign_p2m->lock);
1974 BUG_ON(!entry->busy);
1975 entry->busy = 0;
1976 spin_unlock(&foreign_p2m->lock);
1979 static void
1980 foreign_p2m_free(struct foreign_p2m* foreign_p2m,
1981 struct foreign_p2m_entry* entry)
1983 spin_lock(&foreign_p2m->lock);
1984 BUG_ON(!entry->busy);
1985 list_del(&entry->list);
1986 spin_unlock(&foreign_p2m->lock);
1988 put_domain(entry->src_dom);
1989 xfree(entry);
1992 void
1993 foreign_p2m_init(struct domain* d)
1995 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
1996 INIT_LIST_HEAD(&foreign_p2m->head);
1997 spin_lock_init(&foreign_p2m->lock);
2000 void
2001 foreign_p2m_destroy(struct domain* d)
2003 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
2004 struct foreign_p2m_entry* entry;
2005 struct foreign_p2m_entry* n;
2007 spin_lock(&foreign_p2m->lock);
2008 list_for_each_entry_safe(entry, n, &foreign_p2m->head, list) {
2009 /* mm_teardown() cleared p2m table already */
2010 /* foreign_p2m_unexpose(d, entry);*/
2011 list_del(&entry->list);
2012 put_domain(entry->src_dom);
2013 xfree(entry);
2015 spin_unlock(&foreign_p2m->lock);
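/*
 * dom0vp_expose_foreign_p2m(): map the p2m table of domain 'domid'
 * into dest_dom's pseudo-physical space at dest_gpfn.  The flow:
 * copy the caller-supplied memmap, take a reference on the source
 * domain, optionally pre-allocate its pte pages
 * (IA64_DOM0VP_EFP_ALLOC_PTE), reserve the destination window with
 * foreign_p2m_alloc(), then expose_p2m() each memmap region while
 * recording it in the entry.  On failure everything exposed so far is
 * unexposed again and the entry is freed.
 */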
2018 unsigned long
2019 dom0vp_expose_foreign_p2m(struct domain* dest_dom,
2020 unsigned long dest_gpfn,
2021 domid_t domid,
2022 XEN_GUEST_HANDLE(char) buffer,
2023 unsigned long flags)
2025 unsigned long ret = 0;
2026 struct domain* src_dom;
2027 struct xen_ia64_memmap_info memmap_info;
2028 char* memmap;
2029 void* memmap_end;
2030 void* p;
2032 struct foreign_p2m_entry* entry;
2034 ret = memmap_info_copy_from_guest(&memmap_info, &memmap, buffer);
2035 if (ret != 0)
2036 return ret;
2038 dest_dom = rcu_lock_domain(dest_dom);
2039 if (dest_dom == NULL) {
2040 ret = -EINVAL;
2041 goto out;
2043 #if 1
2044 // Exposing a domain's p2m to itself isn't allowed.
2045 // Otherwise the domain could never be destroyed because
2046 // nothing would decrement its reference count.
2047 if (domid == dest_dom->domain_id) {
2048 ret = -EINVAL;
2049 goto out;
2051 #endif
2053 src_dom = get_domain_by_id(domid);
2054 if (src_dom == NULL) {
2055 ret = -EINVAL;
2056 goto out_unlock;
2059 if (flags & IA64_DOM0VP_EFP_ALLOC_PTE) {
2060 ret = foreign_p2m_allocate_pte(src_dom, &memmap_info, memmap);
2061 if (ret != 0)
2062 goto out_unlock;
2065 ret = foreign_p2m_alloc(&dest_dom->arch.foreign_p2m, dest_gpfn,
2066 src_dom, &memmap_info, memmap, &entry);
2067 if (ret != 0)
2068 goto out_unlock;
2070 memmap_end = memmap + memmap_info.efi_memmap_size;
2071 for (p = memmap; p < memmap_end; p += memmap_info.efi_memdesc_size) {
2072 efi_memory_desc_t* md = p;
2073 unsigned long src_gpfn =
2074 P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
2075 unsigned long src_gpfn_end =
2076 P2M_PFN_ROUNDUP(MD_END(md) >> PAGE_SHIFT);
2077 unsigned long num_src_gpfn = src_gpfn_end - src_gpfn;
2079 ret = expose_p2m(dest_dom, dest_gpfn + src_gpfn / PTRS_PER_PTE,
2080 src_dom, src_gpfn, num_src_gpfn);
2081 if (ret != 0)
2082 break;
2084 entry->region[entry->num_region].gpfn =
2085 dest_gpfn + src_gpfn / PTRS_PER_PTE;
2086 entry->region[entry->num_region].num_gpfn = P2M_NUM_PFN(num_src_gpfn);
2087 entry->num_region++;
2090 if (ret == 0) {
2091 foreign_p2m_unbusy(&dest_dom->arch.foreign_p2m, entry);
2092 } else {
2093 foreign_p2m_unexpose(dest_dom, entry);
2094 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2097 out_unlock:
2098 rcu_unlock_domain(dest_dom);
2099 out:
2100 xfree(memmap);
2101 return ret;
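/*
 * dom0vp_unexpose_foreign_p2m(): find the exposure that starts exactly
 * at dest_gpfn and belongs to 'domid', mark it busy to exclude racing
 * callers, then unexpose all of its regions and free the entry.
 */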
2104 unsigned long
2105 dom0vp_unexpose_foreign_p2m(struct domain* dest_dom,
2106 unsigned long dest_gpfn,
2107 domid_t domid)
2109 int ret = -ENOENT;
2110 struct foreign_p2m* foreign_p2m = &dest_dom->arch.foreign_p2m;
2111 struct foreign_p2m_entry* entry;
2113 dest_dom = rcu_lock_domain(dest_dom);
2114 if (dest_dom == NULL)
2115 return ret;
2116 spin_lock(&foreign_p2m->lock);
2117 list_for_each_entry(entry, &foreign_p2m->head, list) {
2118 if (entry->gpfn < dest_gpfn)
2119 continue;
2120 if (dest_gpfn < entry->gpfn)
2121 break;
2123 if (domid == entry->src_dom->domain_id)
2124 ret = 0;
2125 else
2126 ret = -EINVAL;
2127 break;
2129 if (ret == 0) {
2130 if (entry->busy == 0)
2131 entry->busy = 1;
2132 else
2133 ret = -EBUSY;
2135 spin_unlock(&foreign_p2m->lock);
2137 if (ret == 0) {
2138 foreign_p2m_unexpose(dest_dom, entry);
2139 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2141 rcu_unlock_domain(dest_dom);
2142 return ret;
2144 #endif
2146 // grant table host mapping
2147 // mpaddr: host_addr: pseudo physical address
2148 // mfn: frame: machine page frame
2149 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
2150 int
2151 create_grant_host_mapping(unsigned long gpaddr, unsigned long mfn,
2152 unsigned int flags, unsigned int cache_flags)
2154 struct domain* d = current->domain;
2155 struct page_info* page;
2156 int ret;
2158 if ((flags & (GNTMAP_device_map |
2159 GNTMAP_application_map | GNTMAP_contains_pte)) ||
2160 (cache_flags)) {
2161 gdprintk(XENLOG_INFO, "%s: flags 0x%x cache_flags 0x%x\n",
2162 __func__, flags, cache_flags);
2163 return GNTST_general_error;
2166 BUG_ON(!mfn_valid(mfn));
2167 page = mfn_to_page(mfn);
2168 ret = get_page(page, page_get_owner(page));
2169 BUG_ON(ret == 0);
2170 assign_domain_page_replace(d, gpaddr, mfn,
2171 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2172 ASSIGN_tlb_track |
2173 #endif
2174 ((flags & GNTMAP_readonly) ?
2175 ASSIGN_readonly : ASSIGN_writable));
2176 perfc_incr(create_grant_host_mapping);
2177 return GNTST_okay;
2180 // grant table host unmapping
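// replace_grant_host_mapping() tears down the grant mapping at gpaddr.
// If new_gpaddr is non-zero (apparently the GNTTABOP_unmap_and_replace
// path), the page currently mapped at new_gpaddr is moved into the
// slot at gpaddr instead of simply clearing it.  The p2m entry is
// swapped with a cmpxchg and retried if another CPU races on the pte.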
2181 int
2182 replace_grant_host_mapping(unsigned long gpaddr,
2183 unsigned long mfn, unsigned long new_gpaddr, unsigned int flags)
2185 struct domain* d = current->domain;
2186 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
2187 volatile pte_t* pte;
2188 unsigned long cur_arflags;
2189 pte_t cur_pte;
2190 pte_t new_pte = __pte(0);
2191 pte_t old_pte;
2192 struct page_info* page = mfn_to_page(mfn);
2193 struct page_info* new_page = NULL;
2194 volatile pte_t* new_page_pte = NULL;
2195 unsigned long new_page_mfn = INVALID_MFN;
2197 if (new_gpaddr) {
2198 new_page_pte = lookup_noalloc_domain_pte_none(d, new_gpaddr);
2199 if (likely(new_page_pte != NULL)) {
2200 new_pte = ptep_get_and_clear(&d->arch.mm,
2201 new_gpaddr, new_page_pte);
2202 if (likely(pte_present(new_pte))) {
2203 struct domain* page_owner;
2205 new_page_mfn = pte_pfn(new_pte);
2206 new_page = mfn_to_page(new_page_mfn);
2207 page_owner = page_get_owner(new_page);
2208 if (unlikely(page_owner == NULL)) {
2209 gdprintk(XENLOG_INFO,
2210 "%s: page_owner == NULL "
2211 "gpaddr 0x%lx mfn 0x%lx "
2212 "new_gpaddr 0x%lx mfn 0x%lx\n",
2213 __func__, gpaddr, mfn, new_gpaddr, new_page_mfn);
2214 new_page = NULL; /* prevent domain_put_page() */
2215 return GNTST_general_error;
2218 /*
2219 * domain_put_page(clear_PGC_allocated = 0)
2220 * doesn't decrement the refcount of a page with
2221 * pte_pgc_allocated() = 1. Be careful.
2222 */
2223 if (unlikely(!pte_pgc_allocated(new_pte))) {
2224 /* domain_put_page() decrements page refcount. adjust it. */
2225 if (get_page(new_page, page_owner)) {
2226 gdprintk(XENLOG_INFO,
2227 "%s: get_page() failed. "
2228 "gpaddr 0x%lx mfn 0x%lx "
2229 "new_gpaddr 0x%lx mfn 0x%lx\n",
2230 __func__, gpaddr, mfn,
2231 new_gpaddr, new_page_mfn);
2232 return GNTST_general_error;
2235 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 0);
2236 } else
2237 new_pte = __pte(0);
2241 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
2242 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
2243 return GNTST_general_error;
2246 pte = lookup_noalloc_domain_pte(d, gpaddr);
2247 if (pte == NULL) {
2248 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
2249 __func__, gpaddr, mfn);
2250 return GNTST_general_error;
2253 again:
2254 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
2255 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
2256 if (!pte_present(cur_pte) ||
2257 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
2258 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
2259 __func__, gpaddr, mfn, pte_val(cur_pte));
2260 return GNTST_general_error;
2263 if (new_page) {
2264 BUG_ON(new_page_mfn == INVALID_MFN);
2265 set_gpfn_from_mfn(new_page_mfn, gpfn);
2266 /* smp_mb() isn't needed because ptep_cmpxchg_rel() below
2267 has release semantics. */
2269 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
2270 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
2271 if (pte_pfn(old_pte) == mfn) {
2272 goto again;
2274 if (new_page) {
2275 BUG_ON(new_page_mfn == INVALID_MFN);
2276 set_gpfn_from_mfn(new_page_mfn, INVALID_M2P_ENTRY);
2277 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 1);
2279 goto out;
2281 if (unlikely(!pte_present(old_pte)))
2282 goto out;
2283 BUG_ON(pte_pfn(old_pte) != mfn);
2285 /* try_to_clear_PGC_allocate(d, page) is not needed. */
2286 BUG_ON(page_get_owner(page) == d &&
2287 get_gpfn_from_mfn(mfn) == gpfn);
2288 BUG_ON(pte_pgc_allocated(old_pte));
2289 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
2291 perfc_incr(replace_grant_host_mapping);
2292 return GNTST_okay;
2294 out:
2295 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
2296 "0x%lx old_pte 0x%lx\n",
2297 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
2298 return GNTST_general_error;
2301 // This heavily depends on the struct page_info layout.
2302 // gnttab_transfer() calls steal_page() with memflags = 0.
2303 //   For a grant table transfer, the vacated gpfn must be backfilled here.
2304 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount.
2305 //   For memory exchange, no backfill is needed here because
2306 //   memory_exchange() does it itself.
2307 int
2308 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
2310 #if 0 /* if big endian */
2311 # error "implement big endian version of steal_page()"
2312 #endif
2313 u32 _d, _nd;
2314 u64 x, nx, y;
2316 if (page_get_owner(page) != d) {
2317 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
2318 __func__, d, page_get_owner(page));
2319 return -1;
2322 if (!(memflags & MEMF_no_refcount)) {
2323 unsigned long gpfn;
2324 struct page_info *new;
2325 unsigned long new_mfn;
2326 int ret;
2328 new = alloc_domheap_page(d, 0);
2329 if (new == NULL) {
2330 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
2331 return -1;
2333 // zero out pages for security reasons
2334 clear_page(page_to_virt(new));
2335 // assign_domain_page_cmpxchg_rel() has release semantics
2336 // so smp_mb() isn't needed.
2338 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
2339 if (gpfn == INVALID_M2P_ENTRY) {
2340 free_domheap_page(new);
2341 return -1;
2343 new_mfn = page_to_mfn(new);
2344 set_gpfn_from_mfn(new_mfn, gpfn);
2345 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
2346 // has release semantics.
2348 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
2349 ASSIGN_writable |
2350 ASSIGN_pgc_allocated, 0);
2351 if (ret < 0) {
2352 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
2353 ret);
2354 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
2355 free_domheap_page(new);
2356 return -1;
2358 perfc_incr(steal_page_refcount);
2361 spin_lock(&d->page_alloc_lock);
2363 /*
2364 * The tricky bit: atomically release ownership while there is just one
2365 * benign reference to the page (PGC_allocated). If that reference
2366 * disappears then the deallocation routine will safely spin.
2367 */
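    /*
     * Note: the 64-bit cmpxchg below covers both halves of the word
     * starting at &page->count_info: the low 32 bits hold count_info
     * and the high 32 bits hold the pickled owner pointer, so the
     * reference count and ownership are checked and updated atomically
     * together.
     */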
2368 _d = pickle_domptr(d);
2369 y = *((u64*)&page->count_info);
2370 do {
2371 x = y;
2372 nx = x & 0xffffffff;
2373 // page->count_info: untouched
2374 // page->u.inuse._domain = 0;
2375 _nd = x >> 32;
2377 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
2378 (1 | PGC_allocated))) ||
2379 unlikely(_nd != _d)) {
2380 struct domain* nd = unpickle_domptr(_nd);
2381 if (nd == NULL) {
2382 gdprintk(XENLOG_INFO, "gnttab_transfer: "
2383 "Bad page %p: ed=%p(%u) 0x%x, "
2384 "sd=%p 0x%x,"
2385 " caf=%016lx, taf=%" PRtype_info
2386 " memflags 0x%x\n",
2387 (void *) page_to_mfn(page),
2388 d, d->domain_id, _d,
2389 nd, _nd,
2390 x,
2391 page->u.inuse.type_info,
2392 memflags);
2393 } else {
2394 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
2395 "Bad page %p: ed=%p(%u) 0x%x, "
2396 "sd=%p(%u) 0x%x,"
2397 " caf=%016lx, taf=%" PRtype_info
2398 " memflags 0x%x\n",
2399 (void *) page_to_mfn(page),
2400 d, d->domain_id, _d,
2401 nd, nd->domain_id, _nd,
2402 x,
2403 page->u.inuse.type_info,
2404 memflags);
2406 spin_unlock(&d->page_alloc_lock);
2407 return -1;
2410 y = cmpxchg((u64*)&page->count_info, x, nx);
2411 } while (unlikely(y != x));
2413 /*
2414 * Unlink from 'd'. At least one reference remains (now anonymous), so
2415 * no one else is spinning to try to delete this page from 'd'.
2416 */
2417 if ( !(memflags & MEMF_no_refcount) )
2418 d->tot_pages--;
2419 list_del(&page->list);
2421 spin_unlock(&d->page_alloc_lock);
2422 perfc_incr(steal_page);
2423 return 0;
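/*
 * __guest_physmap_add_page(): set the m2p entry first and order it
 * (smp_mb) before installing the p2m entry, so that anyone who can see
 * the new p2m mapping also sees a consistent m2p entry.
 */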
2426 static void
2427 __guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2428 unsigned long mfn)
2430 set_gpfn_from_mfn(mfn, gpfn);
2431 smp_mb();
2432 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
2433 ASSIGN_writable | ASSIGN_pgc_allocated);
2436 int
2437 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2438 unsigned long mfn, unsigned int page_order)
2440 unsigned long i;
2442 for (i = 0; i < (1UL << page_order); i++) {
2443 BUG_ON(!mfn_valid(mfn));
2444 BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
2445 __guest_physmap_add_page(d, gpfn, mfn);
2446 mfn++;
2447 gpfn++;
2450 perfc_incr(guest_physmap_add_page);
2451 return 0;
2454 void
2455 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
2456 unsigned long mfn, unsigned int page_order)
2458 unsigned long i;
2460 BUG_ON(mfn == 0);//XXX
2462 for (i = 0; i < (1UL << page_order); i++)
2463 zap_domain_page_one(d, (gpfn+i) << PAGE_SHIFT, 0, mfn+i);
2465 perfc_incr(guest_physmap_remove_page);
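/*
 * domain_page_flush_and_put(): flush any stale vTLB/VHPT translations
 * for the zapped pte and then drop the page reference.  With
 * CONFIG_XEN_IA64_TLB_TRACK the tlb_track result selects between no
 * flush, a targeted per-entry flush and a full vTLB flush; without it
 * a full flush is always performed.
 */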
2468 static void
2469 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
2470 volatile pte_t* ptep, pte_t old_pte,
2471 struct page_info* page)
2473 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2474 struct tlb_track_entry* entry;
2475 #endif
2477 if (shadow_mode_enabled(d))
2478 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
2480 #ifndef CONFIG_XEN_IA64_TLB_TRACK
2481 //XXX sledgehammer.
2482 // Ideally only the affected range would be flushed.
2483 domain_flush_vtlb_all(d);
2484 put_page(page);
2485 #else
2486 switch (tlb_track_search_and_remove(d->arch.tlb_track,
2487 ptep, old_pte, &entry)) {
2488 case TLB_TRACK_NOT_TRACKED:
2489 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
2490 /* This page is zapped from this domain
2491 * by memory decrease or exchange or by dom0vp_zap_physmap,
2492 * i.e. the page is zapped either to return it to Xen
2493 * (balloon driver or DMA page allocation) or because a
2494 * foreign-domain-mapped page is unmapped from this domain.
2495 * In the former case the page is about to be freed, so
2496 * freeing of the page could be deferred and batched.
2497 * In the latter case the page was unmapped, so the vTLB
2498 * needs to be flushed; to optimize this, pages could be
2499 * queued and the vTLB flushed only once, with the caller
2500 * then calling dfree_flush() explicitly.
2501 */
2502 domain_flush_vtlb_all(d);
2503 put_page(page);
2504 break;
2505 case TLB_TRACK_NOT_FOUND:
2506 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
2507 /* This page is zapped from this domain
2508 * by grant table page unmap.
2509 * Luckily the domain that mapped this page never
2510 * accessed it, so the vTLB doesn't have to be flushed.
2511 * Probably the domain only did DMA to/from the page.
2512 */
2513 /* do nothing */
2514 put_page(page);
2515 break;
2516 case TLB_TRACK_FOUND:
2517 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
2518 /* This page is zapped from this domain
2519 * by grant table page unmap.
2520 * Fortunately this page was accessed via only one virtual
2521 * address, so it is easy to flush.
2522 */
2523 domain_flush_vtlb_track_entry(d, entry);
2524 tlb_track_free_entry(d->arch.tlb_track, entry);
2525 put_page(page);
2526 break;
2527 case TLB_TRACK_MANY:
2528 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
2529 /* This page is zapped from this domain
2530 * by grant table page unmap.
2531 * Unfortunately this page was accessed via many virtual
2532 * addresses (or too many times via a single virtual address),
2533 * so tracking of virtual addresses was abandoned and
2534 * a full vTLB flush is necessary.
2535 */
2536 domain_flush_vtlb_all(d);
2537 put_page(page);
2538 break;
2539 case TLB_TRACK_AGAIN:
2540 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
2541 BUG();
2542 break;
2544 #endif
2545 perfc_incr(domain_page_flush_and_put);
2548 int
2549 domain_page_mapped(struct domain* d, unsigned long mpaddr)
2551 volatile pte_t * pte;
2553 pte = lookup_noalloc_domain_pte(d, mpaddr);
2554 if(pte != NULL && !pte_none(*pte))
2555 return 1;
2556 return 0;
2559 /* Flush cache of domain d. */
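/* Walks the whole p2m tree and, for every present pte, flushes either
 * the i-cache (sync_only) or the d-cache of the backing page. */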
2560 void domain_cache_flush (struct domain *d, int sync_only)
2562 struct mm_struct *mm = &d->arch.mm;
2563 volatile pgd_t *pgd = mm->pgd;
2564 unsigned long maddr;
2565 int i,j,k, l;
2566 int nbr_page = 0;
2567 void (*flush_func)(unsigned long start, unsigned long end);
2568 extern void flush_dcache_range (unsigned long, unsigned long);
2570 if (sync_only)
2571 flush_func = &flush_icache_range;
2572 else
2573 flush_func = &flush_dcache_range;
2575 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
2576 volatile pud_t *pud;
2577 if (!pgd_present(*pgd)) // acquire semantics
2578 continue;
2579 pud = pud_offset(pgd, 0);
2580 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
2581 volatile pmd_t *pmd;
2582 if (!pud_present(*pud)) // acquire semantics
2583 continue;
2584 pmd = pmd_offset(pud, 0);
2585 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
2586 volatile pte_t *pte;
2587 if (!pmd_present(*pmd)) // acquire semantics
2588 continue;
2589 pte = pte_offset_map(pmd, 0);
2590 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
2591 if (!pte_present(*pte)) // acquire semantics
2592 continue;
2593 /* Convert PTE to maddr. */
2594 maddr = __va_ul (pte_val(*pte)
2595 & _PAGE_PPN_MASK);
2596 (*flush_func)(maddr, maddr+ PAGE_SIZE);
2597 nbr_page++;
2602 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
2605 #ifdef VERBOSE
2606 #define MEM_LOG(_f, _a...) \
2607 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
2608 current->domain->domain_id , __LINE__ , ## _a )
2609 #else
2610 #define MEM_LOG(_f, _a...) ((void)0)
2611 #endif
2613 static void free_page_type(struct page_info *page, u32 type)
2617 static int alloc_page_type(struct page_info *page, u32 type)
2619 return 1;
2622 static int opt_p2m_xenheap;
2623 boolean_param("p2m_xenheap", opt_p2m_xenheap);
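/*
 * p2m page-table pages are normally taken from dom_p2m's domheap;
 * booting with "p2m_xenheap" switches the allocation to the xenheap,
 * in which case the page is also shared with dom_p2m via
 * share_xen_page_with_guest().  Either way the page is zeroed first.
 */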
2625 void *pgtable_quicklist_alloc(void)
2627 void *p;
2629 BUG_ON(dom_p2m == NULL);
2630 if (!opt_p2m_xenheap) {
2631 struct page_info *page = alloc_domheap_page(dom_p2m, 0);
2632 if (page == NULL)
2633 return NULL;
2634 p = page_to_virt(page);
2635 clear_page(p);
2636 return p;
2638 p = alloc_xenheap_pages(0);
2639 if (p) {
2640 clear_page(p);
2641 /*
2642 * This page should be read-only. At the moment the third
2643 * argument has no effect; it should be 1 once supported.
2644 */
2645 share_xen_page_with_guest(virt_to_page(p), dom_p2m, 0);
2647 return p;
2650 void pgtable_quicklist_free(void *pgtable_entry)
2652 struct page_info* page = virt_to_page(pgtable_entry);
2654 BUG_ON(page_get_owner(page) != dom_p2m);
2655 BUG_ON(page->count_info != (1 | PGC_allocated));
2657 put_page(page);
2658 if (opt_p2m_xenheap)
2659 free_xenheap_page(pgtable_entry);
2662 void put_page_type(struct page_info *page)
2664 u64 nx, x, y = page->u.inuse.type_info;
2666 again:
2667 do {
2668 x = y;
2669 nx = x - 1;
2671 ASSERT((x & PGT_count_mask) != 0);
2673 /*
2674 * The page should always be validated while a reference is held. The
2675 * exception is during domain destruction, when we forcibly invalidate
2676 * page-table pages if we detect a referential loop.
2677 * See domain.c:relinquish_list().
2678 */
2679 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
2681 if ( unlikely((nx & PGT_count_mask) == 0) )
2683 /* Record TLB information for flush later. Races are harmless. */
2684 page->tlbflush_timestamp = tlbflush_current_time();
2686 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2687 likely(nx & PGT_validated) )
2689 /*
2690 * Page-table pages must be unvalidated when count is zero. The
2691 * 'free' is safe because the refcnt is non-zero and validated
2692 * bit is clear => other ops will spin or fail.
2693 */
2694 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
2695 x & ~PGT_validated)) != x) )
2696 goto again;
2697 /* We cleared the 'valid bit' so we do the clean up. */
2698 free_page_type(page, x);
2699 /* Carry on, but with the 'valid bit' now clear. */
2700 x &= ~PGT_validated;
2701 nx &= ~PGT_validated;
2705 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
2709 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
2711 struct page_info *page = mfn_to_page(page_nr);
2713 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
2715 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
2716 return 0;
2719 return 1;
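/*
 * get_page_type(): take a type reference using cmpxchg_acq.  When the
 * first reference is taken and the type changes, stale TLB entries are
 * filtered and flushed; types other than PGT_writable_page must then
 * be validated via alloc_page_type() before PGT_validated is set.
 */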
2723 int get_page_type(struct page_info *page, u32 type)
2725 u64 nx, x, y = page->u.inuse.type_info;
2727 ASSERT(!(type & ~PGT_type_mask));
2729 again:
2730 do {
2731 x = y;
2732 nx = x + 1;
2733 if ( unlikely((nx & PGT_count_mask) == 0) )
2735 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2736 return 0;
2738 else if ( unlikely((x & PGT_count_mask) == 0) )
2740 if ( (x & PGT_type_mask) != type )
2742 /*
2743 * On type change we check to flush stale TLB entries. This
2744 * may be unnecessary (e.g., page was GDT/LDT) but those
2745 * circumstances should be very rare.
2746 */
2747 cpumask_t mask =
2748 page_get_owner(page)->domain_dirty_cpumask;
2749 tlbflush_filter(mask, page->tlbflush_timestamp);
2751 if ( unlikely(!cpus_empty(mask)) )
2753 perfc_incr(need_flush_tlb_flush);
2754 flush_tlb_mask(mask);
2757 /* We lose existing type, back pointer, and validity. */
2758 nx &= ~(PGT_type_mask | PGT_validated);
2759 nx |= type;
2761 /* No special validation needed for writable pages. */
2762 /* Page tables and GDT/LDT need to be scanned for validity. */
2763 if ( type == PGT_writable_page )
2764 nx |= PGT_validated;
2767 else if ( unlikely((x & PGT_type_mask) != type) )
2769 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
2770 (type != PGT_l1_page_table) )
2771 MEM_LOG("Bad type (saw %08lx != exp %08x) "
2772 "for mfn %016lx (pfn %016lx)",
2773 x, type, page_to_mfn(page),
2774 get_gpfn_from_mfn(page_to_mfn(page)));
2775 return 0;
2777 else if ( unlikely(!(x & PGT_validated)) )
2779 /* Someone else is updating validation of this page. Wait... */
2780 while ( (y = page->u.inuse.type_info) == x )
2781 cpu_relax();
2782 goto again;
2785 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
2787 if ( unlikely(!(nx & PGT_validated)) )
2789 /* Try to validate page type; drop the new reference on failure. */
2790 if ( unlikely(!alloc_page_type(page, type)) )
2792 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
2793 ": caf=%08x taf=%" PRtype_info,
2794 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2795 type, page->count_info, page->u.inuse.type_info);
2796 /* No one else can get a reference. We hold the only ref. */
2797 page->u.inuse.type_info = 0;
2798 return 0;
2801 /* No one else is updating simultaneously. */
2802 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2805 return 1;
2808 int memory_is_conventional_ram(paddr_t p)
2810 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
2814 long
2815 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2817 struct page_info *page = NULL;
2819 switch (op) {
2820 case XENMEM_add_to_physmap:
2822 struct xen_add_to_physmap xatp;
2823 unsigned long prev_mfn, mfn = 0, gpfn;
2824 struct domain *d;
2826 if (copy_from_guest(&xatp, arg, 1))
2827 return -EFAULT;
2829 if (xatp.domid == DOMID_SELF)
2830 d = rcu_lock_current_domain();
2831 else {
2832 if ((d = rcu_lock_domain_by_id(xatp.domid)) == NULL)
2833 return -ESRCH;
2834 if (!IS_PRIV_FOR(current->domain, d)) {
2835 rcu_unlock_domain(d);
2836 return -EPERM;
2840 /* This hypercall is used for VT-i domain only */
2841 if (!is_hvm_domain(d)) {
2842 rcu_unlock_domain(d);
2843 return -ENOSYS;
2846 switch (xatp.space) {
2847 case XENMAPSPACE_shared_info:
2848 if (xatp.idx == 0)
2849 mfn = virt_to_mfn(d->shared_info);
2850 break;
2851 case XENMAPSPACE_grant_table:
2852 spin_lock(&d->grant_table->lock);
2854 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
2855 (xatp.idx < max_nr_grant_frames))
2856 gnttab_grow_table(d, xatp.idx + 1);
2858 if (xatp.idx < nr_grant_frames(d->grant_table))
2859 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
2861 spin_unlock(&d->grant_table->lock);
2862 break;
2863 case XENMAPSPACE_mfn:
2865 if ( get_page_from_pagenr(xatp.idx, d) ) {
2866 mfn = xatp.idx;
2867 page = mfn_to_page(mfn);
2869 break;
2871 default:
2872 break;
2875 if (mfn == 0) {
2876 if ( page )
2877 put_page(page);
2878 rcu_unlock_domain(d);
2879 return -EINVAL;
2882 domain_lock(d);
2884 /* Check remapping necessity */
2885 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2886 if (mfn == prev_mfn)
2887 goto out;
2889 /* Remove previously mapped page if it was present. */
2890 if (prev_mfn && mfn_valid(prev_mfn)) {
2891 if (is_xen_heap_mfn(prev_mfn))
2892 /* Xen heap frames are simply unhooked from this phys slot. */
2893 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
2894 else
2895 /* Normal domain memory is freed, to avoid leaking memory. */
2896 guest_remove_page(d, xatp.gpfn);
2899 /* Unmap from old location, if any. */
2900 gpfn = get_gpfn_from_mfn(mfn);
2901 if (gpfn != INVALID_M2P_ENTRY)
2902 guest_physmap_remove_page(d, gpfn, mfn, 0);
2904 /* Map at new location. */
2905 /* Here page->count_info = PGC_allocated | N where N >= 1 */
2906 __guest_physmap_add_page(d, xatp.gpfn, mfn);
2907 page = NULL; /* prevent put_page() */
2909 out:
2910 domain_unlock(d);
2912 if ( page )
2913 put_page(page);
2915 rcu_unlock_domain(d);
2917 break;
2920 case XENMEM_remove_from_physmap:
2922 struct xen_remove_from_physmap xrfp;
2923 unsigned long mfn;
2924 struct domain *d;
2926 if ( copy_from_guest(&xrfp, arg, 1) )
2927 return -EFAULT;
2929 if ( xrfp.domid == DOMID_SELF )
2931 d = rcu_lock_current_domain();
2933 else
2935 if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
2936 return -ESRCH;
2937 if ( !IS_PRIV_FOR(current->domain, d) )
2939 rcu_unlock_domain(d);
2940 return -EPERM;
2944 domain_lock(d);
2946 mfn = gmfn_to_mfn(d, xrfp.gpfn);
2948 if ( mfn_valid(mfn) )
2949 guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
2951 domain_unlock(d);
2953 rcu_unlock_domain(d);
2955 break;
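    /* XENMEM_machine_memory_map: copy the boot-time EFI memory map,
     * prefixed with a xen_ia64_memmap_info header, into the privileged
     * caller's buffer. */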
2959 case XENMEM_machine_memory_map:
2961 struct xen_memory_map memmap;
2962 struct xen_ia64_memmap_info memmap_info;
2963 XEN_GUEST_HANDLE(char) buffer;
2965 if (!IS_PRIV(current->domain))
2966 return -EINVAL;
2967 if (copy_from_guest(&memmap, arg, 1))
2968 return -EFAULT;
2969 if (memmap.nr_entries <
2970 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size)
2971 return -EINVAL;
2973 memmap.nr_entries =
2974 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size;
2975 memset(&memmap_info, 0, sizeof(memmap_info));
2976 memmap_info.efi_memmap_size = ia64_boot_param->efi_memmap_size;
2977 memmap_info.efi_memdesc_size = ia64_boot_param->efi_memdesc_size;
2978 memmap_info.efi_memdesc_version = ia64_boot_param->efi_memdesc_version;
2980 buffer = guest_handle_cast(memmap.buffer, char);
2981 if (copy_to_guest(buffer, (char*)&memmap_info, sizeof(memmap_info)) ||
2982 copy_to_guest_offset(buffer, sizeof(memmap_info),
2983 (char*)__va(ia64_boot_param->efi_memmap),
2984 ia64_boot_param->efi_memmap_size) ||
2985 copy_to_guest(arg, &memmap, 1))
2986 return -EFAULT;
2987 return 0;
2990 default:
2991 return -ENOSYS;
2994 return 0;
2997 int is_iomem_page(unsigned long mfn)
2999 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
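/*
 * xencomm_mark_dirty(): when shadow (dirty-logging) mode is enabled,
 * mark every page in [addr, addr + len) as dirty; presumably called
 * after the hypervisor writes into guest memory through xencomm.
 */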
3002 void xencomm_mark_dirty(unsigned long addr, unsigned int len)
3004 struct domain *d = current->domain;
3005 unsigned long gpfn;
3006 unsigned long end_addr = addr + len;
3008 if (shadow_mode_enabled(d)) {
3009 for (addr &= PAGE_MASK; addr < end_addr; addr += PAGE_SIZE) {
3010 gpfn = get_gpfn_from_mfn(virt_to_mfn(addr));
3011 shadow_mark_page_dirty(d, gpfn);
3016 int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
3018 /* STUB to compile */
3019 return -ENOSYS;
3022 int iommu_unmap_page(struct domain *d, unsigned long gfn)
3024 /* STUB to compile */
3025 return -ENOSYS;
3028 /*
3029 * Local variables:
3030 * mode: C
3031 * c-set-style: "BSD"
3032 * c-basic-offset: 4
3033 * tab-width: 4
3034 * indent-tabs-mode: nil
3035 * End:
3036 */