ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 15892:b2a02f7ed849

[IA64] Make use of PAGE_MASK and PAGE_ALIGN()

As suggested by Isaku Yamahata

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author Alex Williamson <alex.williamson@hp.com>
date Mon Sep 17 13:38:25 2007 -0600 (2007-09-17)
parents fdd298b75fb5
children f3173d151e1d
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure is accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write the vcpu->arch.{d, i}tlb and VHPT table of a vcpu which isn't current.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert a domain pseudo physical address
54 * to a machine address. This is a per domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * replace_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * The P2M entries are shared by more than one vcpu.
101 * So they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. P2M(M2P(p)) = p and M2P(P2M(m)) = m
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here the memory barriers can be achieved by release semantics.
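*
* A minimal sketch of that sequence (illustrative only; variable names and
* the exact pte helpers are simplified, not the real code):
*
*     set_gpfn_from_mfn(new_mfn, gpfn);                // new m2p entry first
*     // release semantics (or an explicit barrier) orders the two updates
*     old_pte = ptep_xchg(mm, mpaddr, ptep, new_pte);  // atomic p2m update
*     old_mfn = pte_pfn(old_pte);
*     // barrier / release again, then retire the stale m2p entry
*     set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY);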
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. A tlb insert (vcpu_itc_no_srlz()) and a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't update
126 * vcpu->arch.{d, i}tlb, the VHPT and the mTLB atomically, so there is a race here.
127 *
128 * To handle this, the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge effectively always happens after the tlb insert.
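*
* A minimal sketch of that retry (illustrative only; the real logic lives
* in vcpu_itc_no_srlz() and the global purge paths):
*
*     again:
*         vcpu->arch.dtlb = new_entry;    // insert into the vTLB (and VHPT)
*         if (!vcpu->arch.dtlb.p)         // p bit cleared by a concurrent purge?
*             goto again;                 // redo the insert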
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * So it is possible that the p2m entry no longer points to the
144 * old page while the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value is different
148 * from the value it used, it retries.
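*
* Roughly, the reader side looks like this (illustrative sketch; helper
* names are simplified, not the exact API):
*
*     again:
*         old_pte = *p2m_entry_of(mpaddr);          // remember the value read
*         insert a tlb entry derived from old_pte;  // vcpu_itc_no_srlz()
*         if (pte_val(*p2m_entry_of(mpaddr)) != pte_val(old_pte))
*             goto again;                           // p2m changed, retry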
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented, as is done by
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time().
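*
* The access pattern is roughly (illustrative sketch; error handling and
* the actual p2m lookup are omitted):
*
*     page = mfn_to_page(mfn);
*     if (get_page(page, d)) {        // pin the page against de-assignment
*         ... read or write the domain page ...
*         put_page(page);             // drop the reference when done
*     }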
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <linux/sort.h>
179 #include <xen/guest_access.h>
180 #include <asm/page.h>
181 #include <asm/dom_fw_common.h>
182 #include <public/memory.h>
183 #include <asm/event.h>
185 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
186 volatile pte_t* ptep, pte_t old_pte,
187 struct page_info* page);
189 extern unsigned long ia64_iobase;
191 static struct domain *dom_xen, *dom_io;
193 /*
194 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
195 * If more reserved domain ids are introduced, this might be increased.
196 */
197 #define DOMID_P2M (0x7FF8U)
198 static struct domain *dom_p2m;
200 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
201 void
202 alloc_dom_xen_and_dom_io(void)
203 {
204 /*
205 * Initialise our DOMID_XEN domain.
206 * Any Xen-heap pages that we will allow to be mapped will have
207 * their domain field set to dom_xen.
208 */
209 dom_xen = alloc_domain(DOMID_XEN);
210 BUG_ON(dom_xen == NULL);
212 /*
213 * Initialise our DOMID_IO domain.
214 * This domain owns I/O pages that are within the range of the page_info
215 * array. Mappings occur at the priv of the caller.
216 */
217 dom_io = alloc_domain(DOMID_IO);
218 BUG_ON(dom_io == NULL);
219 }
221 static int
222 mm_teardown_can_skip(struct domain* d, unsigned long offset)
223 {
224 return d->arch.mm_teardown_offset > offset;
225 }
227 static void
228 mm_teardown_update_offset(struct domain* d, unsigned long offset)
229 {
230 d->arch.mm_teardown_offset = offset;
231 }
233 static void
234 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
235 {
236 pte_t old_pte;
237 unsigned long mfn;
238 struct page_info* page;
240 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
242 // vmx domains use bits [58:56] to distinguish io regions from memory.
243 // see vmx_build_physmap_table() in vmx_init.c
244 if (!pte_mem(old_pte))
245 return;
247 // domain might map IO space or acpi table pages. check it.
248 mfn = pte_pfn(old_pte);
249 if (!mfn_valid(mfn))
250 return;
251 page = mfn_to_page(mfn);
252 BUG_ON(page_get_owner(page) == NULL);
254 // The struct page_info corresponding to mfn may or may not exist depending
255 // on CONFIG_VIRTUAL_FRAME_TABLE.
256 // The above check is too simplistic.
257 // The right way is to check whether this page belongs to the io area or acpi pages.
259 if (pte_pgc_allocated(old_pte)) {
260 BUG_ON(page_get_owner(page) != d);
261 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
262 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
263 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
264 put_page(page);
265 } else {
266 put_page(page);
267 }
268 }
270 static int
271 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
272 {
273 unsigned long i;
274 volatile pte_t* pte = pte_offset_map(pmd, offset);
276 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
277 unsigned long cur_offset = offset + (i << PAGE_SHIFT);
278 if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
279 continue;
280 if (!pte_present(*pte)) { // acquire semantics
281 mm_teardown_update_offset(d, cur_offset);
282 continue;
283 }
284 mm_teardown_update_offset(d, cur_offset);
285 mm_teardown_pte(d, pte, cur_offset);
286 if (hypercall_preempt_check())
287 return -EAGAIN;
288 }
289 return 0;
290 }
292 static int
293 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
294 {
295 unsigned long i;
296 volatile pmd_t *pmd = pmd_offset(pud, offset);
298 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
299 unsigned long cur_offset = offset + (i << PMD_SHIFT);
300 if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
301 continue;
302 if (!pmd_present(*pmd)) { // acquire semantics
303 mm_teardown_update_offset(d, cur_offset);
304 continue;
305 }
306 if (mm_teardown_pmd(d, pmd, cur_offset))
307 return -EAGAIN;
308 }
309 return 0;
310 }
312 static int
313 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
314 {
315 unsigned long i;
316 volatile pud_t *pud = pud_offset(pgd, offset);
318 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
319 unsigned long cur_offset = offset + (i << PUD_SHIFT);
320 #ifndef __PAGETABLE_PUD_FOLDED
321 if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
322 continue;
323 #endif
324 if (!pud_present(*pud)) { // acquire semantics
325 #ifndef __PAGETABLE_PUD_FOLDED
326 mm_teardown_update_offset(d, cur_offset);
327 #endif
328 continue;
329 }
330 if (mm_teardown_pud(d, pud, cur_offset))
331 return -EAGAIN;
332 }
333 return 0;
334 }
336 int
337 mm_teardown(struct domain* d)
338 {
339 struct mm_struct* mm = &d->arch.mm;
340 unsigned long i;
341 volatile pgd_t* pgd;
343 if (mm->pgd == NULL)
344 return 0;
346 pgd = pgd_offset(mm, 0);
347 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
348 unsigned long cur_offset = i << PGDIR_SHIFT;
350 if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
351 continue;
352 if (!pgd_present(*pgd)) { // acquire semantics
353 mm_teardown_update_offset(d, cur_offset);
354 continue;
355 }
356 if (mm_teardown_pgd(d, pgd, cur_offset))
357 return -EAGAIN;
358 }
360 foreign_p2m_destroy(d);
361 return 0;
362 }
364 static void
365 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
366 unsigned long offset)
367 {
368 pte_free_kernel(pte_offset_map(pmd, offset));
369 }
371 static void
372 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
373 unsigned long offset)
374 {
375 unsigned long i;
376 volatile pmd_t *pmd = pmd_offset(pud, offset);
378 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
379 if (!pmd_present(*pmd))
380 continue;
381 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
382 }
383 pmd_free(pmd_offset(pud, offset));
384 }
386 static void
387 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
388 unsigned long offset)
389 {
390 unsigned long i;
391 volatile pud_t *pud = pud_offset(pgd, offset);
393 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
394 if (!pud_present(*pud))
395 continue;
396 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
397 }
398 pud_free(pud_offset(pgd, offset));
399 }
401 static void
402 mm_p2m_teardown(struct domain* d)
403 {
404 struct mm_struct* mm = &d->arch.mm;
405 unsigned long i;
406 volatile pgd_t* pgd;
408 BUG_ON(mm->pgd == NULL);
409 pgd = pgd_offset(mm, 0);
410 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
411 if (!pgd_present(*pgd))
412 continue;
413 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
414 }
415 pgd_free(mm->pgd);
416 mm->pgd = NULL;
417 }
419 void
420 mm_final_teardown(struct domain* d)
421 {
422 if (d->arch.shadow_bitmap != NULL) {
423 xfree(d->arch.shadow_bitmap);
424 d->arch.shadow_bitmap = NULL;
425 }
426 mm_p2m_teardown(d);
427 }
429 unsigned long
430 domain_get_maximum_gpfn(struct domain *d)
431 {
432 return (d->arch.convmem_end + PAGE_SIZE - 1) >> PAGE_SHIFT;
433 }
435 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
436 void
437 share_xen_page_with_guest(struct page_info *page,
438 struct domain *d, int readonly)
439 {
440 if ( page_get_owner(page) == d )
441 return;
443 #if 1
444 if (readonly) {
445 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
446 }
447 #endif
449 // alloc_xenheap_pages() doesn't initialize page owner.
450 //BUG_ON(page_get_owner(page) != NULL);
452 spin_lock(&d->page_alloc_lock);
454 #ifndef __ia64__
455 /* The incremented type count pins as writable or read-only. */
456 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
457 page->u.inuse.type_info |= PGT_validated | 1;
458 #endif
460 page_set_owner(page, d);
461 wmb(); /* install valid domain ptr before updating refcnt. */
462 ASSERT(page->count_info == 0);
464 /* Only add to the allocation list if the domain isn't dying. */
465 if ( !d->is_dying )
466 {
467 page->count_info |= PGC_allocated | 1;
468 if ( unlikely(d->xenheap_pages++ == 0) )
469 get_knownalive_domain(d);
470 list_add_tail(&page->list, &d->xenpage_list);
471 }
473 // grant_table_destroy() releases these pages,
474 // but it doesn't clear their m2p entries, so stale
475 // entries might remain. Such a stale entry is cleared here.
476 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
478 spin_unlock(&d->page_alloc_lock);
479 }
481 void
482 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
483 {
484 share_xen_page_with_guest(page, dom_xen, readonly);
485 }
487 unsigned long
488 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
489 {
490 unsigned long pte;
492 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
493 if (!pte) {
494 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
495 }
496 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
497 }
499 // given a domain virtual address, pte and pagesize, extract the metaphysical
500 // address, convert the pte for a physical address for (possibly different)
501 // Xen PAGE_SIZE and return modified pte. (NOTE: TLB insert should use
502 // current->arch.vhpt_pg_shift!)
503 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* itir,
504 struct p2m_entry* entry)
505 {
506 struct domain *d = current->domain;
507 ia64_itir_t _itir = {.itir = itir__};
508 u64 mask, mpaddr, pteval2;
509 u64 arflags;
510 u64 arflags2;
511 u64 maflags2;
513 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
515 // FIXME address had better be pre-validated on insert
516 mask = ~itir_mask(_itir.itir);
517 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
519 if (_itir.ps > PAGE_SHIFT)
520 _itir.ps = PAGE_SHIFT;
522 ((ia64_itir_t*)itir)->itir = _itir.itir;/* Copy the whole register. */
523 ((ia64_itir_t*)itir)->ps = _itir.ps; /* Overwrite ps part! */
525 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
526 if (_itir.ps < PAGE_SHIFT)
527 pteval2 |= mpaddr & ~PAGE_MASK & ~((1L << _itir.ps) - 1);
529 /* Check access rights. */
530 arflags = pteval & _PAGE_AR_MASK;
531 arflags2 = pteval2 & _PAGE_AR_MASK;
532 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
533 #if 0
534 dprintk(XENLOG_WARNING,
535 "%s:%d "
536 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
537 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
538 __func__, __LINE__,
539 pteval, arflags, address, itir__,
540 pteval2, arflags2, mpaddr);
541 #endif
542 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
543 }
545 /* Check memory attribute. The switch is on the *requested* memory
546 attribute. */
547 maflags2 = pteval2 & _PAGE_MA_MASK;
548 switch (pteval & _PAGE_MA_MASK) {
549 case _PAGE_MA_NAT:
550 /* NaT pages are always accepted! */
551 break;
552 case _PAGE_MA_UC:
553 case _PAGE_MA_UCE:
554 case _PAGE_MA_WC:
555 if (maflags2 == _PAGE_MA_WB) {
556 /* Don't let domains WB-map uncached addresses.
557 This can happen when domU tries to touch i/o
558 port space. Also prevents possible address
559 aliasing issues. */
560 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) {
561 u64 ucwb;
563 /*
564 * If dom0 page has both UC & WB attributes
565 * don't warn about attempted UC access.
566 */
567 ucwb = efi_mem_attribute(mpaddr, PAGE_SIZE);
568 ucwb &= EFI_MEMORY_UC | EFI_MEMORY_WB;
569 ucwb ^= EFI_MEMORY_UC | EFI_MEMORY_WB;
571 if (d != dom0 || ucwb != 0)
572 gdprintk(XENLOG_WARNING, "Warning: UC"
573 " to WB for mpaddr=%lx\n",
574 mpaddr);
575 }
576 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
577 }
578 break;
579 case _PAGE_MA_WB:
580 if (maflags2 != _PAGE_MA_WB) {
581 /* Forbid non-coherent access to coherent memory. */
582 panic_domain(NULL, "try to use WB mem attr on "
583 "UC page, mpaddr=%lx\n", mpaddr);
584 }
585 break;
586 default:
587 panic_domain(NULL, "try to use unknown mem attribute\n");
588 }
590 /* If shadow mode is enabled, virtualize dirty bit. */
591 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
592 u64 mp_page = mpaddr >> PAGE_SHIFT;
593 pteval |= _PAGE_VIRT_D;
595 /* If the page is not already dirty, don't set the dirty bit! */
596 if (mp_page < d->arch.shadow_bitmap_size * 8
597 && !test_bit(mp_page, d->arch.shadow_bitmap))
598 pteval &= ~_PAGE_D;
599 }
601 /* Ignore non-addr bits of pteval2 and force PL0->1
602 (PL3 is unaffected) */
603 return (pteval & ~(_PAGE_PPN_MASK | _PAGE_PL_MASK)) |
604 (pteval2 & _PAGE_PPN_MASK) |
605 (vcpu_pl_adjust(pteval, 7) & _PAGE_PL_MASK);
606 }
608 // given a current domain metaphysical address, return the physical address
609 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
610 struct p2m_entry* entry)
611 {
612 unsigned long pteval;
614 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
615 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
616 }
618 //XXX !xxx_present() should be used instead of !xxx_none()?
619 // pud, pmd and pte pages are zero-cleared when they are allocated.
620 // Their contents must be visible before population, so the
621 // cmpxchg must have release semantics.
622 static volatile pte_t*
623 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
624 {
625 struct mm_struct *mm = &d->arch.mm;
626 volatile pgd_t *pgd;
627 volatile pud_t *pud;
628 volatile pmd_t *pmd;
630 BUG_ON(mm->pgd == NULL);
632 pgd = pgd_offset(mm, mpaddr);
633 again_pgd:
634 if (unlikely(pgd_none(*pgd))) { // acquire semantics
635 pud_t *old_pud = NULL;
636 pud = pud_alloc_one(mm, mpaddr);
637 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
638 pud_free(pud);
639 goto again_pgd;
640 }
641 }
643 pud = pud_offset(pgd, mpaddr);
644 again_pud:
645 if (unlikely(pud_none(*pud))) { // acquire semantics
646 pmd_t* old_pmd = NULL;
647 pmd = pmd_alloc_one(mm, mpaddr);
648 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
649 pmd_free(pmd);
650 goto again_pud;
651 }
652 }
654 pmd = pmd_offset(pud, mpaddr);
655 again_pmd:
656 if (unlikely(pmd_none(*pmd))) { // acquire semantics
657 pte_t* old_pte = NULL;
658 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
659 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
660 pte_free_kernel(pte);
661 goto again_pmd;
662 }
663 }
665 return pte_offset_map(pmd, mpaddr);
666 }
668 //XXX xxx_none() should be used instead of !xxx_present()?
669 volatile pte_t*
670 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
671 {
672 struct mm_struct *mm = &d->arch.mm;
673 volatile pgd_t *pgd;
674 volatile pud_t *pud;
675 volatile pmd_t *pmd;
677 BUG_ON(mm->pgd == NULL);
678 pgd = pgd_offset(mm, mpaddr);
679 if (unlikely(!pgd_present(*pgd))) // acquire semantics
680 return NULL;
682 pud = pud_offset(pgd, mpaddr);
683 if (unlikely(!pud_present(*pud))) // acquire semantics
684 return NULL;
686 pmd = pmd_offset(pud, mpaddr);
687 if (unlikely(!pmd_present(*pmd))) // acquire semantics
688 return NULL;
690 return pte_offset_map(pmd, mpaddr);
691 }
693 static volatile pte_t*
694 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
695 {
696 struct mm_struct *mm = &d->arch.mm;
697 volatile pgd_t *pgd;
698 volatile pud_t *pud;
699 volatile pmd_t *pmd;
701 BUG_ON(mm->pgd == NULL);
702 pgd = pgd_offset(mm, mpaddr);
703 if (unlikely(pgd_none(*pgd))) // acquire semantics
704 return NULL;
706 pud = pud_offset(pgd, mpaddr);
707 if (unlikely(pud_none(*pud))) // acquire semantics
708 return NULL;
710 pmd = pmd_offset(pud, mpaddr);
711 if (unlikely(pmd_none(*pmd))) // acquire semantics
712 return NULL;
714 return pte_offset_map(pmd, mpaddr);
715 }
717 unsigned long
718 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
719 {
720 volatile pte_t *pte;
722 pte = lookup_noalloc_domain_pte(d, mpaddr);
723 if (pte == NULL)
724 return INVALID_MFN;
726 if (pte_present(*pte))
727 return (pte->pte & _PFN_MASK);
728 else if (VMX_DOMAIN(d->vcpu[0]))
729 return GPFN_INV_MASK;
730 return INVALID_MFN;
731 }
733 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
734 struct p2m_entry* entry)
735 {
736 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
738 if (pte != NULL) {
739 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
740 if (pte_present(tmp_pte)) {
741 if (entry != NULL)
742 p2m_entry_set(entry, pte, tmp_pte);
743 return pte_val(tmp_pte);
744 } else if (VMX_DOMAIN(d->vcpu[0]))
745 return GPFN_INV_MASK;
746 }
748 if (mpaddr < d->arch.convmem_end && !d->is_dying) {
749 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
750 "d %"PRId16" 0x%lx (< 0x%lx)\n",
751 current->vcpu_id, PSCB(current, iip),
752 d->domain_id, mpaddr, d->arch.convmem_end);
753 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
754 /* Log I/O port probing, but complain less loudly about it */
755 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
756 "d %"PRId16" 0x%lx\n",
757 current->vcpu_id, PSCB(current, iip), d->domain_id,
758 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
759 } else {
760 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
761 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
762 current->vcpu_id, PSCB(current, iip),
763 d->domain_id, mpaddr, d->arch.convmem_end);
764 }
766 if (entry != NULL)
767 p2m_entry_set(entry, NULL, __pte(0));
768 //XXX This is a work around until emulation of memory accesses to a region
769 // where memory or a device is attached is implemented.
770 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_PRIV |
771 _PAGE_AR_RWX)));
772 }
774 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
775 #if 1
776 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
777 {
778 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
779 unsigned long imva;
781 pte &= _PAGE_PPN_MASK;
782 imva = (unsigned long) __va(pte);
783 imva |= mpaddr & ~PAGE_MASK;
784 return (void*)imva;
785 }
786 #else
787 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
788 {
789 unsigned long imva = __gpa_to_mpa(d, mpaddr);
791 return (void *)__va(imva);
792 }
793 #endif
795 unsigned long
796 paddr_to_maddr(unsigned long paddr)
797 {
798 struct vcpu *v = current;
799 struct domain *d = v->domain;
800 u64 pa;
802 pa = ____lookup_domain_mpa(d, paddr);
803 if (pa == INVALID_MFN) {
804 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
805 __func__, paddr, vcpu_regs(v)->cr_iip);
806 return 0;
807 }
808 return (pa & _PFN_MASK) | (paddr & ~PAGE_MASK);
809 }
811 /* Allocate a new page for domain and map it to the specified metaphysical
812 address. */
813 static struct page_info *
814 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
815 volatile pte_t* pte)
816 {
817 struct page_info *p;
818 unsigned long maddr;
820 BUG_ON(!pte_none(*pte));
822 p = alloc_domheap_page(d);
823 if (unlikely(!p)) {
824 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
825 return(p);
826 }
828 // zero out pages for security reasons
829 clear_page(page_to_virt(p));
830 maddr = page_to_maddr (p);
831 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
832 && maddr < __get_cpu_var(vhpt_pend))) {
833 /* FIXME: how can this happen ?
834 vhpt is allocated by alloc_domheap_page. */
835 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
836 maddr);
837 }
839 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
840 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
841 // because set_pte_rel() has release semantics
842 set_pte_rel(pte,
843 pfn_pte(maddr >> PAGE_SHIFT,
844 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
845 _PAGE_PL_PRIV | _PAGE_AR_RWX)));
847 smp_mb();
848 return p;
849 }
851 struct page_info *
852 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
853 {
854 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
856 if (!pte_none(*pte))
857 return NULL;
859 return __assign_new_domain_page(d, mpaddr, pte);
860 }
862 void __init
863 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
864 {
865 volatile pte_t *pte;
867 BUG_ON(d != dom0);
868 pte = lookup_alloc_domain_pte(d, mpaddr);
869 if (pte_none(*pte)) {
870 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
871 if (p == NULL) {
872 panic("%s: can't allocate page for dom0\n", __func__);
873 }
874 }
875 }
877 static unsigned long
878 flags_to_prot (unsigned long flags)
879 {
880 unsigned long res = _PAGE_PL_PRIV | __DIRTY_BITS;
882 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
883 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
884 #ifdef CONFIG_XEN_IA64_TLB_TRACK
885 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
886 #endif
887 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
889 return res;
890 }
892 /* map a physical address to the specified metaphysical addr */
893 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
894 // This is called by assign_domain_mmio_page(),
895 // so access to the pte is racy.
896 int
897 __assign_domain_page(struct domain *d,
898 unsigned long mpaddr, unsigned long physaddr,
899 unsigned long flags)
900 {
901 volatile pte_t *pte;
902 pte_t old_pte;
903 pte_t new_pte;
904 pte_t ret_pte;
905 unsigned long prot = flags_to_prot(flags);
907 pte = lookup_alloc_domain_pte(d, mpaddr);
909 old_pte = __pte(0);
910 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
911 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
912 if (pte_val(ret_pte) == pte_val(old_pte)) {
913 smp_mb();
914 return 0;
915 }
917 // dom0 tried to map the real machine's I/O region, but failed.
918 // It is very likely that dom0 won't boot correctly because
919 // it can't access I/O. So complain here.
920 if (flags & ASSIGN_nocache) {
921 int warn = 0;
923 if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT))
924 warn = 1;
925 else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) {
926 u32 type;
927 u64 attr;
929 warn = 1;
931 /*
932 * See
933 * complete_dom0_memmap()
934 * case EFI_RUNTIME_SERVICES_CODE:
935 * case EFI_RUNTIME_SERVICES_DATA:
936 * case EFI_ACPI_RECLAIM_MEMORY:
937 * case EFI_ACPI_MEMORY_NVS:
938 * case EFI_RESERVED_TYPE:
939 *
940 * Currently only EFI_RUNTIME_SERVICES_CODE is found
941 * so that we suppress only EFI_RUNTIME_SERVICES_CODE case.
942 */
943 type = efi_mem_type(physaddr);
944 attr = efi_mem_attributes(physaddr);
945 if (type == EFI_RUNTIME_SERVICES_CODE &&
946 (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB))
947 warn = 0;
948 }
949 if (warn)
950 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
951 "\talready assigned pte_val 0x%016lx\n"
952 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
953 __func__, __LINE__,
954 d, d->domain_id, pte_val(ret_pte),
955 mpaddr, physaddr, flags);
956 }
958 return -EAGAIN;
959 }
961 /* get_page() and map a physical address to the specified metaphysical addr */
962 void
963 assign_domain_page(struct domain *d,
964 unsigned long mpaddr, unsigned long physaddr)
965 {
966 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
968 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
969 BUG_ON(page->count_info != (PGC_allocated | 1));
970 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
971 // because __assign_domain_page() uses set_pte_rel() which has
972 // release semantics, smp_mb() isn't needed.
973 (void)__assign_domain_page(d, mpaddr, physaddr,
974 ASSIGN_writable | ASSIGN_pgc_allocated);
975 }
977 int
978 ioports_permit_access(struct domain *d, unsigned int fp, unsigned int lp)
979 {
980 struct io_space *space;
981 unsigned long mmio_start, mmio_end, mach_start;
982 int ret;
984 if (IO_SPACE_NR(fp) >= num_io_spaces) {
985 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
986 return -EFAULT;
987 }
989 /*
990 * The ioport_cap rangeset tracks the I/O port address including
991 * the port space ID. This means port space IDs need to match
992 * between Xen and dom0. This is also a requirement because
993 * the hypercall to pass these port ranges only uses a u32.
994 *
995 * NB - non-dom0 driver domains may only have a subset of the
996 * I/O port spaces and thus will number port spaces differently.
997 * This is ok, they don't make use of this interface.
998 */
999 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
1000 if (ret != 0)
1001 return ret;
1003 space = &io_space[IO_SPACE_NR(fp)];
1005 /* Legacy I/O on dom0 is already setup */
1006 if (d == dom0 && space == &io_space[0])
1007 return 0;
1009 fp = IO_SPACE_PORT(fp);
1010 lp = IO_SPACE_PORT(lp);
1012 if (space->sparse) {
1013 mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
1014 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
1015 } else {
1016 mmio_start = fp & ~PAGE_MASK;
1017 mmio_end = PAGE_ALIGN(lp);
1020 /*
1021 * The "machine first port" is not necessarily identity mapped
1022 * to the guest first port. At least for the legacy range.
1023 */
1024 mach_start = mmio_start | __pa(space->mmio_base);
1026 if (space == &io_space[0]) {
1027 mmio_start |= IO_PORTS_PADDR;
1028 mmio_end |= IO_PORTS_PADDR;
1029 } else {
1030 mmio_start |= __pa(space->mmio_base);
1031 mmio_end |= __pa(space->mmio_base);
1034 while (mmio_start <= mmio_end) {
1035 (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache);
1036 mmio_start += PAGE_SIZE;
1037 mach_start += PAGE_SIZE;
1040 return 0;
1043 static int
1044 ioports_has_allowed(struct domain *d, unsigned int fp, unsigned int lp)
1046 for (; fp < lp; fp++)
1047 if (rangeset_contains_singleton(d->arch.ioport_caps, fp))
1048 return 1;
1050 return 0;
1053 int
1054 ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
1056 int ret;
1057 struct mm_struct *mm = &d->arch.mm;
1058 unsigned long mmio_start, mmio_end, mmio_base;
1059 unsigned int fp_base, lp_base;
1060 struct io_space *space;
1062 if (IO_SPACE_NR(fp) >= num_io_spaces) {
1063 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
1064 return -EFAULT;
1067 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
1068 if (ret != 0)
1069 return ret;
1071 space = &io_space[IO_SPACE_NR(fp)];
1072 fp_base = IO_SPACE_PORT(fp);
1073 lp_base = IO_SPACE_PORT(lp);
1075 if (space->sparse) {
1076 mmio_start = IO_SPACE_SPARSE_ENCODING(fp_base) & ~PAGE_MASK;
1077 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp_base));
1078 } else {
1079 mmio_start = fp_base & ~PAGE_MASK;
1080 mmio_end = PAGE_ALIGN(lp_base);
1083 if (space == &io_space[0] && d != dom0)
1084 mmio_base = IO_PORTS_PADDR;
1085 else
1086 mmio_base = __pa(space->mmio_base);
1088 for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
1089 unsigned int port, range;
1090 unsigned long mpaddr;
1091 volatile pte_t *pte;
1092 pte_t old_pte;
1094 if (space->sparse) {
1095 port = IO_SPACE_SPARSE_DECODING(mmio_start);
1096 range = IO_SPACE_SPARSE_PORTS_PER_PAGE - 1;
1097 } else {
1098 port = mmio_start;
1099 range = PAGE_SIZE - 1;
1102 port |= IO_SPACE_BASE(IO_SPACE_NR(fp));
1104 if (port < fp || port + range > lp) {
1105 /* Maybe this covers an allowed port. */
1106 if (ioports_has_allowed(d, port, port + range))
1107 continue;
1110 mpaddr = mmio_start | mmio_base;
1111 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1112 BUG_ON(pte == NULL);
1113 BUG_ON(pte_none(*pte));
1115 /* clear pte */
1116 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1118 domain_flush_vtlb_all(d);
1119 return 0;
1122 static void
1123 assign_domain_same_page(struct domain *d,
1124 unsigned long mpaddr, unsigned long size,
1125 unsigned long flags)
1127 //XXX optimization
1128 unsigned long end = PAGE_ALIGN(mpaddr + size);
1129 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
1130 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
1134 int
1135 efi_mmio(unsigned long physaddr, unsigned long size)
1137 void *efi_map_start, *efi_map_end;
1138 u64 efi_desc_size;
1139 void* p;
1141 efi_map_start = __va(ia64_boot_param->efi_memmap);
1142 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1143 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1145 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1146 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1147 unsigned long start = md->phys_addr;
1148 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1150 if (start <= physaddr && physaddr < end) {
1151 if ((physaddr + size) > end) {
1152 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1153 __func__, physaddr, size);
1154 return 0;
1157 // for io space
1158 if (md->type == EFI_MEMORY_MAPPED_IO ||
1159 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1160 return 1;
1163 // for runtime
1164 // see efi_enter_virtual_mode(void)
1165 // in linux/arch/ia64/kernel/efi.c
1166 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1167 !(md->attribute & EFI_MEMORY_WB)) {
1168 return 1;
1171 return 0;
1174 if (physaddr < start) {
1175 break;
1179 return 1;
1182 unsigned long
1183 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1184 unsigned long phys_addr, unsigned long size,
1185 unsigned long flags)
1187 unsigned long addr = mpaddr & PAGE_MASK;
1188 unsigned long end = PAGE_ALIGN(mpaddr + size);
1190 if (size == 0) {
1191 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1192 __func__, d, mpaddr, size);
1194 if (!efi_mmio(mpaddr, size)) {
1195 #ifndef NDEBUG
1196 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1197 __func__, d, mpaddr, size);
1198 #endif
1199 return -EINVAL;
1202 for (phys_addr &= PAGE_MASK; addr < end;
1203 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1204 __assign_domain_page(d, addr, phys_addr, flags);
1207 return mpaddr;
1210 unsigned long
1211 assign_domain_mach_page(struct domain *d,
1212 unsigned long mpaddr, unsigned long size,
1213 unsigned long flags)
1215 BUG_ON(flags & ASSIGN_pgc_allocated);
1216 assign_domain_same_page(d, mpaddr, size, flags);
1217 return mpaddr;
1220 static void
1221 adjust_page_count_info(struct page_info* page)
1223 struct domain* d = page_get_owner(page);
1224 BUG_ON((page->count_info & PGC_count_mask) != 1);
1225 if (d != NULL) {
1226 int ret = get_page(page, d);
1227 BUG_ON(ret == 0);
1228 } else {
1229 u64 x, nx, y;
1231 y = *((u64*)&page->count_info);
1232 do {
1233 x = y;
1234 nx = x + 1;
1236 BUG_ON((x >> 32) != 0);
1237 BUG_ON((nx & PGC_count_mask) != 2);
1238 y = cmpxchg((u64*)&page->count_info, x, nx);
1239 } while (unlikely(y != x));
1243 static void
1244 domain_put_page(struct domain* d, unsigned long mpaddr,
1245 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1247 unsigned long mfn = pte_pfn(old_pte);
1248 struct page_info* page = mfn_to_page(mfn);
1250 if (pte_pgc_allocated(old_pte)) {
1251 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1252 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1253 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1254 } else {
1255 BUG();
1258 if (likely(clear_PGC_allocate)) {
1259 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1260 BUG();
1261 /* put_page() is done by domain_page_flush_and_put() */
1262 } else {
1263 // In this case, the page reference count mustn't be touched.
1264 // domain_page_flush_and_put() decrements it, so we increment
1265 // it in advance. This path is a slow path.
1266 //
1267 // guest_remove_page(): owner = d, count_info = 1
1268 // memory_exchange(): owner = NULL, count_info = 1
1269 adjust_page_count_info(page);
1272 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1275 // caller must get_page(mfn_to_page(mfn)) before calling.
1276 // caller must call set_gpfn_from_mfn() beforehand if necessary;
1277 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1278 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1279 // flags: ASSIGN_xxx
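// A sketch of the expected caller sequence (illustrative only; see
// __dom0vp_add_physmap() below for a real caller):
//     page = mfn_to_page(mfn);
//     if (!get_page(page, owner))         // take a reference first
//         fail;
//     set_gpfn_from_mfn(mfn, gpfn);       // m2p update, if needed
//     smp_mb();                           // make it visible before the pte xchg
//     assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);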
1280 static void
1281 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1282 unsigned long mfn, unsigned long flags)
1284 struct mm_struct *mm = &d->arch.mm;
1285 volatile pte_t* pte;
1286 pte_t old_pte;
1287 pte_t npte;
1288 unsigned long prot = flags_to_prot(flags);
1290 pte = lookup_alloc_domain_pte(d, mpaddr);
1292 // update pte
1293 npte = pfn_pte(mfn, __pgprot(prot));
1294 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1295 if (pte_mem(old_pte)) {
1296 unsigned long old_mfn = pte_pfn(old_pte);
1298 // The mfn == old_mfn case can happen when a domain maps a granted page
1299 // twice with the same pseudo physical address.
1300 // It makes no sense, but it is allowed.
1301 // __gnttab_map_grant_ref()
1302 // => create_host_mapping()
1303 // => assign_domain_page_replace()
1304 if (mfn != old_mfn) {
1305 domain_put_page(d, mpaddr, pte, old_pte, 1);
1308 perfc_incr(assign_domain_page_replace);
1311 // caller must get_page(new_page) beforehand.
1312 // Only steal_page() calls this function.
1313 static int
1314 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1315 struct page_info* old_page,
1316 struct page_info* new_page,
1317 unsigned long flags, int clear_PGC_allocate)
1319 struct mm_struct *mm = &d->arch.mm;
1320 volatile pte_t* pte;
1321 unsigned long old_mfn;
1322 unsigned long old_prot;
1323 pte_t old_pte;
1324 unsigned long new_mfn;
1325 unsigned long new_prot;
1326 pte_t new_pte;
1327 pte_t ret_pte;
1329 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1330 pte = lookup_alloc_domain_pte(d, mpaddr);
1332 again:
1333 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1334 old_mfn = page_to_mfn(old_page);
1335 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1336 if (!pte_present(old_pte)) {
1337 gdprintk(XENLOG_INFO,
1338 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1339 __func__, pte_val(old_pte), old_prot, old_mfn);
1340 return -EINVAL;
1343 new_prot = flags_to_prot(flags);
1344 new_mfn = page_to_mfn(new_page);
1345 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1347 // update pte
1348 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1349 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1350 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1351 goto again;
1354 gdprintk(XENLOG_INFO,
1355 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1356 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1357 __func__,
1358 pte_val(old_pte), old_prot, old_mfn,
1359 pte_val(ret_pte), pte_pfn(ret_pte));
1360 return -EINVAL;
1363 BUG_ON(!pte_mem(old_pte));
1364 BUG_ON(!pte_pgc_allocated(old_pte));
1365 BUG_ON(page_get_owner(old_page) != d);
1366 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1367 BUG_ON(old_mfn == new_mfn);
1369 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1370 if (likely(clear_PGC_allocate)) {
1371 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1372 BUG();
1373 } else {
1374 int ret;
1375 // adjust count_info for domain_page_flush_and_put().
1376 // This is a slow path.
1377 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1378 BUG_ON(d == NULL);
1379 ret = get_page(old_page, d);
1380 BUG_ON(ret == 0);
1383 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1384 perfc_incr(assign_domain_pge_cmpxchg_rel);
1385 return 0;
1388 static void
1389 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1390 int clear_PGC_allocate, unsigned long mfn)
1392 struct mm_struct *mm = &d->arch.mm;
1393 volatile pte_t *pte;
1394 pte_t old_pte;
1395 struct page_info *page;
1397 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1398 if (pte == NULL)
1399 return;
1400 if (pte_none(*pte))
1401 return;
1403 if (mfn == INVALID_MFN) {
1404 // clear pte
1405 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1406 mfn = pte_pfn(old_pte);
1407 } else {
1408 unsigned long old_arflags;
1409 pte_t new_pte;
1410 pte_t ret_pte;
1412 again:
1413 // memory_exchange() calls guest_physmap_remove_page() with
1414 // a stolen page, i.e. page owner == NULL.
1415 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1416 page_get_owner(mfn_to_page(mfn)) != NULL);
1417 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1418 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1419 new_pte = __pte(0);
1421 // update pte
1422 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1423 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1424 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1425 goto again;
1428 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1429 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1430 __func__,
1431 pte_val(old_pte), old_arflags, mfn,
1432 pte_val(ret_pte), pte_pfn(ret_pte));
1433 return;
1435 BUG_ON(mfn != pte_pfn(ret_pte));
1438 page = mfn_to_page(mfn);
1439 BUG_ON((page->count_info & PGC_count_mask) == 0);
1441 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1442 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1443 perfc_incr(zap_dcomain_page_one);
1446 unsigned long
1447 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1448 unsigned int extent_order)
1450 if (extent_order != 0) {
1451 //XXX
1452 return -ENOSYS;
1455 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1456 perfc_incr(dom0vp_zap_physmap);
1457 return 0;
1460 static unsigned long
1461 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1462 unsigned long mfn_or_gmfn,
1463 unsigned long flags, domid_t domid, int is_gmfn)
1465 int error = -EINVAL;
1466 struct domain* rd;
1467 unsigned long mfn;
1469 /* Not allowed by a domain. */
1470 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1471 return -EINVAL;
1473 rd = get_domain_by_id(domid);
1474 if (unlikely(rd == NULL)) {
1475 switch (domid) {
1476 case DOMID_XEN:
1477 rd = dom_xen;
1478 break;
1479 case DOMID_IO:
1480 rd = dom_io;
1481 break;
1482 default:
1483 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1484 "gpfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1485 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1486 return -ESRCH;
1488 BUG_ON(rd == NULL);
1489 get_knownalive_domain(rd);
1492 if (unlikely(rd == d))
1493 goto out1;
1494 /*
1495 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1496 * It can be considered that their p2m conversion is p==m.
1497 */
1498 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1499 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1500 else
1501 mfn = mfn_or_gmfn;
1502 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1503 goto out1;
1505 error = 0;
1506 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1507 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1508 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1509 //don't update p2m table because this page belongs to rd, not d.
1510 perfc_incr(dom0vp_add_physmap);
1511 out1:
1512 put_domain(rd);
1513 return error;
1516 unsigned long
1517 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1518 unsigned long flags, domid_t domid)
1520 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1523 unsigned long
1524 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1525 unsigned long gmfn, unsigned long flags,
1526 domid_t domid)
1528 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1531 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1532 #define P2M_PFN_ROUNDUP(x) (((x) + PTRS_PER_PTE - 1) & \
1533 ~(PTRS_PER_PTE - 1))
1534 #define P2M_PFN_ROUNDDOWN(x) ((x) & ~(PTRS_PER_PTE - 1))
1535 #define P2M_NUM_PFN(x) (((x) + PTRS_PER_PTE - 1) / PTRS_PER_PTE)
1536 #define MD_END(md) ((md)->phys_addr + \
1537 ((md)->num_pages << EFI_PAGE_SHIFT))
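// For illustration only (hypothetically assuming PTRS_PER_PTE == 1024):
//   P2M_PFN_ROUNDUP(1500)   == 2048
//   P2M_PFN_ROUNDDOWN(1500) == 1024
//   P2M_NUM_PFN(1500)       == 2   (pte pages needed to cover 1500 pfns)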
1538 static struct page_info* p2m_pte_zero_page = NULL;
1540 /* This must be called before dom0 p2m table allocation */
1541 void __init
1542 expose_p2m_init(void)
1544 pte_t* pte;
1546 /*
1547 * Initialise our DOMID_P2M domain.
1548 * This domain owns m2p table pages.
1549 */
1550 dom_p2m = alloc_domain(DOMID_P2M);
1551 BUG_ON(dom_p2m == NULL);
1552 dom_p2m->max_pages = ~0U;
1554 pte = pte_alloc_one_kernel(NULL, 0);
1555 BUG_ON(pte == NULL);
1556 smp_mb();// make contents of the page visible.
1557 p2m_pte_zero_page = virt_to_page(pte);
1560 // allocate pgd, pmd of dest_dom if necessary
1561 static int
1562 allocate_pgd_pmd(struct domain* dest_dom, unsigned long dest_gpfn,
1563 struct domain* src_dom,
1564 unsigned long src_gpfn, unsigned long num_src_gpfn)
1566 unsigned long i = 0;
1568 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1569 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1571 while (i < num_src_gpfn) {
1572 volatile pte_t* src_pte;
1573 volatile pte_t* dest_pte;
1575 src_pte = lookup_noalloc_domain_pte(src_dom,
1576 (src_gpfn + i) << PAGE_SHIFT);
1577 if (src_pte == NULL) {
1578 i++;
1579 continue;
1582 dest_pte = lookup_alloc_domain_pte(dest_dom,
1583 (dest_gpfn << PAGE_SHIFT) +
1584 i * sizeof(pte_t));
1585 if (dest_pte == NULL) {
1586 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n",
1587 __func__);
1588 return -ENOMEM;
1591 // skip to next pte page
1592 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1594 return 0;
1597 static int
1598 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1600 int ret = get_page(page, dom_p2m);
1601 BUG_ON(ret != 1);
1602 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1603 ASSIGN_readonly);
1606 // expose pte page
1607 static int
1608 expose_p2m_range(struct domain* dest_dom, unsigned long dest_gpfn,
1609 struct domain* src_dom,
1610 unsigned long src_gpfn, unsigned long num_src_gpfn)
1612 unsigned long i = 0;
1614 BUG_ON((src_gpfn % PTRS_PER_PTE) != 0);
1615 BUG_ON((num_src_gpfn % PTRS_PER_PTE) != 0);
1617 while (i < num_src_gpfn) {
1618 volatile pte_t* pte;
1620 pte = lookup_noalloc_domain_pte(src_dom, (src_gpfn + i) << PAGE_SHIFT);
1621 if (pte == NULL) {
1622 i++;
1623 continue;
1626 if (expose_p2m_page(dest_dom,
1627 (dest_gpfn << PAGE_SHIFT) + i * sizeof(pte_t),
1628 virt_to_page(pte)) < 0) {
1629 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1630 return -EAGAIN;
1633 // skip to next pte page
1634 i = P2M_PFN_ROUNDDOWN(i + PTRS_PER_PTE);
1636 return 0;
1639 // expose p2m_pte_zero_page
1640 static int
1641 expose_zero_page(struct domain* dest_dom, unsigned long dest_gpfn,
1642 unsigned long num_src_gpfn)
1644 unsigned long i;
1646 for (i = 0; i < P2M_NUM_PFN(num_src_gpfn); i++) {
1647 volatile pte_t* pte;
1648 pte = lookup_noalloc_domain_pte(dest_dom,
1649 (dest_gpfn + i) << PAGE_SHIFT);
1650 if (pte == NULL || pte_present(*pte))
1651 continue;
1653 if (expose_p2m_page(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1654 p2m_pte_zero_page) < 0) {
1655 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n",
1656 __func__);
1657 return -EAGAIN;
1660 return 0;
1663 static int
1664 expose_p2m(struct domain* dest_dom, unsigned long dest_gpfn,
1665 struct domain* src_dom,
1666 unsigned long src_gpfn, unsigned long num_src_gpfn)
1668 if (allocate_pgd_pmd(dest_dom, dest_gpfn,
1669 src_dom, src_gpfn, num_src_gpfn))
1670 return -ENOMEM;
1672 if (expose_p2m_range(dest_dom, dest_gpfn,
1673 src_dom, src_gpfn, num_src_gpfn))
1674 return -EAGAIN;
1676 if (expose_zero_page(dest_dom, dest_gpfn, num_src_gpfn))
1677 return -EAGAIN;
1679 return 0;
1682 static void
1683 unexpose_p2m(struct domain* dest_dom,
1684 unsigned long dest_gpfn, unsigned long num_dest_gpfn)
1686 unsigned long i;
1688 for (i = 0; i < num_dest_gpfn; i++) {
1689 zap_domain_page_one(dest_dom, (dest_gpfn + i) << PAGE_SHIFT,
1690 0, INVALID_MFN);
1694 // It is possible to optimize this loop, but it isn't performance critical.
1695 unsigned long
1696 dom0vp_expose_p2m(struct domain* d,
1697 unsigned long conv_start_gpfn,
1698 unsigned long assign_start_gpfn,
1699 unsigned long expose_size, unsigned long granule_pfn)
1701 unsigned long ret;
1702 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1704 if ((expose_size % PAGE_SIZE) != 0 ||
1705 (granule_pfn % PTRS_PER_PTE) != 0 ||
1706 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1707 (conv_start_gpfn % granule_pfn) != 0 ||
1708 (assign_start_gpfn % granule_pfn) != 0 ||
1709 (expose_num_pfn % granule_pfn) != 0) {
1710 gdprintk(XENLOG_INFO,
1711 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1712 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1713 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1714 return -EINVAL;
1717 if (granule_pfn != PTRS_PER_PTE) {
1718 gdprintk(XENLOG_INFO,
1719 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1720 __func__, granule_pfn, PTRS_PER_PTE);
1721 return -ENOSYS;
1723 ret = expose_p2m(d, assign_start_gpfn,
1724 d, conv_start_gpfn, expose_num_pfn);
1725 return ret;
1728 static int
1729 memmap_info_copy_from_guest(struct xen_ia64_memmap_info* memmap_info,
1730 char** memmap_p,
1731 XEN_GUEST_HANDLE(char) buffer)
1733 char *memmap;
1734 char *p;
1735 char *memmap_end;
1736 efi_memory_desc_t *md;
1737 unsigned long start;
1738 unsigned long end;
1739 efi_memory_desc_t *prev_md;
1741 if (copy_from_guest((char*)memmap_info, buffer, sizeof(*memmap_info)))
1742 return -EFAULT;
1743 if (memmap_info->efi_memdesc_size < sizeof(efi_memory_desc_t) ||
1744 memmap_info->efi_memmap_size < memmap_info->efi_memdesc_size ||
1745 (memmap_info->efi_memmap_size % memmap_info->efi_memdesc_size) != 0)
1746 return -EINVAL;
1748 memmap = _xmalloc(memmap_info->efi_memmap_size,
1749 __alignof__(efi_memory_desc_t));
1750 if (memmap == NULL)
1751 return -ENOMEM;
1752 if (copy_from_guest_offset(memmap, buffer, sizeof(*memmap_info),
1753 memmap_info->efi_memmap_size)) {
1754 xfree(memmap);
1755 return -EFAULT;
1758 /* integrity check & simplify */
1759 sort(memmap, memmap_info->efi_memmap_size / memmap_info->efi_memdesc_size,
1760 memmap_info->efi_memdesc_size, efi_mdt_cmp, NULL);
1762 /* alignment & overlap check */
1763 prev_md = NULL;
1764 p = memmap;
1765 memmap_end = memmap + memmap_info->efi_memmap_size;
1766 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1767 md = (efi_memory_desc_t*)p;
1768 start = md->phys_addr;
1770 if (start & ((1UL << EFI_PAGE_SHIFT) - 1) || md->num_pages == 0) {
1771 xfree(memmap);
1772 return -EINVAL;
1775 if (prev_md != NULL) {
1776 unsigned long prev_end = MD_END(prev_md);
1777 if (prev_end > start) {
1778 xfree(memmap);
1779 return -EINVAL;
1783 prev_md = (efi_memory_desc_t *)p;
1786 /* coalesce */
1787 prev_md = NULL;
1788 p = memmap;
1789 while (p < memmap_end) {
1790 md = (efi_memory_desc_t*)p;
1791 start = md->phys_addr;
1792 end = MD_END(md);
1794 start = P2M_PFN_ROUNDDOWN(start >> PAGE_SHIFT) << PAGE_SHIFT;
1795 end = P2M_PFN_ROUNDUP(end >> PAGE_SHIFT) << PAGE_SHIFT;
1796 md->phys_addr = start;
1797 md->num_pages = (end - start) >> EFI_PAGE_SHIFT;
1799 if (prev_md != NULL) {
1800 unsigned long prev_end = MD_END(prev_md);
1801 if (prev_end >= start) {
1802 size_t left;
1803 end = max(prev_end, end);
1804 prev_md->num_pages = (end - prev_md->phys_addr) >> EFI_PAGE_SHIFT;
1806 left = memmap_end - p;
1807 if (left > memmap_info->efi_memdesc_size) {
1808 left -= memmap_info->efi_memdesc_size;
1809 memmove(p, p + memmap_info->efi_memdesc_size, left);
1812 memmap_info->efi_memmap_size -= memmap_info->efi_memdesc_size;
1813 memmap_end -= memmap_info->efi_memdesc_size;
1814 continue;
1818 prev_md = md;
1819 p += memmap_info->efi_memdesc_size;
1822 if (copy_to_guest(buffer, (char*)memmap_info, sizeof(*memmap_info)) ||
1823 copy_to_guest_offset(buffer, sizeof(*memmap_info),
1824 (char*)memmap, memmap_info->efi_memmap_size)) {
1825 xfree(memmap);
1826 return -EFAULT;
1829 *memmap_p = memmap;
1830 return 0;
1833 static int
1834 foreign_p2m_allocate_pte(struct domain* d,
1835 const struct xen_ia64_memmap_info* memmap_info,
1836 const void* memmap)
1838 const void* memmap_end = memmap + memmap_info->efi_memmap_size;
1839 const void* p;
1841 for (p = memmap; p < memmap_end; p += memmap_info->efi_memdesc_size) {
1842 const efi_memory_desc_t* md = p;
1843 unsigned long start = md->phys_addr;
1844 unsigned long end = MD_END(md);
1845 unsigned long gpaddr;
1847 for (gpaddr = start; gpaddr < end; gpaddr += PAGE_SIZE) {
1848 if (lookup_alloc_domain_pte(d, gpaddr) == NULL) {
1849 return -ENOMEM;
1854 return 0;
1857 struct foreign_p2m_region {
1858 unsigned long gpfn;
1859 unsigned long num_gpfn;
1860 };
1862 struct foreign_p2m_entry {
1863 struct list_head list;
1864 int busy;
1866 /* src domain */
1867 struct domain* src_dom;
1869 /* region into which foreign p2m table is mapped */
1870 unsigned long gpfn;
1871 unsigned long num_gpfn;
1872 unsigned int num_region;
1873 struct foreign_p2m_region region[0];
1874 };
1876 /* caller must increment the reference count of src_dom */
1877 static int
1878 foreign_p2m_alloc(struct foreign_p2m* foreign_p2m,
1879 unsigned long dest_gpfn, struct domain* src_dom,
1880 struct xen_ia64_memmap_info* memmap_info, void* memmap,
1881 struct foreign_p2m_entry** entryp)
1883 void* memmap_end = memmap + memmap_info->efi_memmap_size;
1884 efi_memory_desc_t* md;
1885 unsigned long dest_gpfn_end;
1886 unsigned long src_gpfn;
1887 unsigned long src_gpfn_end;
1889 unsigned int num_region;
1890 struct foreign_p2m_entry* entry;
1891 struct foreign_p2m_entry* prev;
1892 struct foreign_p2m_entry* pos;
1894 num_region = (memmap_end - memmap) / memmap_info->efi_memdesc_size;
1896 md = memmap;
1897 src_gpfn = P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
1899 md = memmap + (num_region - 1) * memmap_info->efi_memdesc_size;
1900 src_gpfn_end = MD_END(md) >> PAGE_SHIFT;
1901 if (src_gpfn_end >
1902 P2M_PFN_ROUNDUP(src_dom->arch.convmem_end >> PAGE_SHIFT))
1903 return -EINVAL;
1905 src_gpfn_end = P2M_PFN_ROUNDUP(src_gpfn_end);
1906 dest_gpfn_end = dest_gpfn + P2M_NUM_PFN(src_gpfn_end - src_gpfn);
1907 entry = _xmalloc(sizeof(*entry) + num_region * sizeof(entry->region[0]),
1908 __alignof__(*entry));
1909 if (entry == NULL)
1910 return -ENOMEM;
1912 entry->busy = 1;
1913 entry->gpfn = dest_gpfn;
1914 entry->num_gpfn = dest_gpfn_end - dest_gpfn;
1915 entry->src_dom = src_dom;
1916 entry->num_region = 0;
1917 memset(entry->region, 0, sizeof(entry->region[0]) * num_region);
1918 prev = NULL;
1920 spin_lock(&foreign_p2m->lock);
1921 if (list_empty(&foreign_p2m->head))
1922 prev = (struct foreign_p2m_entry*)&foreign_p2m->head;
1924 list_for_each_entry(pos, &foreign_p2m->head, list) {
1925 if (pos->gpfn + pos->num_gpfn < dest_gpfn) {
1926 prev = pos;
1927 continue;
1930 if (dest_gpfn_end < pos->gpfn) {
1931 if (prev != NULL && prev->gpfn + prev->num_gpfn > dest_gpfn)
1932 prev = NULL;/* overlap */
1933 break;
1936 /* overlap */
1937 prev = NULL;
1938 break;
1940 if (prev != NULL) {
1941 list_add(&entry->list, &prev->list);
1942 spin_unlock(&foreign_p2m->lock);
1943 *entryp = entry;
1944 return 0;
1946 spin_unlock(&foreign_p2m->lock);
1947 xfree(entry);
1948 return -EBUSY;
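/*
 * Illustrative sketch (not part of mm.c): the list walk above keeps
 * foreign_p2m->head sorted by gpfn and refuses any request whose
 * destination range touches an existing entry.  The hypothetical predicate
 * below restates the test applied to one candidate range [gpfn, gpfn_end)
 * against one existing entry.
 */
static inline int
foreign_p2m_range_conflicts(unsigned long gpfn, unsigned long gpfn_end,
                            const struct foreign_p2m_entry* pos)
{
    /* Conflict unless one range ends strictly before the other begins. */
    return !(gpfn_end < pos->gpfn || pos->gpfn + pos->num_gpfn < gpfn);
}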
1951 static void
1952 foreign_p2m_unexpose(struct domain* dest_dom, struct foreign_p2m_entry* entry)
1954 unsigned int i;
1956 BUG_ON(!entry->busy);
1957 for (i = 0; i < entry->num_region; i++)
1958 unexpose_p2m(dest_dom,
1959 entry->region[i].gpfn, entry->region[i].num_gpfn);
1962 static void
1963 foreign_p2m_unbusy(struct foreign_p2m* foreign_p2m,
1964 struct foreign_p2m_entry* entry)
1966 spin_lock(&foreign_p2m->lock);
1967 BUG_ON(!entry->busy);
1968 entry->busy = 0;
1969 spin_unlock(&foreign_p2m->lock);
1972 static void
1973 foreign_p2m_free(struct foreign_p2m* foreign_p2m,
1974 struct foreign_p2m_entry* entry)
1976 spin_lock(&foreign_p2m->lock);
1977 BUG_ON(!entry->busy);
1978 list_del(&entry->list);
1979 spin_unlock(&foreign_p2m->lock);
1981 put_domain(entry->src_dom);
1982 xfree(entry);
1985 void
1986 foreign_p2m_init(struct domain* d)
1988 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
1989 INIT_LIST_HEAD(&foreign_p2m->head);
1990 spin_lock_init(&foreign_p2m->lock);
1993 void
1994 foreign_p2m_destroy(struct domain* d)
1996 struct foreign_p2m* foreign_p2m = &d->arch.foreign_p2m;
1997 struct foreign_p2m_entry* entry;
1998 struct foreign_p2m_entry* n;
2000 spin_lock(&foreign_p2m->lock);
2001 list_for_each_entry_safe(entry, n, &foreign_p2m->head, list) {
2002 /* mm_teardown() cleared p2m table already */
2003 /* foreign_p2m_unexpose(d, entry);*/
2004 list_del(&entry->list);
2005 put_domain(entry->src_dom);
2006 xfree(entry);
2008 spin_unlock(&foreign_p2m->lock);
2011 unsigned long
2012 dom0vp_expose_foreign_p2m(struct domain* dest_dom,
2013 unsigned long dest_gpfn,
2014 domid_t domid,
2015 XEN_GUEST_HANDLE(char) buffer,
2016 unsigned long flags)
2018 unsigned long ret = 0;
2019 struct domain* src_dom;
2020 struct xen_ia64_memmap_info memmap_info;
2021 char* memmap;
2022 void* memmap_end;
2023 void* p;
2025 struct foreign_p2m_entry* entry;
2027 ret = memmap_info_copy_from_guest(&memmap_info, &memmap, buffer);
2028 if (ret != 0)
2029 return ret;
2031 dest_dom = rcu_lock_domain(dest_dom);
2032 if (dest_dom == NULL) {
2033 ret = -EINVAL;
2034 goto out;
2036 #if 1
2037 // Exposing a domain's p2m table to itself isn't allowed.
2038 // Otherwise the domain couldn't be destroyed, because
2039 // no one would decrement the domain reference count.
2040 if (domid == dest_dom->domain_id) {
2041 ret = -EINVAL;
2042 goto out;
2044 #endif
2046 src_dom = get_domain_by_id(domid);
2047 if (src_dom == NULL) {
2048 ret = -EINVAL;
2049 goto out_unlock;
2052 if (flags & IA64_DOM0VP_EFP_ALLOC_PTE) {
2053 ret = foreign_p2m_allocate_pte(src_dom, &memmap_info, memmap);
2054 if (ret != 0)
2055 goto out_unlock;
2058 ret = foreign_p2m_alloc(&dest_dom->arch.foreign_p2m, dest_gpfn,
2059 src_dom, &memmap_info, memmap, &entry);
2060 if (ret != 0)
2061 goto out_unlock;
2063 memmap_end = memmap + memmap_info.efi_memmap_size;
2064 for (p = memmap; p < memmap_end; p += memmap_info.efi_memdesc_size) {
2065 efi_memory_desc_t* md = p;
2066 unsigned long src_gpfn =
2067 P2M_PFN_ROUNDDOWN(md->phys_addr >> PAGE_SHIFT);
2068 unsigned long src_gpfn_end =
2069 P2M_PFN_ROUNDUP(MD_END(md) >> PAGE_SHIFT);
2070 unsigned long num_src_gpfn = src_gpfn_end - src_gpfn;
2072 ret = expose_p2m(dest_dom, dest_gpfn + src_gpfn / PTRS_PER_PTE,
2073 src_dom, src_gpfn, num_src_gpfn);
2074 if (ret != 0)
2075 break;
2077 entry->region[entry->num_region].gpfn =
2078 dest_gpfn + src_gpfn / PTRS_PER_PTE;
2079 entry->region[entry->num_region].num_gpfn = P2M_NUM_PFN(num_src_gpfn);
2080 entry->num_region++;
2083 if (ret == 0) {
2084 foreign_p2m_unbusy(&dest_dom->arch.foreign_p2m, entry);
2085 } else {
2086 foreign_p2m_unexpose(dest_dom, entry);
2087 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2090 out_unlock:
2091 rcu_unlock_domain(dest_dom);
2092 out:
2093 xfree(memmap);
2094 return ret;
2097 unsigned long
2098 dom0vp_unexpose_foreign_p2m(struct domain* dest_dom,
2099 unsigned long dest_gpfn,
2100 domid_t domid)
2102 int ret = -ENOENT;
2103 struct foreign_p2m* foreign_p2m = &dest_dom->arch.foreign_p2m;
2104 struct foreign_p2m_entry* entry;
2106 dest_dom = rcu_lock_domain(dest_dom);
2107 if (dest_dom == NULL)
2108 return ret;
2109 spin_lock(&foreign_p2m->lock);
2110 list_for_each_entry(entry, &foreign_p2m->head, list) {
2111 if (entry->gpfn < dest_gpfn)
2112 continue;
2113 if (dest_gpfn < entry->gpfn)
2114 break;
2116 if (domid == entry->src_dom->domain_id)
2117 ret = 0;
2118 else
2119 ret = -EINVAL;
2120 break;
2122 if (ret == 0) {
2123 if (entry->busy == 0)
2124 entry->busy = 1;
2125 else
2126 ret = -EBUSY;
2128 spin_unlock(&foreign_p2m->lock);
2130 if (ret == 0) {
2131 foreign_p2m_unexpose(dest_dom, entry);
2132 foreign_p2m_free(&dest_dom->arch.foreign_p2m, entry);
2134 rcu_unlock_domain(dest_dom);
2135 return ret;
2137 #endif
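/*
 * Illustrative sketch (not part of mm.c, assumes the foreign-p2m support
 * above is compiled in): the expected pairing of the two entry points.
 * A privileged domain maps the source domain's p2m table into its own
 * physmap at dest_gpfn and later tears the window down again.  The wrapper
 * below is hypothetical; real callers reach these functions through the
 * dom0vp hypercall dispatcher.
 */
static unsigned long
foreign_p2m_expose_example(struct domain* dest_dom, unsigned long dest_gpfn,
                           domid_t domid, XEN_GUEST_HANDLE(char) memmap_buf)
{
    /* IA64_DOM0VP_EFP_ALLOC_PTE pre-populates the source domain's p2m
     * page-table pages over the whole memmap range
     * (see foreign_p2m_allocate_pte() above). */
    unsigned long rc = dom0vp_expose_foreign_p2m(dest_dom, dest_gpfn, domid,
                                                 memmap_buf,
                                                 IA64_DOM0VP_EFP_ALLOC_PTE);
    if (rc != 0)
        return rc;

    /* ... read the exposed p2m pages through dest_gpfn ... */

    return dom0vp_unexpose_foreign_p2m(dest_dom, dest_gpfn, domid);
}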
2139 // grant table host mapping
2140 // mpaddr: host_addr: pseudo physical address
2141 // mfn: frame: machine page frame
2142 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
2143 int
2144 create_grant_host_mapping(unsigned long gpaddr,
2145 unsigned long mfn, unsigned int flags)
2147 struct domain* d = current->domain;
2148 struct page_info* page;
2149 int ret;
2151 if (flags & (GNTMAP_device_map |
2152 GNTMAP_application_map | GNTMAP_contains_pte)) {
2153 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
2154 return GNTST_general_error;
2157 BUG_ON(!mfn_valid(mfn));
2158 page = mfn_to_page(mfn);
2159 ret = get_page(page, page_get_owner(page));
2160 BUG_ON(ret == 0);
2161 assign_domain_page_replace(d, gpaddr, mfn,
2162 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2163 ASSIGN_tlb_track |
2164 #endif
2165 ((flags & GNTMAP_readonly) ?
2166 ASSIGN_readonly : ASSIGN_writable));
2167 perfc_incr(create_grant_host_mapping);
2168 return GNTST_okay;
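/*
 * Illustrative sketch (not part of mm.c): which grant-map flags the handler
 * above accepts.  Only plain host mappings, optionally read-only, are
 * supported; device, application and contains-pte mappings are rejected
 * with GNTST_general_error.  The helper below is hypothetical.
 */
static inline int
grant_host_mapping_flags_supported(unsigned int flags)
{
    return (flags & (GNTMAP_device_map |
                     GNTMAP_application_map |
                     GNTMAP_contains_pte)) == 0;
}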
2171 // grant table host unmapping
2172 int
2173 replace_grant_host_mapping(unsigned long gpaddr,
2174 unsigned long mfn, unsigned long new_gpaddr, unsigned int flags)
2176 struct domain* d = current->domain;
2177 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
2178 volatile pte_t* pte;
2179 unsigned long cur_arflags;
2180 pte_t cur_pte;
2181 pte_t new_pte = __pte(0);
2182 pte_t old_pte;
2183 struct page_info* page = mfn_to_page(mfn);
2184 struct page_info* new_page = NULL;
2185 volatile pte_t* new_page_pte = NULL;
2187 if (new_gpaddr) {
2188 new_page_pte = lookup_noalloc_domain_pte_none(d, new_gpaddr);
2189 if (likely(new_page_pte != NULL)) {
2190 new_pte = ptep_get_and_clear(&d->arch.mm,
2191 new_gpaddr, new_page_pte);
2192 if (likely(pte_present(new_pte))) {
2193 unsigned long new_page_mfn;
2194 struct domain* page_owner;
2196 new_page_mfn = pte_pfn(new_pte);
2197 new_page = mfn_to_page(new_page_mfn);
2198 page_owner = page_get_owner(new_page);
2199 if (unlikely(page_owner == NULL)) {
2200 gdprintk(XENLOG_INFO,
2201 "%s: page_owner == NULL "
2202 "gpaddr 0x%lx mfn 0x%lx "
2203 "new_gpaddr 0x%lx mfn 0x%lx\n",
2204 __func__, gpaddr, mfn, new_gpaddr, new_page_mfn);
2205 new_page = NULL; /* prevent domain_put_page() */
2206 goto out;
2209 /*
2210 * domain_put_page(clear_PGC_allocated = 0)
2211 * doesn't decrement the refcount of a page with
2212 * pte_pgc_allocated() = 1. Be careful.
2213 */
2214 if (unlikely(!pte_pgc_allocated(new_pte))) {
2215 /* domain_put_page() decrements page refcount. adjust it. */
2216 if (get_page(new_page, page_owner)) {
2217 gdprintk(XENLOG_INFO,
2218 "%s: get_page() failed. "
2219 "gpaddr 0x%lx mfn 0x%lx "
2220 "new_gpaddr 0x%lx mfn 0x%lx\n",
2221 __func__, gpaddr, mfn,
2222 new_gpaddr, new_page_mfn);
2223 goto out;
2226 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 0);
2227 } else
2228 new_pte = __pte(0);
2232 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
2233 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
2234 return GNTST_general_error;
2237 pte = lookup_noalloc_domain_pte(d, gpaddr);
2238 if (pte == NULL) {
2239 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
2240 __func__, gpaddr, mfn);
2241 goto out;
2244 again:
2245 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
2246 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
2247 if (!pte_present(cur_pte) ||
2248 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
2249 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
2250 __func__, gpaddr, mfn, pte_val(cur_pte));
2251 goto out;
2254 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
2255 if (unlikely(!pte_present(old_pte))) {
2256 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx"
2257 " cur_pte 0x%lx old_pte 0x%lx\n",
2258 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
2259 goto out;
2261 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
2262 if (pte_pfn(old_pte) == mfn) {
2263 goto again;
2265 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
2266 "0x%lx old_pte 0x%lx\n",
2267 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
2268 goto out;
2270 BUG_ON(pte_pfn(old_pte) != mfn);
2272 /* try_to_clear_PGC_allocate(d, page) is not needed. */
2273 BUG_ON(page_get_owner(page) == d &&
2274 get_gpfn_from_mfn(mfn) == gpfn);
2275 BUG_ON(pte_pgc_allocated(old_pte));
2276 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
2278 perfc_incr(replace_grant_host_mapping);
2279 return GNTST_okay;
2281 out:
2282 if (new_page)
2283 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 1);
2284 return GNTST_general_error;
2287 // This heavily depends on the struct page_info layout.
2288 // gnttab_transfer() calls steal_page() with memflags = 0.
2289 // For a grant table transfer, we must backfill the p2m slot with a fresh page.
2290 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount.
2291 // For a memory exchange, we don't have to backfill the slot because
2292 // memory_exchange() does it.
2293 int
2294 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
2296 #if 0 /* if big endian */
2297 # error "implement big endian version of steal_page()"
2298 #endif
2299 u32 _d, _nd;
2300 u64 x, nx, y;
2302 if (page_get_owner(page) != d) {
2303 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
2304 __func__, d, page_get_owner(page));
2305 return -1;
2308 if (!(memflags & MEMF_no_refcount)) {
2309 unsigned long gpfn;
2310 struct page_info *new;
2311 unsigned long new_mfn;
2312 int ret;
2314 new = alloc_domheap_page(d);
2315 if (new == NULL) {
2316 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
2317 return -1;
2319 // zero out pages for security reasons
2320 clear_page(page_to_virt(new));
2321 // assign_domain_page_cmpxchg_rel() has release semantics
2322 // so smp_mb() isn't needed.
2324 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
2325 if (gpfn == INVALID_M2P_ENTRY) {
2326 free_domheap_page(new);
2327 return -1;
2329 new_mfn = page_to_mfn(new);
2330 set_gpfn_from_mfn(new_mfn, gpfn);
2331 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
2332 // has release semantics.
2334 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
2335 ASSIGN_writable |
2336 ASSIGN_pgc_allocated, 0);
2337 if (ret < 0) {
2338 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
2339 ret);
2340 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
2341 free_domheap_page(new);
2342 return -1;
2344 perfc_incr(steal_page_refcount);
2347 spin_lock(&d->page_alloc_lock);
2349 /*
2350 * The tricky bit: atomically release ownership while there is just one
2351 * benign reference to the page (PGC_allocated). If that reference
2352 * disappears then the deallocation routine will safely spin.
2353 */
2354 _d = pickle_domptr(d);
2355 y = *((u64*)&page->count_info);
2356 do {
2357 x = y;
2358 nx = x & 0xffffffff;
2359 // page->count_info: untouched
2360 // page->u.inuse._domain = 0;
2361 _nd = x >> 32;
2363 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
2364 (1 | PGC_allocated))) ||
2365 unlikely(_nd != _d)) {
2366 struct domain* nd = unpickle_domptr(_nd);
2367 if (nd == NULL) {
2368 gdprintk(XENLOG_INFO, "gnttab_transfer: "
2369 "Bad page %p: ed=%p(%u) 0x%x, "
2370 "sd=%p 0x%x,"
2371 " caf=%016lx, taf=%" PRtype_info
2372 " memflags 0x%x\n",
2373 (void *) page_to_mfn(page),
2374 d, d->domain_id, _d,
2375 nd, _nd,
2376 x,
2377 page->u.inuse.type_info,
2378 memflags);
2379 } else {
2380 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
2381 "Bad page %p: ed=%p(%u) 0x%x, "
2382 "sd=%p(%u) 0x%x,"
2383 " caf=%016lx, taf=%" PRtype_info
2384 " memflags 0x%x\n",
2385 (void *) page_to_mfn(page),
2386 d, d->domain_id, _d,
2387 nd, nd->domain_id, _nd,
2388 x,
2389 page->u.inuse.type_info,
2390 memflags);
2392 spin_unlock(&d->page_alloc_lock);
2393 return -1;
2396 y = cmpxchg((u64*)&page->count_info, x, nx);
2397 } while (unlikely(y != x));
2399 /*
2400 * Unlink from 'd'. At least one reference remains (now anonymous), so
2401 * no one else is spinning to try to delete this page from 'd'.
2402 */
2403 if ( !(memflags & MEMF_no_refcount) )
2404 d->tot_pages--;
2405 list_del(&page->list);
2407 spin_unlock(&d->page_alloc_lock);
2408 perfc_incr(steal_page);
2409 return 0;
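/*
 * Illustrative sketch (not part of mm.c): the two calling conventions the
 * comments above describe.  The callers named there (gnttab_transfer() and
 * memory_exchange()) live in common code; the helper below is hypothetical
 * and only shows which memflags value each style of caller passes.
 */
static int
steal_page_example(struct domain *d, struct page_info *page, int for_exchange)
{
    if (for_exchange)
        /* memory_exchange() style: it refills the physmap itself later. */
        return steal_page(d, page, MEMF_no_refcount);

    /* gnttab_transfer() style: steal_page() swaps a fresh, zeroed page
     * into the p2m slot before releasing ownership. */
    return steal_page(d, page, 0);
}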
2412 void
2413 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
2414 unsigned long mfn)
2416 BUG_ON(!mfn_valid(mfn));
2417 BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
2418 set_gpfn_from_mfn(mfn, gpfn);
2419 smp_mb();
2420 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
2421 ASSIGN_writable | ASSIGN_pgc_allocated);
2423 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
2425 perfc_incr(guest_physmap_add_page);
2428 void
2429 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
2430 unsigned long mfn)
2432 BUG_ON(mfn == 0);//XXX
2433 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
2434 perfc_incr(guest_physmap_remove_page);
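/*
 * Illustrative sketch (not part of mm.c): guest_physmap_add_page() above
 * expects a freshly allocated, singly referenced page (count_info ==
 * PGC_allocated | 1).  The hypothetical helper below shows the usual way
 * such a page is obtained and plugged into the physmap.
 */
static int
guest_physmap_populate_example(struct domain *d, unsigned long gpfn)
{
    struct page_info *page = alloc_domheap_page(d);

    if (page == NULL)
        return -ENOMEM;
    clear_page(page_to_virt(page));              /* don't leak old contents */
    guest_physmap_add_page(d, gpfn, page_to_mfn(page));
    return 0;
}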
2437 static void
2438 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
2439 volatile pte_t* ptep, pte_t old_pte,
2440 struct page_info* page)
2442 #ifdef CONFIG_XEN_IA64_TLB_TRACK
2443 struct tlb_track_entry* entry;
2444 #endif
2446 if (shadow_mode_enabled(d))
2447 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
2449 #ifndef CONFIG_XEN_IA64_TLB_TRACK
2450 // XXX sledgehammer: flush the whole vTLB.
2451 // A finer-grained range flush would be preferable.
2452 domain_flush_vtlb_all(d);
2453 put_page(page);
2454 #else
2455 switch (tlb_track_search_and_remove(d->arch.tlb_track,
2456 ptep, old_pte, &entry)) {
2457 case TLB_TRACK_NOT_TRACKED:
2458 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
2459 /* This page was zapped from this domain
2460 * by memory decrease, memory exchange or dom0vp_zap_physmap,
2461 * i.e. either the page is being returned to xen
2462 * (balloon driver or DMA page allocation), or
2463 * a foreign-domain mapped page is being unmapped from the domain.
2464 * In the former case the page is about to be freed, so
2465 * the freeing could be deferred and batched.
2466 * In the latter case the page is unmapped, so
2467 * it needs to be flushed. To optimize that, we would queue
2468 * the page and flush the vTLB only once;
2469 * i.e. the caller would have to call dfree_flush() explicitly.
2470 */
2471 domain_flush_vtlb_all(d);
2472 put_page(page);
2473 break;
2474 case TLB_TRACK_NOT_FOUND:
2475 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
2476 /* This page is zapped from this domain
2477 * by grant table page unmap.
2478 * Luckily the domain that mapped this page never
2479 * accessed it, so we don't have to flush the vTLB.
2480 * Probably the domain only did DMA to it.
2481 */
2482 /* do nothing */
2483 put_page(page);
2484 break;
2485 case TLB_TRACK_FOUND:
2486 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
2487 /* This page is zapped from this domain
2488 * by grant table page unmap.
2489 * Fortunately this page is accessed via only one virtual
2490 * memory address, so it is easy to flush.
2491 */
2492 domain_flush_vtlb_track_entry(d, entry);
2493 tlb_track_free_entry(d->arch.tlb_track, entry);
2494 put_page(page);
2495 break;
2496 case TLB_TRACK_MANY:
2497 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
2498 /* This page is zapped from this domain
2499 * by grant table page unmap.
2500 * Unfortunately this page is accessed via many virtual
2501 * memory addresses (or too many times via a single virtual address),
2502 * so we abandoned tracking the virtual addresses;
2503 * a full vTLB flush is necessary.
2504 */
2505 domain_flush_vtlb_all(d);
2506 put_page(page);
2507 break;
2508 case TLB_TRACK_AGAIN:
2509 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
2510 BUG();
2511 break;
2513 #endif
2514 perfc_incr(domain_page_flush_and_put);
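/*
 * Illustrative sketch (not part of mm.c, assumes CONFIG_XEN_IA64_TLB_TRACK):
 * a condensed view of the flush policy implemented by the switch above.
 * The hypothetical helper below maps each tlb_track_search_and_remove()
 * result onto the scope of the vTLB flush performed before the page
 * reference is dropped.
 */
enum vtlb_flush_scope { VTLB_FLUSH_NONE, VTLB_FLUSH_ONE_ENTRY, VTLB_FLUSH_ALL };

static inline enum vtlb_flush_scope
tlb_track_flush_scope(int tlb_track_result)
{
    switch (tlb_track_result) {
    case TLB_TRACK_NOT_FOUND:   return VTLB_FLUSH_NONE;      /* never accessed */
    case TLB_TRACK_FOUND:       return VTLB_FLUSH_ONE_ENTRY; /* one known mapping */
    case TLB_TRACK_NOT_TRACKED:
    case TLB_TRACK_MANY:        return VTLB_FLUSH_ALL;       /* no usable tracking */
    default:                    return VTLB_FLUSH_ALL;       /* TLB_TRACK_AGAIN: BUG() */
    }
}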
2517 int
2518 domain_page_mapped(struct domain* d, unsigned long mpaddr)
2520 volatile pte_t * pte;
2522 pte = lookup_noalloc_domain_pte(d, mpaddr);
2523 if (pte != NULL && !pte_none(*pte))
2524 return 1;
2525 return 0;
2528 /* Flush cache of domain d. */
2529 void domain_cache_flush (struct domain *d, int sync_only)
2531 struct mm_struct *mm = &d->arch.mm;
2532 volatile pgd_t *pgd = mm->pgd;
2533 unsigned long maddr;
2534 int i,j,k, l;
2535 int nbr_page = 0;
2536 void (*flush_func)(unsigned long start, unsigned long end);
2537 extern void flush_dcache_range (unsigned long, unsigned long);
2539 if (sync_only)
2540 flush_func = &flush_icache_range;
2541 else
2542 flush_func = &flush_dcache_range;
2544 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
2545 volatile pud_t *pud;
2546 if (!pgd_present(*pgd)) // acquire semantics
2547 continue;
2548 pud = pud_offset(pgd, 0);
2549 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
2550 volatile pmd_t *pmd;
2551 if (!pud_present(*pud)) // acquire semantics
2552 continue;
2553 pmd = pmd_offset(pud, 0);
2554 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
2555 volatile pte_t *pte;
2556 if (!pmd_present(*pmd)) // acquire semantics
2557 continue;
2558 pte = pte_offset_map(pmd, 0);
2559 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
2560 if (!pte_present(*pte)) // acquire semantics
2561 continue;
2562 /* Convert PTE to maddr. */
2563 maddr = __va_ul (pte_val(*pte)
2564 & _PAGE_PPN_MASK);
2565 (*flush_func)(maddr, maddr + PAGE_SIZE);
2566 nbr_page++;
2571 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
2574 #ifdef VERBOSE
2575 #define MEM_LOG(_f, _a...) \
2576 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
2577 current->domain->domain_id , __LINE__ , ## _a )
2578 #else
2579 #define MEM_LOG(_f, _a...) ((void)0)
2580 #endif
2582 static void free_page_type(struct page_info *page, u32 type)
2586 static int alloc_page_type(struct page_info *page, u32 type)
2588 return 1;
2591 static int opt_p2m_xenheap;
2592 boolean_param("p2m_xenheap", opt_p2m_xenheap);
2594 void *pgtable_quicklist_alloc(void)
2596 void *p;
2598 BUG_ON(dom_p2m == NULL);
2599 if (!opt_p2m_xenheap) {
2600 struct page_info *page = alloc_domheap_page(dom_p2m);
2601 if (page == NULL)
2602 return NULL;
2603 p = page_to_virt(page);
2604 clear_page(p);
2605 return p;
2607 p = alloc_xenheap_pages(0);
2608 if (p) {
2609 clear_page(p);
2610 /*
2611 * This page should be read-only. At the moment the third
2612 * argument has no effect; it should be 1 once that is supported.
2613 */
2614 share_xen_page_with_guest(virt_to_page(p), dom_p2m, 0);
2616 return p;
2619 void pgtable_quicklist_free(void *pgtable_entry)
2621 struct page_info* page = virt_to_page(pgtable_entry);
2623 BUG_ON(page_get_owner(page) != dom_p2m);
2624 BUG_ON(page->count_info != (1 | PGC_allocated));
2626 put_page(page);
2627 if (opt_p2m_xenheap)
2628 free_xenheap_page(pgtable_entry);
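/*
 * Illustrative sketch (not part of mm.c): the two routines above are the
 * allocation interface for p2m page-table pages, so every successful
 * pgtable_quicklist_alloc() should eventually be matched by a
 * pgtable_quicklist_free() of the same pointer.  The helper below is
 * hypothetical.
 */
static int
pgtable_page_roundtrip_example(void)
{
    void *pt = pgtable_quicklist_alloc();   /* zeroed page owned by dom_p2m */

    if (pt == NULL)
        return -ENOMEM;
    /* ... install PTEs, later tear the table down ... */
    pgtable_quicklist_free(pt);
    return 0;
}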
2631 void put_page_type(struct page_info *page)
2633 u64 nx, x, y = page->u.inuse.type_info;
2635 again:
2636 do {
2637 x = y;
2638 nx = x - 1;
2640 ASSERT((x & PGT_count_mask) != 0);
2642 /*
2643 * The page should always be validated while a reference is held. The
2644 * exception is during domain destruction, when we forcibly invalidate
2645 * page-table pages if we detect a referential loop.
2646 * See domain.c:relinquish_list().
2647 */
2648 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
2650 if ( unlikely((nx & PGT_count_mask) == 0) )
2652 /* Record TLB information for flush later. Races are harmless. */
2653 page->tlbflush_timestamp = tlbflush_current_time();
2655 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2656 likely(nx & PGT_validated) )
2658 /*
2659 * Page-table pages must be unvalidated when count is zero. The
2660 * 'free' is safe because the refcnt is non-zero and validated
2661 * bit is clear => other ops will spin or fail.
2662 */
2663 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
2664 x & ~PGT_validated)) != x) )
2665 goto again;
2666 /* We cleared the 'valid bit', so we do the clean-up. */
2667 free_page_type(page, x);
2668 /* Carry on, but with the 'valid bit' now clear. */
2669 x &= ~PGT_validated;
2670 nx &= ~PGT_validated;
2674 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
2678 int get_page_type(struct page_info *page, u32 type)
2680 u64 nx, x, y = page->u.inuse.type_info;
2682 ASSERT(!(type & ~PGT_type_mask));
2684 again:
2685 do {
2686 x = y;
2687 nx = x + 1;
2688 if ( unlikely((nx & PGT_count_mask) == 0) )
2690 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2691 return 0;
2693 else if ( unlikely((x & PGT_count_mask) == 0) )
2695 if ( (x & PGT_type_mask) != type )
2697 /*
2698 * On type change we check to flush stale TLB entries. This
2699 * may be unnecessary (e.g., page was GDT/LDT) but those
2700 * circumstances should be very rare.
2701 */
2702 cpumask_t mask =
2703 page_get_owner(page)->domain_dirty_cpumask;
2704 tlbflush_filter(mask, page->tlbflush_timestamp);
2706 if ( unlikely(!cpus_empty(mask)) )
2708 perfc_incr(need_flush_tlb_flush);
2709 flush_tlb_mask(mask);
2712 /* We lose existing type, back pointer, and validity. */
2713 nx &= ~(PGT_type_mask | PGT_validated);
2714 nx |= type;
2716 /* No special validation needed for writable pages. */
2717 /* Page tables and GDT/LDT need to be scanned for validity. */
2718 if ( type == PGT_writable_page )
2719 nx |= PGT_validated;
2722 else if ( unlikely((x & PGT_type_mask) != type) )
2724 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
2725 (type != PGT_l1_page_table) )
2726 MEM_LOG("Bad type (saw %08lx != exp %08x) "
2727 "for mfn %016lx (pfn %016lx)",
2728 x, type, page_to_mfn(page),
2729 get_gpfn_from_mfn(page_to_mfn(page)));
2730 return 0;
2732 else if ( unlikely(!(x & PGT_validated)) )
2734 /* Someone else is updating validation of this page. Wait... */
2735 while ( (y = page->u.inuse.type_info) == x )
2736 cpu_relax();
2737 goto again;
2740 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
2742 if ( unlikely(!(nx & PGT_validated)) )
2744 /* Try to validate page type; drop the new reference on failure. */
2745 if ( unlikely(!alloc_page_type(page, type)) )
2747 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
2748 ": caf=%08x taf=%" PRtype_info,
2749 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2750 type, page->count_info, page->u.inuse.type_info);
2751 /* No one else can get a reference. We hold the only ref. */
2752 page->u.inuse.type_info = 0;
2753 return 0;
2756 /* No one else is updating simultaneously. */
2757 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2760 return 1;
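/*
 * Illustrative sketch (not part of mm.c): the reference protocol the two
 * functions above implement.  A caller that relies on a page staying a
 * particular type takes a typed reference and drops it when done; the
 * hypothetical helper below shows the pairing for a writable page.
 */
static int
use_page_as_writable_example(struct page_info *page)
{
    if (!get_page_type(page, PGT_writable_page))
        return 0;                 /* wrong type or validation failed */

    /* ... the page is guaranteed to stay PGT_writable_page here ... */

    put_page_type(page);          /* may trigger free_page_type() at zero */
    return 1;
}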
2763 int memory_is_conventional_ram(paddr_t p)
2765 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
2769 long
2770 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2772 switch (op) {
2773 case XENMEM_add_to_physmap:
2775 struct xen_add_to_physmap xatp;
2776 unsigned long prev_mfn, mfn = 0, gpfn;
2777 struct domain *d;
2779 if (copy_from_guest(&xatp, arg, 1))
2780 return -EFAULT;
2782 if (xatp.domid == DOMID_SELF) {
2783 d = get_current_domain();
2785 else if (!IS_PRIV(current->domain))
2786 return -EPERM;
2787 else if ((d = get_domain_by_id(xatp.domid)) == NULL)
2788 return -ESRCH;
2790 /* This hypercall is used for VT-i domain only */
2791 if (!VMX_DOMAIN(d->vcpu[0])) {
2792 put_domain(d);
2793 return -ENOSYS;
2796 switch (xatp.space) {
2797 case XENMAPSPACE_shared_info:
2798 if (xatp.idx == 0)
2799 mfn = virt_to_mfn(d->shared_info);
2800 break;
2801 case XENMAPSPACE_grant_table:
2802 spin_lock(&d->grant_table->lock);
2804 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
2805 (xatp.idx < max_nr_grant_frames))
2806 gnttab_grow_table(d, xatp.idx + 1);
2808 if (xatp.idx < nr_grant_frames(d->grant_table))
2809 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
2811 spin_unlock(&d->grant_table->lock);
2812 break;
2813 default:
2814 break;
2817 if (mfn == 0) {
2818 put_domain(d);
2819 return -EINVAL;
2822 LOCK_BIGLOCK(d);
2824 /* Check remapping necessity */
2825 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2826 if (mfn == prev_mfn)
2827 goto out;
2829 /* Remove previously mapped page if it was present. */
2830 if (prev_mfn && mfn_valid(prev_mfn)) {
2831 if (is_xen_heap_frame(mfn_to_page(prev_mfn)))
2832 /* Xen heap frames are simply unhooked from this phys slot. */
2833 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2834 else
2835 /* Normal domain memory is freed, to avoid leaking memory. */
2836 guest_remove_page(d, xatp.gpfn);
2839 /* Unmap from old location, if any. */
2840 gpfn = get_gpfn_from_mfn(mfn);
2841 if (gpfn != INVALID_M2P_ENTRY)
2842 guest_physmap_remove_page(d, gpfn, mfn);
2844 /* Map at new location. */
2845 guest_physmap_add_page(d, xatp.gpfn, mfn);
2847 out:
2848 UNLOCK_BIGLOCK(d);
2850 put_domain(d);
2852 break;
2855 case XENMEM_machine_memory_map:
2857 struct xen_memory_map memmap;
2858 struct xen_ia64_memmap_info memmap_info;
2859 XEN_GUEST_HANDLE(char) buffer;
2861 if (!IS_PRIV(current->domain))
2862 return -EINVAL;
2863 if (copy_from_guest(&memmap, arg, 1))
2864 return -EFAULT;
2865 if (memmap.nr_entries <
2866 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size)
2867 return -EINVAL;
2869 memmap.nr_entries =
2870 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size;
2871 memset(&memmap_info, 0, sizeof(memmap_info));
2872 memmap_info.efi_memmap_size = ia64_boot_param->efi_memmap_size;
2873 memmap_info.efi_memdesc_size = ia64_boot_param->efi_memdesc_size;
2874 memmap_info.efi_memdesc_version = ia64_boot_param->efi_memdesc_version;
2876 buffer = guest_handle_cast(memmap.buffer, char);
2877 if (copy_to_guest(buffer, (char*)&memmap_info, sizeof(memmap_info)) ||
2878 copy_to_guest_offset(buffer, sizeof(memmap_info),
2879 (char*)__va(ia64_boot_param->efi_memmap),
2880 ia64_boot_param->efi_memmap_size) ||
2881 copy_to_guest(arg, &memmap, 1))
2882 return -EFAULT;
2883 return 0;
2886 default:
2887 return -ENOSYS;
2890 return 0;
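/*
 * Illustrative sketch (not part of mm.c): how a privileged guest is
 * expected to parse the buffer filled in by the XENMEM_machine_memory_map
 * handler above.  The layout is a struct xen_ia64_memmap_info header
 * followed immediately by efi_memmap_size bytes of EFI memory descriptors,
 * efi_memdesc_size apart.  The parsing helper below is hypothetical.
 */
static void
walk_machine_memmap_example(const struct xen_ia64_memmap_info *info)
{
    const char *desc = (const char *)(info + 1);
    const char *end = desc + info->efi_memmap_size;

    for (; desc < end; desc += info->efi_memdesc_size) {
        const efi_memory_desc_t *md = (const efi_memory_desc_t *)desc;
        /* md->phys_addr .. MD_END(md) describes one machine memory range. */
        (void)md;
    }
}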
2893 /*
2894 * Local variables:
2895 * mode: C
2896 * c-set-style: "BSD"
2897 * c-basic-offset: 4
2898 * tab-width: 4
2899 * indent-tabs-mode: nil
2900 * End:
2901 */