ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 15840:af03eea56697

[IA64] Add missing continuable destroy domain chunk

Signed-off-by: Akio Takebe <takebe_akio@jp.fujitsu.com>
author Alex Williamson <alex.williamson@hp.com>
date Thu Sep 06 12:57:13 2007 -0600 (2007-09-06)
parents 8b2e0de43b55
children d956779d8d47
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and operations on them which
18 * read/write the structures.
19 *
20 * - struct page_info
21 * This is a xen global resource. This structure is accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of vcpus other than the current one,
34 * so there are potential races on reads/writes of the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert domain pseudo physical address
54 * to machine address. This is per domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * replace_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations;
102 * i.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: when creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. M2P(P2M(p)) = p and P2M(M2P(m)) = m
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
118 *
119 * Here memory barrier can be achieved by release semantics.
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. A tlb insert (vcpu_itc_no_srlz()) or a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically, so there is a race here.
127 *
128 * Here the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert if it was cleared.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always takes effect after the tlb insert.
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * There is a window in which the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself:
146 * the reader remembers the value of the p2m entry it read and inserts the tlb entry,
147 * then reads the p2m entry again. If the new p2m entry value is different
148 * from the value that was used, it retries.
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu,
158 * so before reading/writing a domain page, the page's reference count
159 * must be incremented, as is done by
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time().
162 *
163 */
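/*
 * [Editorial illustration -- not part of the original changeset.]
 * A minimal sketch of the "xchg or cmpxchg case" update sequence described
 * in the NOTES above, using helpers that appear later in this file
 * (lookup_alloc_domain_pte(), set_gpfn_from_mfn(), ptep_xchg(), pte_pfn()).
 * Reference counting, TLB flushing and error handling are deliberately
 * omitted; see assign_domain_page_replace() and domain_put_page() below
 * for the real code paths.
 */
#if 0   /* illustration only */
static void
p2m_m2p_update_sketch(struct domain *d, unsigned long mpaddr,
                      unsigned long new_mfn, unsigned long prot)
{
    volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
    pte_t old_pte;

    /* 1. make the new M2P entry visible first */
    set_gpfn_from_mfn(new_mfn, mpaddr >> PAGE_SHIFT);
    smp_mb();

    /* 2. atomically install the new P2M entry; the xchg returns the
     *    old entry so that the old mfn can be recovered */
    old_pte = ptep_xchg(&d->arch.mm, mpaddr, pte,
                        pfn_pte(new_mfn, __pgprot(prot)));

    /* 3. only after the P2M update is visible (release semantics of
     *    the xchg), invalidate the old M2P entry */
    if (pte_mem(old_pte))
        set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY);
}
#endif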
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <xen/guest_access.h>
179 #include <asm/page.h>
180 #include <public/memory.h>
181 #include <asm/event.h>
183 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
184 volatile pte_t* ptep, pte_t old_pte,
185 struct page_info* page);
187 extern unsigned long ia64_iobase;
189 static struct domain *dom_xen, *dom_io;
191 /*
192 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
193 * If more reserved domain ids are introduced, this might be increased.
194 */
195 #define DOMID_P2M (0x7FF8U)
196 static struct domain *dom_p2m;
198 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
199 void
200 alloc_dom_xen_and_dom_io(void)
201 {
202 /*
203 * Initialise our DOMID_XEN domain.
204 * Any Xen-heap pages that we will allow to be mapped will have
205 * their domain field set to dom_xen.
206 */
207 dom_xen = alloc_domain(DOMID_XEN);
208 BUG_ON(dom_xen == NULL);
210 /*
211 * Initialise our DOMID_IO domain.
212 * This domain owns I/O pages that are within the range of the page_info
213 * array. Mappings occur at the privilege level of the caller.
214 */
215 dom_io = alloc_domain(DOMID_IO);
216 BUG_ON(dom_io == NULL);
217 }
219 static int
220 mm_teardown_can_skip(struct domain* d, unsigned long offset)
221 {
222 return d->arch.mm_teardown_offset > offset;
223 }
225 static void
226 mm_teardown_update_offset(struct domain* d, unsigned long offset)
227 {
228 d->arch.mm_teardown_offset = offset;
229 }
231 static void
232 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
233 {
234 pte_t old_pte;
235 unsigned long mfn;
236 struct page_info* page;
238 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
240 // vmx domains use bits [58:56] to distinguish an io region from memory.
241 // see vmx_build_physmap_table() in vmx_init.c
242 if (!pte_mem(old_pte))
243 return;
245 // domain might map IO space or acpi table pages. check it.
246 mfn = pte_pfn(old_pte);
247 if (!mfn_valid(mfn))
248 return;
249 page = mfn_to_page(mfn);
250 BUG_ON(page_get_owner(page) == NULL);
252 // The struct page_info corresponding to mfn may or may not exist depending
253 // on CONFIG_VIRTUAL_FRAME_TABLE.
254 // The above check is too simplistic.
255 // The right way is to check whether this page belongs to an io area or to acpi pages.
257 if (pte_pgc_allocated(old_pte)) {
258 BUG_ON(page_get_owner(page) != d);
259 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
260 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
261 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
262 put_page(page);
263 } else {
264 put_page(page);
265 }
266 }
268 static int
269 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
270 {
271 unsigned long i;
272 volatile pte_t* pte = pte_offset_map(pmd, offset);
274 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
275 unsigned long cur_offset = offset + (i << PAGE_SHIFT);
276 if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
277 continue;
278 if (!pte_present(*pte)) { // acquire semantics
279 mm_teardown_update_offset(d, cur_offset);
280 continue;
281 }
282 mm_teardown_update_offset(d, cur_offset);
283 mm_teardown_pte(d, pte, cur_offset);
284 if (hypercall_preempt_check())
285 return -EAGAIN;
286 }
287 return 0;
288 }
290 static int
291 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
292 {
293 unsigned long i;
294 volatile pmd_t *pmd = pmd_offset(pud, offset);
296 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
297 unsigned long cur_offset = offset + (i << PMD_SHIFT);
298 if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
299 continue;
300 if (!pmd_present(*pmd)) { // acquire semantics
301 mm_teardown_update_offset(d, cur_offset);
302 continue;
303 }
304 if (mm_teardown_pmd(d, pmd, cur_offset))
305 return -EAGAIN;
306 }
307 return 0;
308 }
310 static int
311 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
312 {
313 unsigned long i;
314 volatile pud_t *pud = pud_offset(pgd, offset);
316 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
317 unsigned long cur_offset = offset + (i << PUD_SHIFT);
318 #ifndef __PAGETABLE_PUD_FOLDED
319 if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
320 continue;
321 #endif
322 if (!pud_present(*pud)) { // acquire semantics
323 #ifndef __PAGETABLE_PUD_FOLDED
324 mm_teardown_update_offset(d, cur_offset);
325 #endif
326 continue;
327 }
328 if (mm_teardown_pud(d, pud, cur_offset))
329 return -EAGAIN;
330 }
331 return 0;
332 }
334 int
335 mm_teardown(struct domain* d)
336 {
337 struct mm_struct* mm = &d->arch.mm;
338 unsigned long i;
339 volatile pgd_t* pgd;
341 if (mm->pgd == NULL)
342 return 0;
344 pgd = pgd_offset(mm, 0);
345 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
346 unsigned long cur_offset = i << PGDIR_SHIFT;
348 if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
349 continue;
350 if (!pgd_present(*pgd)) { // acquire semantics
351 mm_teardown_update_offset(d, cur_offset);
352 continue;
353 }
354 if (mm_teardown_pgd(d, pgd, cur_offset))
355 return -EAGAIN;
356 }
357 return 0;
358 }
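/*
 * [Editorial illustration -- not part of the original changeset.]
 * mm_teardown() above is "continuable": it records its progress in
 * d->arch.mm_teardown_offset (via mm_teardown_update_offset()) and returns
 * -EAGAIN when hypercall_preempt_check() fires, so that a later call can
 * skip the already-processed range via mm_teardown_can_skip().  A caller
 * in the domain-destroy path would drive it roughly as below; the function
 * name here is hypothetical.
 */
#if 0   /* illustration only */
static int
relinquish_mm_sketch(struct domain *d)
{
    int rc = mm_teardown(d);

    if (rc == -EAGAIN)
        return rc;  /* ask the caller to schedule a continuation and retry */

    /* ... continue with the remaining domain-destruction work ... */
    return 0;
}
#endif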
360 static void
361 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
362 unsigned long offset)
363 {
364 pte_free_kernel(pte_offset_map(pmd, offset));
365 }
367 static void
368 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
369 unsigned long offset)
370 {
371 unsigned long i;
372 volatile pmd_t *pmd = pmd_offset(pud, offset);
374 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
375 if (!pmd_present(*pmd))
376 continue;
377 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
378 }
379 pmd_free(pmd_offset(pud, offset));
380 }
382 static void
383 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
384 unsigned long offset)
385 {
386 unsigned long i;
387 volatile pud_t *pud = pud_offset(pgd, offset);
389 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
390 if (!pud_present(*pud))
391 continue;
392 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
393 }
394 pud_free(pud_offset(pgd, offset));
395 }
397 static void
398 mm_p2m_teardown(struct domain* d)
399 {
400 struct mm_struct* mm = &d->arch.mm;
401 unsigned long i;
402 volatile pgd_t* pgd;
404 BUG_ON(mm->pgd == NULL);
405 pgd = pgd_offset(mm, 0);
406 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
407 if (!pgd_present(*pgd))
408 continue;
409 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
410 }
411 pgd_free(mm->pgd);
412 mm->pgd = NULL;
413 }
415 void
416 mm_final_teardown(struct domain* d)
417 {
418 if (d->arch.shadow_bitmap != NULL) {
419 xfree(d->arch.shadow_bitmap);
420 d->arch.shadow_bitmap = NULL;
421 }
422 mm_p2m_teardown(d);
423 }
425 unsigned long
426 domain_get_maximum_gpfn(struct domain *d)
427 {
428 return (d->arch.convmem_end + PAGE_SIZE - 1) >> PAGE_SHIFT;
429 }
431 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
432 void
433 share_xen_page_with_guest(struct page_info *page,
434 struct domain *d, int readonly)
435 {
436 if ( page_get_owner(page) == d )
437 return;
439 #if 1
440 if (readonly) {
441 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
442 }
443 #endif
445 // alloc_xenheap_pages() doesn't initialize page owner.
446 //BUG_ON(page_get_owner(page) != NULL);
448 spin_lock(&d->page_alloc_lock);
450 #ifndef __ia64__
451 /* The incremented type count pins as writable or read-only. */
452 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
453 page->u.inuse.type_info |= PGT_validated | 1;
454 #endif
456 page_set_owner(page, d);
457 wmb(); /* install valid domain ptr before updating refcnt. */
458 ASSERT(page->count_info == 0);
460 /* Only add to the allocation list if the domain isn't dying. */
461 if ( !d->is_dying )
462 {
463 page->count_info |= PGC_allocated | 1;
464 if ( unlikely(d->xenheap_pages++ == 0) )
465 get_knownalive_domain(d);
466 list_add_tail(&page->list, &d->xenpage_list);
467 }
469 // grant_table_destroy() releases these pages,
470 // but it doesn't clear their m2p entries, so stale
471 // entries might remain. Such a stale entry is cleared here.
472 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
474 spin_unlock(&d->page_alloc_lock);
475 }
477 void
478 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
479 {
480 share_xen_page_with_guest(page, dom_xen, readonly);
481 }
483 unsigned long
484 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
485 {
486 unsigned long pte;
488 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
489 if (!pte) {
490 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
491 }
492 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
493 }
495 // given a domain virtual address, pte and pagesize, extract the metaphysical
496 // address, convert the pte to a physical address for the (possibly different)
497 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
498 // current->arch.vhpt_pg_shift!)
499 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* itir,
500 struct p2m_entry* entry)
501 {
502 struct domain *d = current->domain;
503 ia64_itir_t _itir = {.itir = itir__};
504 u64 mask, mpaddr, pteval2;
505 u64 arflags;
506 u64 arflags2;
507 u64 maflags2;
508 u64 ps;
510 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
512 // FIXME address had better be pre-validated on insert
513 mask = ~itir_mask(_itir.itir);
514 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
515 ps = current->arch.vhpt_pg_shift ? current->arch.vhpt_pg_shift :
516 PAGE_SHIFT;
518 if (_itir.ps > ps)
519 _itir.ps = ps;
521 ((ia64_itir_t*)itir)->itir = _itir.itir;/* Copy the whole register. */
522 ((ia64_itir_t*)itir)->ps = _itir.ps; /* Overwrite ps part! */
524 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
525 if (ps < PAGE_SHIFT)
526 pteval2 |= address & (PAGE_SIZE - 1) & ~((1L << ps) - 1);
528 /* Check access rights. */
529 arflags = pteval & _PAGE_AR_MASK;
530 arflags2 = pteval2 & _PAGE_AR_MASK;
531 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
532 #if 0
533 dprintk(XENLOG_WARNING,
534 "%s:%d "
535 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
536 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
537 __func__, __LINE__,
538 pteval, arflags, address, itir__,
539 pteval2, arflags2, mpaddr);
540 #endif
541 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
542 }
544 /* Check memory attribute. The switch is on the *requested* memory
545 attribute. */
546 maflags2 = pteval2 & _PAGE_MA_MASK;
547 switch (pteval & _PAGE_MA_MASK) {
548 case _PAGE_MA_NAT:
549 /* NaT pages are always accepted! */
550 break;
551 case _PAGE_MA_UC:
552 case _PAGE_MA_UCE:
553 case _PAGE_MA_WC:
554 if (maflags2 == _PAGE_MA_WB) {
555 /* Don't let domains WB-map uncached addresses.
556 This can happen when domU tries to touch i/o
557 port space. Also prevents possible address
558 aliasing issues. */
559 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) {
560 u64 ucwb;
562 /*
563 * If dom0 page has both UC & WB attributes
564 * don't warn about attempted UC access.
565 */
566 ucwb = efi_mem_attribute(mpaddr, PAGE_SIZE);
567 ucwb &= EFI_MEMORY_UC | EFI_MEMORY_WB;
568 ucwb ^= EFI_MEMORY_UC | EFI_MEMORY_WB;
570 if (d != dom0 || ucwb != 0)
571 gdprintk(XENLOG_WARNING, "Warning: UC"
572 " to WB for mpaddr=%lx\n",
573 mpaddr);
574 }
575 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
576 }
577 break;
578 case _PAGE_MA_WB:
579 if (maflags2 != _PAGE_MA_WB) {
580 /* Forbid non-coherent access to coherent memory. */
581 panic_domain(NULL, "try to use WB mem attr on "
582 "UC page, mpaddr=%lx\n", mpaddr);
583 }
584 break;
585 default:
586 panic_domain(NULL, "try to use unknown mem attribute\n");
587 }
589 /* If shadow mode is enabled, virtualize dirty bit. */
590 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
591 u64 mp_page = mpaddr >> PAGE_SHIFT;
592 pteval |= _PAGE_VIRT_D;
594 /* If the page is not already dirty, don't set the dirty bit! */
595 if (mp_page < d->arch.shadow_bitmap_size * 8
596 && !test_bit(mp_page, d->arch.shadow_bitmap))
597 pteval &= ~_PAGE_D;
598 }
600 /* Ignore non-addr bits of pteval2 and force PL0->1
601 (PL3 is unaffected) */
602 return (pteval & ~(_PAGE_PPN_MASK | _PAGE_PL_MASK)) |
603 (pteval2 & _PAGE_PPN_MASK) |
604 (vcpu_pl_adjust(pteval, 7) & _PAGE_PL_MASK);
605 }
607 // given a current domain metaphysical address, return the physical address
608 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
609 struct p2m_entry* entry)
610 {
611 unsigned long pteval;
613 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
614 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
615 }
617 //XXX !xxx_present() should be used instead of !xxx_none()?
618 // pud, pmd and pte pages are zero-cleared when they are allocated.
619 // Their contents must be visible before population, so
620 // cmpxchg must have release semantics.
621 static volatile pte_t*
622 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
623 {
624 struct mm_struct *mm = &d->arch.mm;
625 volatile pgd_t *pgd;
626 volatile pud_t *pud;
627 volatile pmd_t *pmd;
629 BUG_ON(mm->pgd == NULL);
631 pgd = pgd_offset(mm, mpaddr);
632 again_pgd:
633 if (unlikely(pgd_none(*pgd))) { // acquire semantics
634 pud_t *old_pud = NULL;
635 pud = pud_alloc_one(mm, mpaddr);
636 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
637 pud_free(pud);
638 goto again_pgd;
639 }
640 }
642 pud = pud_offset(pgd, mpaddr);
643 again_pud:
644 if (unlikely(pud_none(*pud))) { // acquire semantics
645 pmd_t* old_pmd = NULL;
646 pmd = pmd_alloc_one(mm, mpaddr);
647 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
648 pmd_free(pmd);
649 goto again_pud;
650 }
651 }
653 pmd = pmd_offset(pud, mpaddr);
654 again_pmd:
655 if (unlikely(pmd_none(*pmd))) { // acquire semantics
656 pte_t* old_pte = NULL;
657 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
658 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
659 pte_free_kernel(pte);
660 goto again_pmd;
661 }
662 }
664 return pte_offset_map(pmd, mpaddr);
665 }
667 //XXX xxx_none() should be used instead of !xxx_present()?
668 volatile pte_t*
669 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
670 {
671 struct mm_struct *mm = &d->arch.mm;
672 volatile pgd_t *pgd;
673 volatile pud_t *pud;
674 volatile pmd_t *pmd;
676 BUG_ON(mm->pgd == NULL);
677 pgd = pgd_offset(mm, mpaddr);
678 if (unlikely(!pgd_present(*pgd))) // acquire semantics
679 return NULL;
681 pud = pud_offset(pgd, mpaddr);
682 if (unlikely(!pud_present(*pud))) // acquire semantics
683 return NULL;
685 pmd = pmd_offset(pud, mpaddr);
686 if (unlikely(!pmd_present(*pmd))) // acquire semantics
687 return NULL;
689 return pte_offset_map(pmd, mpaddr);
690 }
692 static volatile pte_t*
693 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
694 {
695 struct mm_struct *mm = &d->arch.mm;
696 volatile pgd_t *pgd;
697 volatile pud_t *pud;
698 volatile pmd_t *pmd;
700 BUG_ON(mm->pgd == NULL);
701 pgd = pgd_offset(mm, mpaddr);
702 if (unlikely(pgd_none(*pgd))) // acquire semantics
703 return NULL;
705 pud = pud_offset(pgd, mpaddr);
706 if (unlikely(pud_none(*pud))) // acquire semantics
707 return NULL;
709 pmd = pmd_offset(pud, mpaddr);
710 if (unlikely(pmd_none(*pmd))) // acquire semantics
711 return NULL;
713 return pte_offset_map(pmd, mpaddr);
714 }
716 unsigned long
717 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
718 {
719 volatile pte_t *pte;
721 pte = lookup_noalloc_domain_pte(d, mpaddr);
722 if (pte == NULL)
723 return INVALID_MFN;
725 if (pte_present(*pte))
726 return (pte->pte & _PFN_MASK);
727 else if (VMX_DOMAIN(d->vcpu[0]))
728 return GPFN_INV_MASK;
729 return INVALID_MFN;
730 }
732 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
733 struct p2m_entry* entry)
734 {
735 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
737 if (pte != NULL) {
738 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
739 if (pte_present(tmp_pte)) {
740 if (entry != NULL)
741 p2m_entry_set(entry, pte, tmp_pte);
742 return pte_val(tmp_pte);
743 } else if (VMX_DOMAIN(d->vcpu[0]))
744 return GPFN_INV_MASK;
745 }
747 if (mpaddr < d->arch.convmem_end && !d->is_dying) {
748 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
749 "d %"PRId16" 0x%lx (< 0x%lx)\n",
750 current->vcpu_id, PSCB(current, iip),
751 d->domain_id, mpaddr, d->arch.convmem_end);
752 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
753 /* Log I/O port probing, but complain less loudly about it */
754 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
755 "d %"PRId16" 0x%lx\n",
756 current->vcpu_id, PSCB(current, iip), d->domain_id,
757 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
758 } else {
759 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
760 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
761 current->vcpu_id, PSCB(current, iip),
762 d->domain_id, mpaddr, d->arch.convmem_end);
763 }
765 if (entry != NULL)
766 p2m_entry_set(entry, NULL, __pte(0));
767 //XXX This is a workaround until emulation of memory accesses to a region
768 // where no memory or device is attached is implemented.
769 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_PRIV |
770 _PAGE_AR_RWX)));
771 }
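/*
 * [Editorial illustration -- not part of the original changeset.]
 * Sketch of the seqlock-like retry protocol from the NOTES section
 * ("races between p2m entry update and tlb insert"): remember the p2m
 * entry via the 'entry' argument of lookup_domain_mpa(), use the value,
 * then re-check the entry and retry if it changed underneath us.  The
 * name of the re-check helper (p2m_entry_retry(), expected in
 * asm/p2m_entry.h) is an assumption of this sketch.
 */
#if 0   /* illustration only */
static unsigned long
lookup_with_retry_sketch(struct domain *d, unsigned long mpaddr)
{
    struct p2m_entry entry;
    unsigned long pteval;

 again:
    pteval = lookup_domain_mpa(d, mpaddr, &entry);
    /* ... use pteval, e.g. insert a TLB entry for it ... */
    if (p2m_entry_retry(&entry))    /* p2m entry changed: redo the work */
        goto again;
    return pteval;
}
#endif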
773 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
774 #if 1
775 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
776 {
777 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
778 unsigned long imva;
780 pte &= _PAGE_PPN_MASK;
781 imva = (unsigned long) __va(pte);
782 imva |= mpaddr & ~PAGE_MASK;
783 return (void*)imva;
784 }
785 #else
786 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
787 {
788 unsigned long imva = __gpa_to_mpa(d, mpaddr);
790 return (void *)__va(imva);
791 }
792 #endif
794 unsigned long
795 paddr_to_maddr(unsigned long paddr)
796 {
797 struct vcpu *v = current;
798 struct domain *d = v->domain;
799 u64 pa;
801 pa = ____lookup_domain_mpa(d, paddr);
802 if (pa == INVALID_MFN) {
803 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
804 __func__, paddr, vcpu_regs(v)->cr_iip);
805 return 0;
806 }
807 return (pa & _PFN_MASK) | (paddr & ~PAGE_MASK);
808 }
810 /* Allocate a new page for domain and map it to the specified metaphysical
811 address. */
812 static struct page_info *
813 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
814 volatile pte_t* pte)
815 {
816 struct page_info *p;
817 unsigned long maddr;
819 BUG_ON(!pte_none(*pte));
821 p = alloc_domheap_page(d);
822 if (unlikely(!p)) {
823 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
824 return(p);
825 }
827 // zero out pages for security reasons
828 clear_page(page_to_virt(p));
829 maddr = page_to_maddr (p);
830 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
831 && maddr < __get_cpu_var(vhpt_pend))) {
832 /* FIXME: how can this happen ?
833 vhpt is allocated by alloc_domheap_page. */
834 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
835 maddr);
836 }
838 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
839 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
840 // because set_pte_rel() has release semantics
841 set_pte_rel(pte,
842 pfn_pte(maddr >> PAGE_SHIFT,
843 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
844 _PAGE_PL_PRIV | _PAGE_AR_RWX)));
846 smp_mb();
847 return p;
848 }
850 struct page_info *
851 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
852 {
853 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
855 if (!pte_none(*pte))
856 return NULL;
858 return __assign_new_domain_page(d, mpaddr, pte);
859 }
861 void __init
862 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
863 {
864 volatile pte_t *pte;
866 BUG_ON(d != dom0);
867 pte = lookup_alloc_domain_pte(d, mpaddr);
868 if (pte_none(*pte)) {
869 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
870 if (p == NULL) {
871 panic("%s: can't allocate page for dom0\n", __func__);
872 }
873 }
874 }
876 static unsigned long
877 flags_to_prot (unsigned long flags)
878 {
879 unsigned long res = _PAGE_PL_PRIV | __DIRTY_BITS;
881 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
882 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
883 #ifdef CONFIG_XEN_IA64_TLB_TRACK
884 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
885 #endif
886 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
888 return res;
889 }
891 /* map a physical address to the specified metaphysical addr */
892 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
893 // This is called by assign_domain_mmio_page(),
894 // so access to the pte is racy.
895 int
896 __assign_domain_page(struct domain *d,
897 unsigned long mpaddr, unsigned long physaddr,
898 unsigned long flags)
899 {
900 volatile pte_t *pte;
901 pte_t old_pte;
902 pte_t new_pte;
903 pte_t ret_pte;
904 unsigned long prot = flags_to_prot(flags);
906 pte = lookup_alloc_domain_pte(d, mpaddr);
908 old_pte = __pte(0);
909 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
910 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
911 if (pte_val(ret_pte) == pte_val(old_pte)) {
912 smp_mb();
913 return 0;
914 }
916 // dom0 tried to map a real machine I/O region, but failed.
917 // It is very likely that dom0 won't boot correctly because
918 // it can't access I/O, so complain here.
919 if (flags & ASSIGN_nocache) {
920 int warn = 0;
922 if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT))
923 warn = 1;
924 else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) {
925 u32 type;
926 u64 attr;
928 warn = 1;
930 /*
931 * See
932 * complete_dom0_memmap()
933 * case EFI_RUNTIME_SERVICES_CODE:
934 * case EFI_RUNTIME_SERVICES_DATA:
935 * case EFI_ACPI_RECLAIM_MEMORY:
936 * case EFI_ACPI_MEMORY_NVS:
937 * case EFI_RESERVED_TYPE:
938 *
939 * Currently only EFI_RUNTIME_SERVICES_CODE is found
940 * so that we suppress only EFI_RUNTIME_SERVICES_CODE case.
941 */
942 type = efi_mem_type(physaddr);
943 attr = efi_mem_attributes(physaddr);
944 if (type == EFI_RUNTIME_SERVICES_CODE &&
945 (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB))
946 warn = 0;
947 }
948 if (warn)
949 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
950 "\talready assigned pte_val 0x%016lx\n"
951 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
952 __func__, __LINE__,
953 d, d->domain_id, pte_val(ret_pte),
954 mpaddr, physaddr, flags);
955 }
957 return -EAGAIN;
958 }
960 /* get_page() and map a physical address to the specified metaphysical addr */
961 void
962 assign_domain_page(struct domain *d,
963 unsigned long mpaddr, unsigned long physaddr)
964 {
965 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
967 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
968 BUG_ON(page->count_info != (PGC_allocated | 1));
969 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
970 // because __assign_domain_page() uses set_pte_rel() which has
971 // release semantics, smp_mb() isn't needed.
972 (void)__assign_domain_page(d, mpaddr, physaddr,
973 ASSIGN_writable | ASSIGN_pgc_allocated);
974 }
976 int
977 ioports_permit_access(struct domain *d, unsigned int fp, unsigned int lp)
978 {
979 struct io_space *space;
980 unsigned long mmio_start, mmio_end, mach_start;
981 int ret;
983 if (IO_SPACE_NR(fp) >= num_io_spaces) {
984 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
985 return -EFAULT;
986 }
988 /*
989 * The ioport_cap rangeset tracks the I/O port address including
990 * the port space ID. This means port space IDs need to match
991 * between Xen and dom0. This is also a requirement because
992 * the hypercall to pass these port ranges only uses a u32.
993 *
994 * NB - non-dom0 driver domains may only have a subset of the
995 * I/O port spaces and thus will number port spaces differently.
996 * This is ok, they don't make use of this interface.
997 */
998 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
999 if (ret != 0)
1000 return ret;
1002 space = &io_space[IO_SPACE_NR(fp)];
1004 /* Legacy I/O on dom0 is already setup */
1005 if (d == dom0 && space == &io_space[0])
1006 return 0;
1008 fp = IO_SPACE_PORT(fp);
1009 lp = IO_SPACE_PORT(lp);
1011 if (space->sparse) {
1012 mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
1013 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
1014 } else {
1015 mmio_start = fp & ~PAGE_MASK;
1016 mmio_end = PAGE_ALIGN(lp);
1017 }
1019 /*
1020 * The "machine first port" is not necessarily identity mapped
1021 * to the guest first port. At least for the legacy range.
1022 */
1023 mach_start = mmio_start | __pa(space->mmio_base);
1025 if (space == &io_space[0]) {
1026 mmio_start |= IO_PORTS_PADDR;
1027 mmio_end |= IO_PORTS_PADDR;
1028 } else {
1029 mmio_start |= __pa(space->mmio_base);
1030 mmio_end |= __pa(space->mmio_base);
1031 }
1033 while (mmio_start <= mmio_end) {
1034 (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache);
1035 mmio_start += PAGE_SIZE;
1036 mach_start += PAGE_SIZE;
1037 }
1039 return 0;
1040 }
1042 static int
1043 ioports_has_allowed(struct domain *d, unsigned int fp, unsigned int lp)
1044 {
1045 for (; fp < lp; fp++)
1046 if (rangeset_contains_singleton(d->arch.ioport_caps, fp))
1047 return 1;
1049 return 0;
1050 }
1052 int
1053 ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
1054 {
1055 int ret;
1056 struct mm_struct *mm = &d->arch.mm;
1057 unsigned long mmio_start, mmio_end, mmio_base;
1058 unsigned int fp_base, lp_base;
1059 struct io_space *space;
1061 if (IO_SPACE_NR(fp) >= num_io_spaces) {
1062 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
1063 return -EFAULT;
1064 }
1066 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
1067 if (ret != 0)
1068 return ret;
1070 space = &io_space[IO_SPACE_NR(fp)];
1071 fp_base = IO_SPACE_PORT(fp);
1072 lp_base = IO_SPACE_PORT(lp);
1074 if (space->sparse) {
1075 mmio_start = IO_SPACE_SPARSE_ENCODING(fp_base) & ~PAGE_MASK;
1076 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp_base));
1077 } else {
1078 mmio_start = fp_base & ~PAGE_MASK;
1079 mmio_end = PAGE_ALIGN(lp_base);
1080 }
1082 if (space == &io_space[0] && d != dom0)
1083 mmio_base = IO_PORTS_PADDR;
1084 else
1085 mmio_base = __pa(space->mmio_base);
1087 for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
1088 unsigned int port, range;
1089 unsigned long mpaddr;
1090 volatile pte_t *pte;
1091 pte_t old_pte;
1093 if (space->sparse) {
1094 port = IO_SPACE_SPARSE_DECODING(mmio_start);
1095 range = IO_SPACE_SPARSE_PORTS_PER_PAGE - 1;
1096 } else {
1097 port = mmio_start;
1098 range = PAGE_SIZE - 1;
1099 }
1101 port |= IO_SPACE_BASE(IO_SPACE_NR(fp));
1103 if (port < fp || port + range > lp) {
1104 /* Maybe this covers an allowed port. */
1105 if (ioports_has_allowed(d, port, port + range))
1106 continue;
1107 }
1109 mpaddr = mmio_start | mmio_base;
1110 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1111 BUG_ON(pte == NULL);
1112 BUG_ON(pte_none(*pte));
1114 /* clear pte */
1115 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1116 }
1117 domain_flush_vtlb_all(d);
1118 return 0;
1119 }
1121 static void
1122 assign_domain_same_page(struct domain *d,
1123 unsigned long mpaddr, unsigned long size,
1124 unsigned long flags)
1126 //XXX optimization
1127 unsigned long end = PAGE_ALIGN(mpaddr + size);
1128 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
1129 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
1133 int
1134 efi_mmio(unsigned long physaddr, unsigned long size)
1136 void *efi_map_start, *efi_map_end;
1137 u64 efi_desc_size;
1138 void* p;
1140 efi_map_start = __va(ia64_boot_param->efi_memmap);
1141 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1142 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1144 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1145 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1146 unsigned long start = md->phys_addr;
1147 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1149 if (start <= physaddr && physaddr < end) {
1150 if ((physaddr + size) > end) {
1151 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1152 __func__, physaddr, size);
1153 return 0;
1156 // for io space
1157 if (md->type == EFI_MEMORY_MAPPED_IO ||
1158 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1159 return 1;
1162 // for runtime
1163 // see efi_enter_virtual_mode(void)
1164 // in linux/arch/ia64/kernel/efi.c
1165 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1166 !(md->attribute & EFI_MEMORY_WB)) {
1167 return 1;
1170 return 0;
1173 if (physaddr < start) {
1174 break;
1178 return 1;
1181 unsigned long
1182 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1183 unsigned long phys_addr, unsigned long size,
1184 unsigned long flags)
1186 unsigned long addr = mpaddr & PAGE_MASK;
1187 unsigned long end = PAGE_ALIGN(mpaddr + size);
1189 if (size == 0) {
1190 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1191 __func__, d, mpaddr, size);
1193 if (!efi_mmio(mpaddr, size)) {
1194 #ifndef NDEBUG
1195 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1196 __func__, d, mpaddr, size);
1197 #endif
1198 return -EINVAL;
1201 for (phys_addr &= PAGE_MASK; addr < end;
1202 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1203 __assign_domain_page(d, addr, phys_addr, flags);
1206 return mpaddr;
1209 unsigned long
1210 assign_domain_mach_page(struct domain *d,
1211 unsigned long mpaddr, unsigned long size,
1212 unsigned long flags)
1214 BUG_ON(flags & ASSIGN_pgc_allocated);
1215 assign_domain_same_page(d, mpaddr, size, flags);
1216 return mpaddr;
1219 static void
1220 adjust_page_count_info(struct page_info* page)
1222 struct domain* d = page_get_owner(page);
1223 BUG_ON((page->count_info & PGC_count_mask) != 1);
1224 if (d != NULL) {
1225 int ret = get_page(page, d);
1226 BUG_ON(ret == 0);
1227 } else {
1228 u64 x, nx, y;
1230 y = *((u64*)&page->count_info);
1231 do {
1232 x = y;
1233 nx = x + 1;
1235 BUG_ON((x >> 32) != 0);
1236 BUG_ON((nx & PGC_count_mask) != 2);
1237 y = cmpxchg((u64*)&page->count_info, x, nx);
1238 } while (unlikely(y != x));
1242 static void
1243 domain_put_page(struct domain* d, unsigned long mpaddr,
1244 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1246 unsigned long mfn = pte_pfn(old_pte);
1247 struct page_info* page = mfn_to_page(mfn);
1249 if (pte_pgc_allocated(old_pte)) {
1250 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1251 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1252 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1253 } else {
1254 BUG();
1257 if (likely(clear_PGC_allocate)) {
1258 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1259 BUG();
1260 /* put_page() is done by domain_page_flush_and_put() */
1261 } else {
1262 // In this case, the page reference count mustn't be touched.
1263 // domain_page_flush_and_put() decrements it, so we increment
1264 // it in advance. This path is a slow path.
1265 //
1266 // guest_remove_page(): owner = d, count_info = 1
1267 // memory_exchange(): owner = NULL, count_info = 1
1268 adjust_page_count_info(page);
1271 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1274 // The caller must get_page(mfn_to_page(mfn)) before calling.
1275 // The caller must call set_gpfn_from_mfn() beforehand if necessary;
1276 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1277 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1278 // flags: ASSIGN_xxx
1279 static void
1280 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1281 unsigned long mfn, unsigned long flags)
1283 struct mm_struct *mm = &d->arch.mm;
1284 volatile pte_t* pte;
1285 pte_t old_pte;
1286 pte_t npte;
1287 unsigned long prot = flags_to_prot(flags);
1289 pte = lookup_alloc_domain_pte(d, mpaddr);
1291 // update pte
1292 npte = pfn_pte(mfn, __pgprot(prot));
1293 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1294 if (pte_mem(old_pte)) {
1295 unsigned long old_mfn = pte_pfn(old_pte);
1297 // The mfn == old_mfn case can happen when a domain maps a granted page
1298 // twice with the same pseudo physical address.
1299 // It's nonsense, but allowed.
1300 // __gnttab_map_grant_ref()
1301 // => create_host_mapping()
1302 // => assign_domain_page_replace()
1303 if (mfn != old_mfn) {
1304 domain_put_page(d, mpaddr, pte, old_pte, 1);
1307 perfc_incr(assign_domain_page_replace);
1310 // The caller must get_page(new_page) before calling.
1311 // Only steal_page() calls this function.
1312 static int
1313 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1314 struct page_info* old_page,
1315 struct page_info* new_page,
1316 unsigned long flags, int clear_PGC_allocate)
1318 struct mm_struct *mm = &d->arch.mm;
1319 volatile pte_t* pte;
1320 unsigned long old_mfn;
1321 unsigned long old_prot;
1322 pte_t old_pte;
1323 unsigned long new_mfn;
1324 unsigned long new_prot;
1325 pte_t new_pte;
1326 pte_t ret_pte;
1328 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1329 pte = lookup_alloc_domain_pte(d, mpaddr);
1331 again:
1332 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1333 old_mfn = page_to_mfn(old_page);
1334 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1335 if (!pte_present(old_pte)) {
1336 gdprintk(XENLOG_INFO,
1337 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1338 __func__, pte_val(old_pte), old_prot, old_mfn);
1339 return -EINVAL;
1342 new_prot = flags_to_prot(flags);
1343 new_mfn = page_to_mfn(new_page);
1344 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1346 // update pte
1347 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1348 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1349 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1350 goto again;
1353 gdprintk(XENLOG_INFO,
1354 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1355 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1356 __func__,
1357 pte_val(old_pte), old_prot, old_mfn,
1358 pte_val(ret_pte), pte_pfn(ret_pte));
1359 return -EINVAL;
1362 BUG_ON(!pte_mem(old_pte));
1363 BUG_ON(!pte_pgc_allocated(old_pte));
1364 BUG_ON(page_get_owner(old_page) != d);
1365 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1366 BUG_ON(old_mfn == new_mfn);
1368 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1369 if (likely(clear_PGC_allocate)) {
1370 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1371 BUG();
1372 } else {
1373 int ret;
1374 // adjust count_info for domain_page_flush_and_put().
1375 // This is a slow path.
1376 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1377 BUG_ON(d == NULL);
1378 ret = get_page(old_page, d);
1379 BUG_ON(ret == 0);
1382 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1383 perfc_incr(assign_domain_pge_cmpxchg_rel);
1384 return 0;
1387 static void
1388 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1389 int clear_PGC_allocate, unsigned long mfn)
1391 struct mm_struct *mm = &d->arch.mm;
1392 volatile pte_t *pte;
1393 pte_t old_pte;
1394 struct page_info *page;
1396 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1397 if (pte == NULL)
1398 return;
1399 if (pte_none(*pte))
1400 return;
1402 if (mfn == INVALID_MFN) {
1403 // clear pte
1404 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1405 mfn = pte_pfn(old_pte);
1406 } else {
1407 unsigned long old_arflags;
1408 pte_t new_pte;
1409 pte_t ret_pte;
1411 again:
1412 // memory_exchange() calls guest_physmap_remove_page() with
1413 // a stolen page, i.e. page owner == NULL.
1414 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1415 page_get_owner(mfn_to_page(mfn)) != NULL);
1416 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1417 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1418 new_pte = __pte(0);
1420 // update pte
1421 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1422 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1423 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1424 goto again;
1427 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1428 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1429 __func__,
1430 pte_val(old_pte), old_arflags, mfn,
1431 pte_val(ret_pte), pte_pfn(ret_pte));
1432 return;
1434 BUG_ON(mfn != pte_pfn(ret_pte));
1437 page = mfn_to_page(mfn);
1438 BUG_ON((page->count_info & PGC_count_mask) == 0);
1440 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1441 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1442 perfc_incr(zap_dcomain_page_one);
1445 unsigned long
1446 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1447 unsigned int extent_order)
1449 if (extent_order != 0) {
1450 //XXX
1451 return -ENOSYS;
1454 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1455 perfc_incr(dom0vp_zap_physmap);
1456 return 0;
1459 static unsigned long
1460 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1461 unsigned long mfn_or_gmfn,
1462 unsigned long flags, domid_t domid, int is_gmfn)
1464 int error = -EINVAL;
1465 struct domain* rd;
1466 unsigned long mfn;
1468 /* Not allowed by a domain. */
1469 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1470 return -EINVAL;
1472 rd = get_domain_by_id(domid);
1473 if (unlikely(rd == NULL)) {
1474 switch (domid) {
1475 case DOMID_XEN:
1476 rd = dom_xen;
1477 break;
1478 case DOMID_IO:
1479 rd = dom_io;
1480 break;
1481 default:
1482 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1483 "gpfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1484 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1485 return -ESRCH;
1487 BUG_ON(rd == NULL);
1488 get_knownalive_domain(rd);
1491 if (unlikely(rd == d))
1492 goto out1;
1493 /*
1494 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1495 * It can be considered that their p2m conversion is p==m.
1496 */
1497 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1498 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1499 else
1500 mfn = mfn_or_gmfn;
1501 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1502 goto out1;
1504 error = 0;
1505 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1506 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1507 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1508 // don't update the m2p table because this page belongs to rd, not d.
1509 perfc_incr(dom0vp_add_physmap);
1510 out1:
1511 put_domain(rd);
1512 return error;
1515 unsigned long
1516 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1517 unsigned long flags, domid_t domid)
1519 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1522 unsigned long
1523 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1524 unsigned long gmfn, unsigned long flags,
1525 domid_t domid)
1527 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1530 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1531 static struct page_info* p2m_pte_zero_page = NULL;
1533 /* This must be called before dom0 p2m table allocation */
1534 void __init
1535 expose_p2m_init(void)
1537 pte_t* pte;
1539 /*
1540 * Initialise our DOMID_P2M domain.
1541 * This domain owns m2p table pages.
1542 */
1543 dom_p2m = alloc_domain(DOMID_P2M);
1544 BUG_ON(dom_p2m == NULL);
1545 dom_p2m->max_pages = ~0U;
1547 pte = pte_alloc_one_kernel(NULL, 0);
1548 BUG_ON(pte == NULL);
1549 smp_mb();// make contents of the page visible.
1550 p2m_pte_zero_page = virt_to_page(pte);
1553 static int
1554 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1556 int ret = get_page(page, dom_p2m);
1557 BUG_ON(ret != 1);
1558 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1559 ASSIGN_readonly);
1562 // It is possible to optimize the loop, but this isn't performance critical.
1563 unsigned long
1564 dom0vp_expose_p2m(struct domain* d,
1565 unsigned long conv_start_gpfn,
1566 unsigned long assign_start_gpfn,
1567 unsigned long expose_size, unsigned long granule_pfn)
1569 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1570 unsigned long i;
1571 volatile pte_t* conv_pte;
1572 volatile pte_t* assign_pte;
1574 if ((expose_size % PAGE_SIZE) != 0 ||
1575 (granule_pfn % PTRS_PER_PTE) != 0 ||
1576 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1577 (conv_start_gpfn % granule_pfn) != 0 ||
1578 (assign_start_gpfn % granule_pfn) != 0 ||
1579 (expose_num_pfn % granule_pfn) != 0) {
1580 gdprintk(XENLOG_INFO,
1581 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1582 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1583 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1584 return -EINVAL;
1587 if (granule_pfn != PTRS_PER_PTE) {
1588 gdprintk(XENLOG_INFO,
1589 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1590 __func__, granule_pfn, PTRS_PER_PTE);
1591 return -ENOSYS;
1594 // allocate pgd, pmd.
1595 i = conv_start_gpfn;
1596 while (i < expose_num_pfn) {
1597 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1598 PAGE_SHIFT);
1599 if (conv_pte == NULL) {
1600 i++;
1601 continue;
1604 assign_pte = lookup_alloc_domain_pte(d, (assign_start_gpfn <<
1605 PAGE_SHIFT) + i * sizeof(pte_t));
1606 if (assign_pte == NULL) {
1607 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n", __func__);
1608 return -ENOMEM;
1611 // skip to next pte page
1612 i += PTRS_PER_PTE;
1613 i &= ~(PTRS_PER_PTE - 1);
1616 // expose pte page
1617 i = 0;
1618 while (i < expose_num_pfn) {
1619 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1620 PAGE_SHIFT);
1621 if (conv_pte == NULL) {
1622 i++;
1623 continue;
1626 if (expose_p2m_page(d, (assign_start_gpfn << PAGE_SHIFT) +
1627 i * sizeof(pte_t), virt_to_page(conv_pte)) < 0) {
1628 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1629 return -EAGAIN;
1632 // skip to next pte page
1633 i += PTRS_PER_PTE;
1634 i &= ~(PTRS_PER_PTE - 1);
1637 // expose p2m_pte_zero_page
1638 for (i = 0; i < (expose_num_pfn + PTRS_PER_PTE - 1) / PTRS_PER_PTE; i++) {
1639 assign_pte = lookup_noalloc_domain_pte(d, (assign_start_gpfn + i) <<
1640 PAGE_SHIFT);
1641 if (assign_pte == NULL || pte_present(*assign_pte))
1642 continue;
1644 if (expose_p2m_page(d, (assign_start_gpfn + i) << PAGE_SHIFT,
1645 p2m_pte_zero_page) < 0) {
1646 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n", __func__);
1647 return -EAGAIN;
1651 return 0;
1653 #endif
1655 // grant table host mapping
1656 // mpaddr: host_addr: pseudo physical address
1657 // mfn: frame: machine page frame
1658 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1659 int
1660 create_grant_host_mapping(unsigned long gpaddr,
1661 unsigned long mfn, unsigned int flags)
1663 struct domain* d = current->domain;
1664 struct page_info* page;
1665 int ret;
1667 if (flags & (GNTMAP_device_map |
1668 GNTMAP_application_map | GNTMAP_contains_pte)) {
1669 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1670 return GNTST_general_error;
1673 BUG_ON(!mfn_valid(mfn));
1674 page = mfn_to_page(mfn);
1675 ret = get_page(page, page_get_owner(page));
1676 BUG_ON(ret == 0);
1677 assign_domain_page_replace(d, gpaddr, mfn,
1678 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1679 ASSIGN_tlb_track |
1680 #endif
1681 ((flags & GNTMAP_readonly) ?
1682 ASSIGN_readonly : ASSIGN_writable));
1683 perfc_incr(create_grant_host_mapping);
1684 return GNTST_okay;
1687 // grant table host unmapping
1688 int
1689 replace_grant_host_mapping(unsigned long gpaddr,
1690 unsigned long mfn, unsigned long new_gpaddr, unsigned int flags)
1692 struct domain* d = current->domain;
1693 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
1694 volatile pte_t* pte;
1695 unsigned long cur_arflags;
1696 pte_t cur_pte;
1697 pte_t new_pte = __pte(0);
1698 pte_t old_pte;
1699 struct page_info* page = mfn_to_page(mfn);
1700 struct page_info* new_page = NULL;
1701 volatile pte_t* new_page_pte = NULL;
1703 if (new_gpaddr) {
1704 new_page_pte = lookup_noalloc_domain_pte_none(d, new_gpaddr);
1705 if (likely(new_page_pte != NULL)) {
1706 new_pte = ptep_get_and_clear(&d->arch.mm,
1707 new_gpaddr, new_page_pte);
1708 if (likely(pte_present(new_pte))) {
1709 unsigned long new_page_mfn;
1710 struct domain* page_owner;
1712 new_page_mfn = pte_pfn(new_pte);
1713 new_page = mfn_to_page(new_page_mfn);
1714 page_owner = page_get_owner(new_page);
1715 if (unlikely(page_owner == NULL)) {
1716 gdprintk(XENLOG_INFO,
1717 "%s: page_owner == NULL "
1718 "gpaddr 0x%lx mfn 0x%lx "
1719 "new_gpaddr 0x%lx mfn 0x%lx\n",
1720 __func__, gpaddr, mfn, new_gpaddr, new_page_mfn);
1721 new_page = NULL; /* prevent domain_put_page() */
1722 goto out;
1725 /*
1726 * domain_put_page(clear_PGC_allocate = 0)
1727 * doesn't decrement the refcount of a page with
1728 * pte_pgc_allocated() = 1. Be careful.
1729 */
1730 if (unlikely(!pte_pgc_allocated(new_pte))) {
1731 /* domain_put_page() decrements page refcount. adjust it. */
1732 if (get_page(new_page, page_owner)) {
1733 gdprintk(XENLOG_INFO,
1734 "%s: get_page() failed. "
1735 "gpaddr 0x%lx mfn 0x%lx "
1736 "new_gpaddr 0x%lx mfn 0x%lx\n",
1737 __func__, gpaddr, mfn,
1738 new_gpaddr, new_page_mfn);
1739 goto out;
1742 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 0);
1743 } else
1744 new_pte = __pte(0);
1748 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1749 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1750 return GNTST_general_error;
1753 pte = lookup_noalloc_domain_pte(d, gpaddr);
1754 if (pte == NULL) {
1755 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
1756 __func__, gpaddr, mfn);
1757 goto out;
1760 again:
1761 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1762 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1763 if (!pte_present(cur_pte) ||
1764 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
1765 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1766 __func__, gpaddr, mfn, pte_val(cur_pte));
1767 goto out;
1770 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1771 if (unlikely(!pte_present(old_pte))) {
1772 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx"
1773 " cur_pte 0x%lx old_pte 0x%lx\n",
1774 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1775 goto out;
1777 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1778 if (pte_pfn(old_pte) == mfn) {
1779 goto again;
1781 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
1782 "0x%lx old_pte 0x%lx\n",
1783 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1784 goto out;
1786 BUG_ON(pte_pfn(old_pte) != mfn);
1788 /* try_to_clear_PGC_allocate(d, page) is not needed. */
1789 BUG_ON(page_get_owner(page) == d &&
1790 get_gpfn_from_mfn(mfn) == gpfn);
1791 BUG_ON(pte_pgc_allocated(old_pte));
1792 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
1794 perfc_incr(replace_grant_host_mapping);
1795 return GNTST_okay;
1797 out:
1798 if (new_page)
1799 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 1);
1800 return GNTST_general_error;
1803 // heavily depends on the struct page layout.
1804 // gnttab_transfer() calls steal_page() with memflags = 0
1805 // For grant table transfer, we must fill the page.
1806 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount
1807 // For memory exchange, we don't have to fill the page because
1808 // memory_exchange() does it.
1809 int
1810 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1812 #if 0 /* if big endian */
1813 # error "implement big endian version of steal_page()"
1814 #endif
1815 u32 _d, _nd;
1816 u64 x, nx, y;
1818 if (page_get_owner(page) != d) {
1819 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
1820 __func__, d, page_get_owner(page));
1821 return -1;
1824 if (!(memflags & MEMF_no_refcount)) {
1825 unsigned long gpfn;
1826 struct page_info *new;
1827 unsigned long new_mfn;
1828 int ret;
1830 new = alloc_domheap_page(d);
1831 if (new == NULL) {
1832 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
1833 return -1;
1835 // zero out pages for security reasons
1836 clear_page(page_to_virt(new));
1837 // assign_domain_page_cmpxchg_rel() has release semantics
1838 // so smp_mb() isn't needed.
1840 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1841 if (gpfn == INVALID_M2P_ENTRY) {
1842 free_domheap_page(new);
1843 return -1;
1845 new_mfn = page_to_mfn(new);
1846 set_gpfn_from_mfn(new_mfn, gpfn);
1847 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1848 // has release semantics.
1850 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1851 ASSIGN_writable |
1852 ASSIGN_pgc_allocated, 0);
1853 if (ret < 0) {
1854 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
1855 ret);
1856 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1857 free_domheap_page(new);
1858 return -1;
1860 perfc_incr(steal_page_refcount);
1863 spin_lock(&d->page_alloc_lock);
1865 /*
1866 * The tricky bit: atomically release ownership while there is just one
1867 * benign reference to the page (PGC_allocated). If that reference
1868 * disappears then the deallocation routine will safely spin.
1869 */
1870 _d = pickle_domptr(d);
1871 y = *((u64*)&page->count_info);
1872 do {
1873 x = y;
1874 nx = x & 0xffffffff;
1875 // page->count_info: untouched
1876 // page->u.inuse._domain = 0;
1877 _nd = x >> 32;
1879 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
1880 (1 | PGC_allocated))) ||
1881 unlikely(_nd != _d)) {
1882 struct domain* nd = unpickle_domptr(_nd);
1883 if (nd == NULL) {
1884 gdprintk(XENLOG_INFO, "gnttab_transfer: "
1885 "Bad page %p: ed=%p(%u) 0x%x, "
1886 "sd=%p 0x%x,"
1887 " caf=%016lx, taf=%" PRtype_info
1888 " memflags 0x%x\n",
1889 (void *) page_to_mfn(page),
1890 d, d->domain_id, _d,
1891 nd, _nd,
1892 x,
1893 page->u.inuse.type_info,
1894 memflags);
1895 } else {
1896 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
1897 "Bad page %p: ed=%p(%u) 0x%x, "
1898 "sd=%p(%u) 0x%x,"
1899 " caf=%016lx, taf=%" PRtype_info
1900 " memflags 0x%x\n",
1901 (void *) page_to_mfn(page),
1902 d, d->domain_id, _d,
1903 nd, nd->domain_id, _nd,
1904 x,
1905 page->u.inuse.type_info,
1906 memflags);
1908 spin_unlock(&d->page_alloc_lock);
1909 return -1;
1912 y = cmpxchg((u64*)&page->count_info, x, nx);
1913 } while (unlikely(y != x));
1915 /*
1916 * Unlink from 'd'. At least one reference remains (now anonymous), so
1917 * no one else is spinning to try to delete this page from 'd'.
1918 */
1919 if ( !(memflags & MEMF_no_refcount) )
1920 d->tot_pages--;
1921 list_del(&page->list);
1923 spin_unlock(&d->page_alloc_lock);
1924 perfc_incr(steal_page);
1925 return 0;
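/*
 * Illustrative caller sketch for the two cases described in the header
 * comment above (hypothetical code, error handling elided):
 *
 *     // grant-table transfer: steal_page() itself replaces the stolen
 *     // frame in the physmap with a freshly allocated, zeroed page
 *     if (steal_page(d, page, 0))
 *         goto fail;
 *
 *     // memory exchange: the caller repopulates the physmap afterwards,
 *     // so the replacement work is skipped here
 *     if (steal_page(d, page, MEMF_no_refcount))
 *         goto fail;
 */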
1928 void
1929 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1930 unsigned long mfn)
1932 BUG_ON(!mfn_valid(mfn));
1933 BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
1934 set_gpfn_from_mfn(mfn, gpfn);
1935 smp_mb();
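/* The barrier above orders the M2P update before the P2M update below. */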
1936 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
1937 ASSIGN_writable | ASSIGN_pgc_allocated);
1939 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1941 perfc_incr(guest_physmap_add_page);
1944 void
1945 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1946 unsigned long mfn)
1948 BUG_ON(mfn == 0);//XXX
1949 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
1950 perfc_incr(guest_physmap_remove_page);
1953 static void
1954 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
1955 volatile pte_t* ptep, pte_t old_pte,
1956 struct page_info* page)
1958 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1959 struct tlb_track_entry* entry;
1960 #endif
1962 if (shadow_mode_enabled(d))
1963 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1965 #ifndef CONFIG_XEN_IA64_TLB_TRACK
1966 //XXX sledgehammer approach:
1967 // we should flush only the affected range.
1968 domain_flush_vtlb_all(d);
1969 put_page(page);
1970 #else
1971 switch (tlb_track_search_and_remove(d->arch.tlb_track,
1972 ptep, old_pte, &entry)) {
1973 case TLB_TRACK_NOT_TRACKED:
1974 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
1975 /* This page was zapped from this domain
1976 * by a memory decrease, a memory exchange or dom0vp_zap_physmap,
1977 * i.e. either the page is being returned to xen
1978 * (balloon driver or DMA page allocation), or
1979 * a foreign-domain mapped page is being unmapped from the domain.
1980 * In the former case the page is about to be freed, so
1981 * freeing it can be deferred and batched.
1982 * In the latter case the page is unmapped, so
1983 * it needs to be flushed; to optimize this, the page
1984 * is queued and the vTLB flushed only once,
1985 * i.e. the caller must call dfree_flush() explicitly.
1986 */
1987 domain_flush_vtlb_all(d);
1988 put_page(page);
1989 break;
1990 case TLB_TRACK_NOT_FOUND:
1991 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
1992 /* This page was zapped from this domain
1993 * by a grant-table page unmap.
1994 * Luckily the domain that mapped this page never
1995 * accessed it, so we don't have to flush the vTLB;
1996 * most likely the domain only used the page for DMA.
1997 */
1998 /* do nothing */
1999 put_page(page);
2000 break;
2001 case TLB_TRACK_FOUND:
2002 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
2003 /* This page was zapped from this domain
2004 * by a grant-table page unmap.
2005 * Fortunately the page was accessed via only one virtual
2006 * address, so it is easy to flush.
2007 */
2008 domain_flush_vtlb_track_entry(d, entry);
2009 tlb_track_free_entry(d->arch.tlb_track, entry);
2010 put_page(page);
2011 break;
2012 case TLB_TRACK_MANY:
2013 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
2014 /* This page was zapped from this domain
2015 * by a grant-table page unmap.
2016 * Unfortunately the page was accessed via many virtual
2017 * addresses (or too many times via a single one),
2018 * so we gave up tracking the virtual addresses and
2019 * a full vTLB flush is necessary.
2020 */
2021 domain_flush_vtlb_all(d);
2022 put_page(page);
2023 break;
2024 case TLB_TRACK_AGAIN:
2025 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
2026 BUG();
2027 break;
2029 #endif
2030 perfc_incr(domain_page_flush_and_put);
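/*
 * Summary of the flush policy above when CONFIG_XEN_IA64_TLB_TRACK is
 * enabled (otherwise the whole vTLB is always flushed):
 *   TLB_TRACK_NOT_TRACKED: no tracking information, flush the whole vTLB.
 *   TLB_TRACK_NOT_FOUND:   no tracked entry was found, so no flush is done.
 *   TLB_TRACK_FOUND:       a single tracked entry, flush just that entry.
 *   TLB_TRACK_MANY:        too many mappings to track, flush the whole vTLB.
 *   TLB_TRACK_AGAIN:       unexpected here, BUG().
 * In every non-BUG case the reference taken when the page was mapped is
 * dropped with put_page().
 */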
2033 int
2034 domain_page_mapped(struct domain* d, unsigned long mpaddr)
2036 volatile pte_t * pte;
2038 pte = lookup_noalloc_domain_pte(d, mpaddr);
2039 if (pte != NULL && !pte_none(*pte))
2040 return 1;
2041 return 0;
2044 /* Flush cache of domain d. */
2045 void domain_cache_flush (struct domain *d, int sync_only)
2047 struct mm_struct *mm = &d->arch.mm;
2048 volatile pgd_t *pgd = mm->pgd;
2049 unsigned long maddr;
2050 int i, j, k, l;
2051 int nbr_page = 0;
2052 void (*flush_func)(unsigned long start, unsigned long end);
2053 extern void flush_dcache_range (unsigned long, unsigned long);
2055 if (sync_only)
2056 flush_func = &flush_icache_range;
2057 else
2058 flush_func = &flush_dcache_range;
2060 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
2061 volatile pud_t *pud;
2062 if (!pgd_present(*pgd)) // acquire semantics
2063 continue;
2064 pud = pud_offset(pgd, 0);
2065 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
2066 volatile pmd_t *pmd;
2067 if (!pud_present(*pud)) // acquire semantics
2068 continue;
2069 pmd = pmd_offset(pud, 0);
2070 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
2071 volatile pte_t *pte;
2072 if (!pmd_present(*pmd)) // acquire semantics
2073 continue;
2074 pte = pte_offset_map(pmd, 0);
2075 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
2076 if (!pte_present(*pte)) // acquire semantics
2077 continue;
2078 /* Convert PTE to maddr. */
2079 maddr = __va_ul (pte_val(*pte)
2080 & _PAGE_PPN_MASK);
2081 (*flush_func)(maddr, maddr + PAGE_SIZE);
2082 nbr_page++;
2087 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
2090 #ifdef VERBOSE
2091 #define MEM_LOG(_f, _a...) \
2092 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
2093 current->domain->domain_id , __LINE__ , ## _a )
2094 #else
2095 #define MEM_LOG(_f, _a...) ((void)0)
2096 #endif
2098 static void free_page_type(struct page_info *page, u32 type)
2102 static int alloc_page_type(struct page_info *page, u32 type)
2104 return 1;
2107 static int opt_p2m_xenheap;
2108 boolean_param("p2m_xenheap", opt_p2m_xenheap);
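/*
 * "p2m_xenheap" is a Xen boot command-line boolean: when set, p2m
 * page-table pages are allocated from the xenheap below rather than as
 * domheap pages owned by dom_p2m.
 */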
2110 void *pgtable_quicklist_alloc(void)
2112 void *p;
2114 BUG_ON(dom_p2m == NULL);
2115 if (!opt_p2m_xenheap) {
2116 struct page_info *page = alloc_domheap_page(dom_p2m);
2117 if (page == NULL)
2118 return NULL;
2119 p = page_to_virt(page);
2120 clear_page(p);
2121 return p;
2123 p = alloc_xenheap_pages(0);
2124 if (p) {
2125 clear_page(p);
2126 /*
2127 * This page should be read-only. At the moment the third
2128 * argument is meaningless; it should be 1 once supported.
2129 */
2130 share_xen_page_with_guest(virt_to_page(p), dom_p2m, 0);
2132 return p;
2135 void pgtable_quicklist_free(void *pgtable_entry)
2137 struct page_info* page = virt_to_page(pgtable_entry);
2139 BUG_ON(page_get_owner(page) != dom_p2m);
2140 BUG_ON(page->count_info != (1 | PGC_allocated));
2142 put_page(page);
2143 if (opt_p2m_xenheap)
2144 free_xenheap_page(pgtable_entry);
2147 void put_page_type(struct page_info *page)
2149 u64 nx, x, y = page->u.inuse.type_info;
2151 again:
2152 do {
2153 x = y;
2154 nx = x - 1;
2156 ASSERT((x & PGT_count_mask) != 0);
2158 /*
2159 * The page should always be validated while a reference is held. The
2160 * exception is during domain destruction, when we forcibly invalidate
2161 * page-table pages if we detect a referential loop.
2162 * See domain.c:relinquish_list().
2163 */
2164 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
2166 if ( unlikely((nx & PGT_count_mask) == 0) )
2168 /* Record TLB information for flush later. Races are harmless. */
2169 page->tlbflush_timestamp = tlbflush_current_time();
2171 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2172 likely(nx & PGT_validated) )
2174 /*
2175 * Page-table pages must be unvalidated when count is zero. The
2176 * 'free' is safe because the refcnt is non-zero and validated
2177 * bit is clear => other ops will spin or fail.
2178 */
2179 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
2180 x & ~PGT_validated)) != x) )
2181 goto again;
2182 /* We cleared the 'valid bit', so we must do the clean-up. */
2183 free_page_type(page, x);
2184 /* Carry on, but with the 'valid bit' now clear. */
2185 x &= ~PGT_validated;
2186 nx &= ~PGT_validated;
2190 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
2194 int get_page_type(struct page_info *page, u32 type)
2196 u64 nx, x, y = page->u.inuse.type_info;
2198 ASSERT(!(type & ~PGT_type_mask));
2200 again:
2201 do {
2202 x = y;
2203 nx = x + 1;
2204 if ( unlikely((nx & PGT_count_mask) == 0) )
2206 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2207 return 0;
2209 else if ( unlikely((x & PGT_count_mask) == 0) )
2211 if ( (x & PGT_type_mask) != type )
2213 /*
2214 * On a type change we check whether stale TLB entries need
2215 * flushing. This may be unnecessary (e.g., the page was a GDT/LDT)
2216 * but such circumstances should be very rare.
2217 */
2218 cpumask_t mask =
2219 page_get_owner(page)->domain_dirty_cpumask;
2220 tlbflush_filter(mask, page->tlbflush_timestamp);
2222 if ( unlikely(!cpus_empty(mask)) )
2224 perfc_incr(need_flush_tlb_flush);
2225 flush_tlb_mask(mask);
2228 /* We lose existing type, back pointer, and validity. */
2229 nx &= ~(PGT_type_mask | PGT_validated);
2230 nx |= type;
2232 /* No special validation needed for writable pages. */
2233 /* Page tables and GDT/LDT need to be scanned for validity. */
2234 if ( type == PGT_writable_page )
2235 nx |= PGT_validated;
2238 else if ( unlikely((x & PGT_type_mask) != type) )
2240 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
2241 (type != PGT_l1_page_table) )
2242 MEM_LOG("Bad type (saw %08lx != exp %08x) "
2243 "for mfn %016lx (pfn %016lx)",
2244 x, type, page_to_mfn(page),
2245 get_gpfn_from_mfn(page_to_mfn(page)));
2246 return 0;
2248 else if ( unlikely(!(x & PGT_validated)) )
2250 /* Someone else is updating validation of this page. Wait... */
2251 while ( (y = page->u.inuse.type_info) == x )
2252 cpu_relax();
2253 goto again;
2256 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
2258 if ( unlikely(!(nx & PGT_validated)) )
2260 /* Try to validate page type; drop the new reference on failure. */
2261 if ( unlikely(!alloc_page_type(page, type)) )
2263 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
2264 ": caf=%08x taf=%" PRtype_info,
2265 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2266 type, page->count_info, page->u.inuse.type_info);
2267 /* No one else can get a reference; we hold the only ref. */
2268 page->u.inuse.type_info = 0;
2269 return 0;
2272 /* No one else is updating simultaneously. */
2273 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2276 return 1;
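/*
 * Illustrative pairing of the two routines above (hypothetical caller,
 * error handling elided):
 *
 *     if (!get_page_type(page, PGT_writable_page))
 *         return 0;                      // type reference not obtained
 *     ...access the page as a writable page...
 *     put_page_type(page);               // drop the type reference
 */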
2279 int memory_is_conventional_ram(paddr_t p)
2281 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
2285 long
2286 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2288 switch (op) {
2289 case XENMEM_add_to_physmap:
2291 struct xen_add_to_physmap xatp;
2292 unsigned long prev_mfn, mfn = 0, gpfn;
2293 struct domain *d;
2295 if (copy_from_guest(&xatp, arg, 1))
2296 return -EFAULT;
2298 if (xatp.domid == DOMID_SELF) {
2299 d = get_current_domain();
2301 else if (!IS_PRIV(current->domain))
2302 return -EPERM;
2303 else if ((d = get_domain_by_id(xatp.domid)) == NULL)
2304 return -ESRCH;
2306 /* This hypercall is used only for VT-i domains. */
2307 if (!VMX_DOMAIN(d->vcpu[0])) {
2308 put_domain(d);
2309 return -ENOSYS;
2312 switch (xatp.space) {
2313 case XENMAPSPACE_shared_info:
2314 if (xatp.idx == 0)
2315 mfn = virt_to_mfn(d->shared_info);
2316 break;
2317 case XENMAPSPACE_grant_table:
2318 spin_lock(&d->grant_table->lock);
2320 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
2321 (xatp.idx < max_nr_grant_frames))
2322 gnttab_grow_table(d, xatp.idx + 1);
2324 if (xatp.idx < nr_grant_frames(d->grant_table))
2325 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
2327 spin_unlock(&d->grant_table->lock);
2328 break;
2329 default:
2330 break;
2333 if (mfn == 0) {
2334 put_domain(d);
2335 return -EINVAL;
2338 LOCK_BIGLOCK(d);
2340 /* Check whether remapping is necessary. */
2341 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2342 if (mfn == prev_mfn)
2343 goto out;
2345 /* Remove previously mapped page if it was present. */
2346 if (prev_mfn && mfn_valid(prev_mfn)) {
2347 if (is_xen_heap_frame(mfn_to_page(prev_mfn)))
2348 /* Xen heap frames are simply unhooked from this phys slot. */
2349 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2350 else
2351 /* Normal domain memory is freed, to avoid leaking memory. */
2352 guest_remove_page(d, xatp.gpfn);
2355 /* Unmap from old location, if any. */
2356 gpfn = get_gpfn_from_mfn(mfn);
2357 if (gpfn != INVALID_M2P_ENTRY)
2358 guest_physmap_remove_page(d, gpfn, mfn);
2360 /* Map at new location. */
2361 guest_physmap_add_page(d, xatp.gpfn, mfn);
2363 out:
2364 UNLOCK_BIGLOCK(d);
2366 put_domain(d);
2368 break;
2371 case XENMEM_machine_memory_map:
2373 struct xen_memory_map memmap;
2374 struct xen_ia64_memmap_info memmap_info;
2375 XEN_GUEST_HANDLE(char) buffer;
2377 if (!IS_PRIV(current->domain))
2378 return -EINVAL;
2379 if (copy_from_guest(&memmap, arg, 1))
2380 return -EFAULT;
2381 if (memmap.nr_entries <
2382 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size)
2383 return -EINVAL;
2385 memmap.nr_entries =
2386 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size;
2387 memset(&memmap_info, 0, sizeof(memmap_info));
2388 memmap_info.efi_memmap_size = ia64_boot_param->efi_memmap_size;
2389 memmap_info.efi_memdesc_size = ia64_boot_param->efi_memdesc_size;
2390 memmap_info.efi_memdesc_version = ia64_boot_param->efi_memdesc_version;
2392 buffer = guest_handle_cast(memmap.buffer, char);
2393 if (copy_to_guest(buffer, (char*)&memmap_info, sizeof(memmap_info)) ||
2394 copy_to_guest_offset(buffer, sizeof(memmap_info),
2395 (char*)__va(ia64_boot_param->efi_memmap),
2396 ia64_boot_param->efi_memmap_size) ||
2397 copy_to_guest(arg, &memmap, 1))
2398 return -EFAULT;
2399 return 0;
2402 default:
2403 return -ENOSYS;
2406 return 0;
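/*
 * Illustrative guest-side invocation of the XENMEM_add_to_physmap case
 * handled above (a sketch only; HYPERVISOR_memory_op is the usual guest
 * hypercall wrapper and is assumed here, not defined in this file):
 *
 *     struct xen_add_to_physmap xatp = {
 *         .domid = DOMID_SELF,
 *         .space = XENMAPSPACE_shared_info,
 *         .idx   = 0,
 *         .gpfn  = target_gpfn,          // hypothetical destination frame
 *     };
 *     rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
 */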
2409 /*
2410 * Local variables:
2411 * mode: C
2412 * c-set-style: "BSD"
2413 * c-basic-offset: 4
2414 * tab-width: 4
2415 * indent-tabs-mode: nil
2416 * End:
2417 */