ia64/xen-unstable

xen/arch/ia64/xen/mm.c @ 15826:7e79e7f01f3d

Implement ia64 continuable domain destroy.
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author kfraser@localhost.localdomain
date Fri Aug 31 15:46:37 2007 +0100 (2007-08-31)
parents cb3c7f006077
children 4ffca478e2f7
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * Some structures are accessed concurrently by multiple CPUs.
17 * Here is the list of shared structures and the operations that
18 * read/write them.
19 *
20 * - struct page_info
21 * This is a Xen global resource. This structure may be accessed by
22 * any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of vcpus other than the current one.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and the pgd, pud, pmd and pte table pages.
53 * This structure is used to convert a domain pseudo-physical address
54 * to a machine address. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * replace_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(INVALID_P2M_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * P2M entries are shared by more than one vcpu,
101 * so they are accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update a p2m entry.
103 * NOTE: when creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. P2M(M2P(p)) = p and M2P(P2M(m)) = m.
108 * The M2P table and the P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY)
118 *
119 * Here the memory barriers can be provided by release semantics. (An illustrative sketch of this sequence follows this comment block.)
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or a VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Neither a tlb insert (vcpu_itc_no_srlz()) nor a global tlb purge
125 * (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can update
126 * vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically, so there is a race here.
127 *
128 * Here the vcpu->arch.{d, i}tlb.p bit is checked:
129 * after inserting a tlb entry, check the p bit and retry the insert.
130 * This means that when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge always happens after the tlb insert.
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry is replaced by xchg or cmpxchg and
142 * the tlb cache of the page is flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry itself.
146 * The reader remembers the value of the p2m entry it read and inserts the tlb entry.
147 * Then it reads the p2m entry again. If the new p2m entry value is different
148 * from the value it used, it retries. (See the sketch following this comment block.)
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented; that is what
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do.
162 *
163 */
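/*
 * Illustrative sketch (not compiled): the update orderings described in the
 * NOTES above, spelled out as code.  The sketch_* helpers are hypothetical;
 * set_gpfn_from_mfn(), ptep_xchg(), translate_domain_pte() and the
 * p2m_entry_set()/p2m_entry_retry() pair are the primitives this file and
 * asm/p2m_entry.h provide, but the real call sites (e.g.
 * assign_domain_page_replace(), vcpu_itc_i()) differ in detail.
 */
#if 0
/* P2M/M2P update, xchg case: keep the M2P table consistent around the
 * atomic p2m update.  The memory barriers come from the release
 * semantics of ptep_xchg(). */
static void
sketch_p2m_m2p_update(struct domain *d, unsigned long mpaddr,
                      volatile pte_t *ptep, pte_t new_pte)
{
    pte_t old_pte;

    set_gpfn_from_mfn(pte_pfn(new_pte), mpaddr >> PAGE_SHIFT);
    /* barrier: provided by the release semantics of the xchg below */
    old_pte = ptep_xchg(&d->arch.mm, mpaddr, ptep, new_pte);
    /* barrier, then invalidate the stale M2P entry of the old page */
    set_gpfn_from_mfn(pte_pfn(old_pte), INVALID_M2P_ENTRY);
}

/* Sequence-lock style read of the p2m entry around a tlb insert:
 * remember the value read, insert, re-read and retry on mismatch. */
static void
sketch_tlb_insert(u64 vaddr, u64 pte, u64 itir)
{
    struct p2m_entry entry;
    u64 mpte;

    do {
        mpte = translate_domain_pte(pte, vaddr, itir, &itir, &entry);
        /* insert mpte into vcpu->arch.{d,i}tlb and the VHPT,
         * e.g. via vcpu_itc_no_srlz() */
    } while (p2m_entry_retry(&entry));  /* p2m entry changed under us */
}
#endif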
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <xen/mm.h>
170 #include <xen/errno.h>
171 #include <asm/pgalloc.h>
172 #include <asm/vhpt.h>
173 #include <asm/vcpu.h>
174 #include <asm/shadow.h>
175 #include <asm/p2m_entry.h>
176 #include <asm/tlb_track.h>
177 #include <linux/efi.h>
178 #include <xen/guest_access.h>
179 #include <asm/page.h>
180 #include <public/memory.h>
182 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
183 volatile pte_t* ptep, pte_t old_pte,
184 struct page_info* page);
186 extern unsigned long ia64_iobase;
188 static struct domain *dom_xen, *dom_io;
190 /*
191 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
192 * If more reserved domain ids are introduced, this might be increased.
193 */
194 #define DOMID_P2M (0x7FF8U)
195 static struct domain *dom_p2m;
197 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
198 void
199 alloc_dom_xen_and_dom_io(void)
200 {
201 /*
202 * Initialise our DOMID_XEN domain.
203 * Any Xen-heap pages that we will allow to be mapped will have
204 * their domain field set to dom_xen.
205 */
206 dom_xen = alloc_domain(DOMID_XEN);
207 BUG_ON(dom_xen == NULL);
209 /*
210 * Initialise our DOMID_IO domain.
211 * This domain owns I/O pages that are within the range of the page_info
212 * array. Mappings occur at the privilege level of the caller.
213 */
214 dom_io = alloc_domain(DOMID_IO);
215 BUG_ON(dom_io == NULL);
216 }
218 static int
219 mm_teardown_can_skip(struct domain* d, unsigned long offset)
220 {
221 return d->arch.mm_teardown_offset > offset;
222 }
224 static void
225 mm_teardown_update_offset(struct domain* d, unsigned long offset)
226 {
227 d->arch.mm_teardown_offset = offset;
228 }
230 static void
231 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
232 {
233 pte_t old_pte;
234 unsigned long mfn;
235 struct page_info* page;
237 old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte);// acquire semantics
239 // vmx domains use bits [58:56] to distinguish I/O regions from memory.
240 // see vmx_build_physmap_table() in vmx_init.c
241 if (!pte_mem(old_pte))
242 return;
244 // domain might map IO space or acpi table pages. check it.
245 mfn = pte_pfn(old_pte);
246 if (!mfn_valid(mfn))
247 return;
248 page = mfn_to_page(mfn);
249 BUG_ON(page_get_owner(page) == NULL);
251 // The struct page_info corresponding to mfn may or may not exist depending
252 // on CONFIG_VIRTUAL_FRAME_TABLE.
253 // The above check is too simplistic.
254 // The right way is to check whether this page belongs to an I/O area or to ACPI pages.
256 if (pte_pgc_allocated(old_pte)) {
257 BUG_ON(page_get_owner(page) != d);
258 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
259 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
260 if (test_and_clear_bit(_PGC_allocated, &page->count_info))
261 put_page(page);
262 } else {
263 put_page(page);
264 }
265 }
267 static int
268 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
269 {
270 unsigned long i;
271 volatile pte_t* pte = pte_offset_map(pmd, offset);
273 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
274 unsigned long cur_offset = offset + (i << PAGE_SHIFT);
275 if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
276 continue;
277 if (!pte_present(*pte)) { // acquire semantics
278 mm_teardown_update_offset(d, cur_offset);
279 continue;
280 }
281 mm_teardown_update_offset(d, cur_offset);
282 mm_teardown_pte(d, pte, cur_offset);
283 if (hypercall_preempt_check())
284 return -EAGAIN;
285 }
286 return 0;
287 }
289 static int
290 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
291 {
292 unsigned long i;
293 volatile pmd_t *pmd = pmd_offset(pud, offset);
295 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
296 unsigned long cur_offset = offset + (i << PMD_SHIFT);
297 if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
298 continue;
299 if (!pmd_present(*pmd)) { // acquire semantics
300 mm_teardown_update_offset(d, cur_offset);
301 continue;
302 }
303 if (mm_teardown_pmd(d, pmd, cur_offset))
304 return -EAGAIN;
305 }
306 return 0;
307 }
309 static int
310 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
311 {
312 unsigned long i;
313 volatile pud_t *pud = pud_offset(pgd, offset);
315 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
316 unsigned long cur_offset = offset + (i << PUD_SHIFT);
317 #ifndef __PAGETABLE_PUD_FOLDED
318 if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
319 continue;
320 #endif
321 if (!pud_present(*pud)) { // acquire semantics
322 #ifndef __PAGETABLE_PUD_FOLDED
323 mm_teardown_update_offset(d, cur_offset);
324 #endif
325 continue;
326 }
327 if (mm_teardown_pud(d, pud, cur_offset))
328 return -EAGAIN;
329 }
330 return 0;
331 }
333 int
334 mm_teardown(struct domain* d)
335 {
336 struct mm_struct* mm = &d->arch.mm;
337 unsigned long i;
338 volatile pgd_t* pgd;
340 if (mm->pgd == NULL)
341 return 0;
343 pgd = pgd_offset(mm, 0);
344 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
345 if (!pgd_present(*pgd)) // acquire semantics
346 continue;
347 if (mm_teardown_pgd(d, pgd, i << PGDIR_SHIFT))
348 return -EAGAIN;
349 }
350 return 0;
}
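/*
 * Illustrative sketch (not compiled): how the continuable teardown above is
 * meant to be driven.  mm_teardown() records its progress in
 * d->arch.mm_teardown_offset and returns -EAGAIN when
 * hypercall_preempt_check() fires; the caller (the domain destruction path,
 * e.g. domain_relinquish_resources() in the real tree) simply calls it again
 * until it returns 0.  The name sketch_relinquish() is hypothetical.
 */
#if 0
static int
sketch_relinquish(struct domain *d)
{
    int ret = mm_teardown(d);
    if (ret == -EAGAIN)
        return ret;   /* hypercall continuation: we will be re-invoked later */
    /* ... release the remaining resources once the P2M has been emptied ... */
    return 0;
}
#endif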
351 static void
352 mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
353 unsigned long offset)
354 {
355 pte_free_kernel(pte_offset_map(pmd, offset));
356 }
358 static void
359 mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
360 unsigned long offset)
361 {
362 unsigned long i;
363 volatile pmd_t *pmd = pmd_offset(pud, offset);
365 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
366 if (!pmd_present(*pmd))
367 continue;
368 mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
369 }
370 pmd_free(pmd_offset(pud, offset));
371 }
373 static void
374 mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
375 unsigned long offset)
376 {
377 unsigned long i;
378 volatile pud_t *pud = pud_offset(pgd, offset);
380 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
381 if (!pud_present(*pud))
382 continue;
383 mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
384 }
385 pud_free(pud_offset(pgd, offset));
386 }
388 static void
389 mm_p2m_teardown(struct domain* d)
390 {
391 struct mm_struct* mm = &d->arch.mm;
392 unsigned long i;
393 volatile pgd_t* pgd;
395 BUG_ON(mm->pgd == NULL);
396 pgd = pgd_offset(mm, 0);
397 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
398 if (!pgd_present(*pgd))
399 continue;
400 mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
401 }
402 pgd_free(mm->pgd);
403 mm->pgd = NULL;
404 }
406 void
407 mm_final_teardown(struct domain* d)
408 {
409 if (d->arch.shadow_bitmap != NULL) {
410 xfree(d->arch.shadow_bitmap);
411 d->arch.shadow_bitmap = NULL;
412 }
413 mm_p2m_teardown(d);
414 }
416 unsigned long
417 domain_get_maximum_gpfn(struct domain *d)
418 {
419 return (d->arch.convmem_end + PAGE_SIZE - 1) >> PAGE_SHIFT;
420 }
422 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
423 void
424 share_xen_page_with_guest(struct page_info *page,
425 struct domain *d, int readonly)
426 {
427 if ( page_get_owner(page) == d )
428 return;
430 #if 1
431 if (readonly) {
432 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
433 }
434 #endif
436 // alloc_xenheap_pages() doesn't initialize page owner.
437 //BUG_ON(page_get_owner(page) != NULL);
439 spin_lock(&d->page_alloc_lock);
441 #ifndef __ia64__
442 /* The incremented type count pins as writable or read-only. */
443 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
444 page->u.inuse.type_info |= PGT_validated | 1;
445 #endif
447 page_set_owner(page, d);
448 wmb(); /* install valid domain ptr before updating refcnt. */
449 ASSERT(page->count_info == 0);
451 /* Only add to the allocation list if the domain isn't dying. */
452 if ( !d->is_dying )
453 {
454 page->count_info |= PGC_allocated | 1;
455 if ( unlikely(d->xenheap_pages++ == 0) )
456 get_knownalive_domain(d);
457 list_add_tail(&page->list, &d->xenpage_list);
458 }
460 // grant_table_destroy() releases these pages,
461 // but it doesn't clear their m2p entries, so stale
462 // entries might remain. Such a stale entry is cleared here.
463 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
465 spin_unlock(&d->page_alloc_lock);
466 }
468 void
469 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
470 {
471 share_xen_page_with_guest(page, dom_xen, readonly);
472 }
474 unsigned long
475 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
476 {
477 unsigned long pte;
479 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
480 if (!pte) {
481 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
482 }
483 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
484 }
486 // Given a domain virtual address, pte and page size, extract the metaphysical
487 // address, convert the pte to a physical address for the (possibly different)
488 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
489 // current->arch.vhpt_pg_shift!)
490 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* itir,
491 struct p2m_entry* entry)
492 {
493 struct domain *d = current->domain;
494 ia64_itir_t _itir = {.itir = itir__};
495 u64 mask, mpaddr, pteval2;
496 u64 arflags;
497 u64 arflags2;
498 u64 maflags2;
499 u64 ps;
501 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
503 // FIXME address had better be pre-validated on insert
504 mask = ~itir_mask(_itir.itir);
505 mpaddr = ((pteval & _PAGE_PPN_MASK) & ~mask) | (address & mask);
506 ps = current->arch.vhpt_pg_shift ? current->arch.vhpt_pg_shift :
507 PAGE_SHIFT;
509 if (_itir.ps > ps)
510 _itir.ps = ps;
512 ((ia64_itir_t*)itir)->itir = _itir.itir;/* Copy the whole register. */
513 ((ia64_itir_t*)itir)->ps = _itir.ps; /* Overwrite ps part! */
515 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
516 if (ps < PAGE_SHIFT)
517 pteval2 |= address & (PAGE_SIZE - 1) & ~((1L << ps) - 1);
519 /* Check access rights. */
520 arflags = pteval & _PAGE_AR_MASK;
521 arflags2 = pteval2 & _PAGE_AR_MASK;
522 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
523 #if 0
524 dprintk(XENLOG_WARNING,
525 "%s:%d "
526 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
527 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
528 __func__, __LINE__,
529 pteval, arflags, address, itir__,
530 pteval2, arflags2, mpaddr);
531 #endif
532 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
533 }
535 /* Check memory attribute. The switch is on the *requested* memory
536 attribute. */
537 maflags2 = pteval2 & _PAGE_MA_MASK;
538 switch (pteval & _PAGE_MA_MASK) {
539 case _PAGE_MA_NAT:
540 /* NaT pages are always accepted! */
541 break;
542 case _PAGE_MA_UC:
543 case _PAGE_MA_UCE:
544 case _PAGE_MA_WC:
545 if (maflags2 == _PAGE_MA_WB) {
546 /* Don't let domains WB-map uncached addresses.
547 This can happen when domU tries to touch i/o
548 port space. Also prevents possible address
549 aliasing issues. */
550 if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE)) {
551 u64 ucwb;
553 /*
554 * If dom0 page has both UC & WB attributes
555 * don't warn about attempted UC access.
556 */
557 ucwb = efi_mem_attribute(mpaddr, PAGE_SIZE);
558 ucwb &= EFI_MEMORY_UC | EFI_MEMORY_WB;
559 ucwb ^= EFI_MEMORY_UC | EFI_MEMORY_WB;
561 if (d != dom0 || ucwb != 0)
562 gdprintk(XENLOG_WARNING, "Warning: UC"
563 " to WB for mpaddr=%lx\n",
564 mpaddr);
565 }
566 pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
567 }
568 break;
569 case _PAGE_MA_WB:
570 if (maflags2 != _PAGE_MA_WB) {
571 /* Forbid non-coherent access to coherent memory. */
572 panic_domain(NULL, "try to use WB mem attr on "
573 "UC page, mpaddr=%lx\n", mpaddr);
574 }
575 break;
576 default:
577 panic_domain(NULL, "try to use unknown mem attribute\n");
578 }
580 /* If shadow mode is enabled, virtualize dirty bit. */
581 if (shadow_mode_enabled(d) && (pteval & _PAGE_D)) {
582 u64 mp_page = mpaddr >> PAGE_SHIFT;
583 pteval |= _PAGE_VIRT_D;
585 /* If the page is not already dirty, don't set the dirty bit! */
586 if (mp_page < d->arch.shadow_bitmap_size * 8
587 && !test_bit(mp_page, d->arch.shadow_bitmap))
588 pteval &= ~_PAGE_D;
589 }
591 /* Ignore non-addr bits of pteval2 and force PL0->1
592 (PL3 is unaffected) */
593 return (pteval & ~(_PAGE_PPN_MASK | _PAGE_PL_MASK)) |
594 (pteval2 & _PAGE_PPN_MASK) |
595 (vcpu_pl_adjust(pteval, 7) & _PAGE_PL_MASK);
596 }
598 // given a current domain metaphysical address, return the physical address
599 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
600 struct p2m_entry* entry)
601 {
602 unsigned long pteval;
604 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
605 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
606 }
608 //XXX should !xxx_present() be used instead of !xxx_none()?
609 // pud, pmd and pte pages are zero-cleared when they are allocated.
610 // Their contents must be visible before population, so the
611 // cmpxchg must have release semantics.
612 static volatile pte_t*
613 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
614 {
615 struct mm_struct *mm = &d->arch.mm;
616 volatile pgd_t *pgd;
617 volatile pud_t *pud;
618 volatile pmd_t *pmd;
620 BUG_ON(mm->pgd == NULL);
622 pgd = pgd_offset(mm, mpaddr);
623 again_pgd:
624 if (unlikely(pgd_none(*pgd))) { // acquire semantics
625 pud_t *old_pud = NULL;
626 pud = pud_alloc_one(mm, mpaddr);
627 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
628 pud_free(pud);
629 goto again_pgd;
630 }
631 }
633 pud = pud_offset(pgd, mpaddr);
634 again_pud:
635 if (unlikely(pud_none(*pud))) { // acquire semantics
636 pmd_t* old_pmd = NULL;
637 pmd = pmd_alloc_one(mm, mpaddr);
638 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
639 pmd_free(pmd);
640 goto again_pud;
641 }
642 }
644 pmd = pmd_offset(pud, mpaddr);
645 again_pmd:
646 if (unlikely(pmd_none(*pmd))) { // acquire semantics
647 pte_t* old_pte = NULL;
648 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
649 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
650 pte_free_kernel(pte);
651 goto again_pmd;
652 }
653 }
655 return pte_offset_map(pmd, mpaddr);
656 }
658 //XXX should xxx_none() be used instead of !xxx_present()?
659 volatile pte_t*
660 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
661 {
662 struct mm_struct *mm = &d->arch.mm;
663 volatile pgd_t *pgd;
664 volatile pud_t *pud;
665 volatile pmd_t *pmd;
667 BUG_ON(mm->pgd == NULL);
668 pgd = pgd_offset(mm, mpaddr);
669 if (unlikely(!pgd_present(*pgd))) // acquire semantics
670 return NULL;
672 pud = pud_offset(pgd, mpaddr);
673 if (unlikely(!pud_present(*pud))) // acquire semantics
674 return NULL;
676 pmd = pmd_offset(pud, mpaddr);
677 if (unlikely(!pmd_present(*pmd))) // acquire semantics
678 return NULL;
680 return pte_offset_map(pmd, mpaddr);
681 }
683 static volatile pte_t*
684 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
685 {
686 struct mm_struct *mm = &d->arch.mm;
687 volatile pgd_t *pgd;
688 volatile pud_t *pud;
689 volatile pmd_t *pmd;
691 BUG_ON(mm->pgd == NULL);
692 pgd = pgd_offset(mm, mpaddr);
693 if (unlikely(pgd_none(*pgd))) // acquire semantics
694 return NULL;
696 pud = pud_offset(pgd, mpaddr);
697 if (unlikely(pud_none(*pud))) // acquire semantics
698 return NULL;
700 pmd = pmd_offset(pud, mpaddr);
701 if (unlikely(pmd_none(*pmd))) // acquire semantics
702 return NULL;
704 return pte_offset_map(pmd, mpaddr);
705 }
707 unsigned long
708 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
709 {
710 volatile pte_t *pte;
712 pte = lookup_noalloc_domain_pte(d, mpaddr);
713 if (pte == NULL)
714 return INVALID_MFN;
716 if (pte_present(*pte))
717 return (pte->pte & _PFN_MASK);
718 else if (VMX_DOMAIN(d->vcpu[0]))
719 return GPFN_INV_MASK;
720 return INVALID_MFN;
721 }
723 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
724 struct p2m_entry* entry)
725 {
726 volatile pte_t *pte = lookup_noalloc_domain_pte(d, mpaddr);
728 if (pte != NULL) {
729 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
730 if (pte_present(tmp_pte)) {
731 if (entry != NULL)
732 p2m_entry_set(entry, pte, tmp_pte);
733 return pte_val(tmp_pte);
734 } else if (VMX_DOMAIN(d->vcpu[0]))
735 return GPFN_INV_MASK;
736 }
738 if (mpaddr < d->arch.convmem_end && !d->is_dying) {
739 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
740 "d %"PRId16" 0x%lx (< 0x%lx)\n",
741 current->vcpu_id, PSCB(current, iip),
742 d->domain_id, mpaddr, d->arch.convmem_end);
743 } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
744 /* Log I/O port probing, but complain less loudly about it */
745 gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
746 "d %"PRId16" 0x%lx\n",
747 current->vcpu_id, PSCB(current, iip), d->domain_id,
748 IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
749 } else {
750 gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
751 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
752 current->vcpu_id, PSCB(current, iip),
753 d->domain_id, mpaddr, d->arch.convmem_end);
754 }
756 if (entry != NULL)
757 p2m_entry_set(entry, NULL, __pte(0));
758 //XXX This is a workaround until emulation of memory accesses to regions
759 // where no memory or device is attached is implemented.
760 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_PRIV |
761 _PAGE_AR_RWX)));
762 }
764 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
765 #if 1
766 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
767 {
768 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
769 unsigned long imva;
771 pte &= _PAGE_PPN_MASK;
772 imva = (unsigned long) __va(pte);
773 imva |= mpaddr & ~PAGE_MASK;
774 return (void*)imva;
775 }
776 #else
777 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
778 {
779 unsigned long imva = __gpa_to_mpa(d, mpaddr);
781 return (void *)__va(imva);
782 }
783 #endif
785 unsigned long
786 paddr_to_maddr(unsigned long paddr)
787 {
788 struct vcpu *v = current;
789 struct domain *d = v->domain;
790 u64 pa;
792 pa = ____lookup_domain_mpa(d, paddr);
793 if (pa == INVALID_MFN) {
794 printk("%s: called with bad memory address: 0x%lx - iip=%lx\n",
795 __func__, paddr, vcpu_regs(v)->cr_iip);
796 return 0;
797 }
798 return (pa & _PFN_MASK) | (paddr & ~PAGE_MASK);
799 }
801 /* Allocate a new page for domain and map it to the specified metaphysical
802 address. */
803 static struct page_info *
804 __assign_new_domain_page(struct domain *d, unsigned long mpaddr,
805 volatile pte_t* pte)
806 {
807 struct page_info *p;
808 unsigned long maddr;
810 BUG_ON(!pte_none(*pte));
812 p = alloc_domheap_page(d);
813 if (unlikely(!p)) {
814 printk("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
815 return(p);
816 }
818 // zero out pages for security reasons
819 clear_page(page_to_virt(p));
820 maddr = page_to_maddr (p);
821 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
822 && maddr < __get_cpu_var(vhpt_pend))) {
823 /* FIXME: how can this happen ?
824 vhpt is allocated by alloc_domheap_page. */
825 printk("assign_new_domain_page: reassigned vhpt page %lx!!\n",
826 maddr);
827 }
829 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
830 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
831 // because set_pte_rel() has release semantics
832 set_pte_rel(pte,
833 pfn_pte(maddr >> PAGE_SHIFT,
834 __pgprot(_PAGE_PGC_ALLOCATED | __DIRTY_BITS |
835 _PAGE_PL_PRIV | _PAGE_AR_RWX)));
837 smp_mb();
838 return p;
839 }
841 struct page_info *
842 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
843 {
844 volatile pte_t *pte = lookup_alloc_domain_pte(d, mpaddr);
846 if (!pte_none(*pte))
847 return NULL;
849 return __assign_new_domain_page(d, mpaddr, pte);
850 }
852 void __init
853 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
854 {
855 volatile pte_t *pte;
857 BUG_ON(d != dom0);
858 pte = lookup_alloc_domain_pte(d, mpaddr);
859 if (pte_none(*pte)) {
860 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
861 if (p == NULL) {
862 panic("%s: can't allocate page for dom0\n", __func__);
863 }
864 }
865 }
867 static unsigned long
868 flags_to_prot (unsigned long flags)
869 {
870 unsigned long res = _PAGE_PL_PRIV | __DIRTY_BITS;
872 res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
873 res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
874 #ifdef CONFIG_XEN_IA64_TLB_TRACK
875 res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
876 #endif
877 res |= flags & ASSIGN_pgc_allocated ? _PAGE_PGC_ALLOCATED: 0;
879 return res;
880 }
882 /* map a physical address to the specified metaphysical addr */
883 // flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
884 // This is called by assign_domain_mmio_page(),
885 // so access to the pte is racy.
886 int
887 __assign_domain_page(struct domain *d,
888 unsigned long mpaddr, unsigned long physaddr,
889 unsigned long flags)
890 {
891 volatile pte_t *pte;
892 pte_t old_pte;
893 pte_t new_pte;
894 pte_t ret_pte;
895 unsigned long prot = flags_to_prot(flags);
897 pte = lookup_alloc_domain_pte(d, mpaddr);
899 old_pte = __pte(0);
900 new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
901 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
902 if (pte_val(ret_pte) == pte_val(old_pte)) {
903 smp_mb();
904 return 0;
905 }
907 // dom0 tried to map the real machine's I/O region, but failed.
908 // It is very likely that dom0 won't boot correctly because
909 // it can't access the I/O region. So complain here.
910 if ((flags & ASSIGN_nocache) &&
911 (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT) ||
912 !(pte_val(ret_pte) & _PAGE_MA_UC)))
913 printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
914 "\talready assigned pte_val 0x%016lx\n"
915 "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
916 __func__, __LINE__,
917 d, d->domain_id, pte_val(ret_pte),
918 mpaddr, physaddr, flags);
920 return -EAGAIN;
921 }
923 /* get_page() and map a physical address to the specified metaphysical addr */
924 void
925 assign_domain_page(struct domain *d,
926 unsigned long mpaddr, unsigned long physaddr)
927 {
928 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
930 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
931 BUG_ON(page->count_info != (PGC_allocated | 1));
932 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
933 // because __assign_domain_page() uses set_pte_rel() which has
934 // release semantics, smp_mb() isn't needed.
935 (void)__assign_domain_page(d, mpaddr, physaddr,
936 ASSIGN_writable | ASSIGN_pgc_allocated);
937 }
939 int
940 ioports_permit_access(struct domain *d, unsigned int fp, unsigned int lp)
941 {
942 struct io_space *space;
943 unsigned long mmio_start, mmio_end, mach_start;
944 int ret;
946 if (IO_SPACE_NR(fp) >= num_io_spaces) {
947 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
948 return -EFAULT;
949 }
951 /*
952 * The ioport_cap rangeset tracks the I/O port address including
953 * the port space ID. This means port space IDs need to match
954 * between Xen and dom0. This is also a requirement because
955 * the hypercall to pass these port ranges only uses a u32.
956 *
957 * NB - non-dom0 driver domains may only have a subset of the
958 * I/O port spaces and thus will number port spaces differently.
959 * This is ok, they don't make use of this interface.
960 */
961 ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
962 if (ret != 0)
963 return ret;
965 space = &io_space[IO_SPACE_NR(fp)];
967 /* Legacy I/O on dom0 is already setup */
968 if (d == dom0 && space == &io_space[0])
969 return 0;
971 fp = IO_SPACE_PORT(fp);
972 lp = IO_SPACE_PORT(lp);
974 if (space->sparse) {
975 mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & ~PAGE_MASK;
976 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
977 } else {
978 mmio_start = fp & ~PAGE_MASK;
979 mmio_end = PAGE_ALIGN(lp);
980 }
982 /*
983 * The "machine first port" is not necessarily identity mapped
984 * to the guest first port. At least for the legacy range.
985 */
986 mach_start = mmio_start | __pa(space->mmio_base);
988 if (space == &io_space[0]) {
989 mmio_start |= IO_PORTS_PADDR;
990 mmio_end |= IO_PORTS_PADDR;
991 } else {
992 mmio_start |= __pa(space->mmio_base);
993 mmio_end |= __pa(space->mmio_base);
994 }
996 while (mmio_start <= mmio_end) {
997 (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache);
998 mmio_start += PAGE_SIZE;
999 mach_start += PAGE_SIZE;
1000 }
1002 return 0;
1003 }
1005 static int
1006 ioports_has_allowed(struct domain *d, unsigned int fp, unsigned int lp)
1007 {
1008 for (; fp < lp; fp++)
1009 if (rangeset_contains_singleton(d->arch.ioport_caps, fp))
1010 return 1;
1012 return 0;
1013 }
1015 int
1016 ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
1018 int ret;
1019 struct mm_struct *mm = &d->arch.mm;
1020 unsigned long mmio_start, mmio_end, mmio_base;
1021 unsigned int fp_base, lp_base;
1022 struct io_space *space;
1024 if (IO_SPACE_NR(fp) >= num_io_spaces) {
1025 dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
1026 return -EFAULT;
1029 ret = rangeset_remove_range(d->arch.ioport_caps, fp, lp);
1030 if (ret != 0)
1031 return ret;
1033 space = &io_space[IO_SPACE_NR(fp)];
1034 fp_base = IO_SPACE_PORT(fp);
1035 lp_base = IO_SPACE_PORT(lp);
1037 if (space->sparse) {
1038 mmio_start = IO_SPACE_SPARSE_ENCODING(fp_base) & ~PAGE_MASK;
1039 mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp_base));
1040 } else {
1041 mmio_start = fp_base & ~PAGE_MASK;
1042 mmio_end = PAGE_ALIGN(lp_base);
1045 if (space == &io_space[0] && d != dom0)
1046 mmio_base = IO_PORTS_PADDR;
1047 else
1048 mmio_base = __pa(space->mmio_base);
1050 for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
1051 unsigned int port, range;
1052 unsigned long mpaddr;
1053 volatile pte_t *pte;
1054 pte_t old_pte;
1056 if (space->sparse) {
1057 port = IO_SPACE_SPARSE_DECODING(mmio_start);
1058 range = IO_SPACE_SPARSE_PORTS_PER_PAGE - 1;
1059 } else {
1060 port = mmio_start;
1061 range = PAGE_SIZE - 1;
1064 port |= IO_SPACE_BASE(IO_SPACE_NR(fp));
1066 if (port < fp || port + range > lp) {
1067 /* Maybe this covers an allowed port. */
1068 if (ioports_has_allowed(d, port, port + range))
1069 continue;
1072 mpaddr = mmio_start | mmio_base;
1073 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1074 BUG_ON(pte == NULL);
1075 BUG_ON(pte_none(*pte));
1077 /* clear pte */
1078 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1080 domain_flush_vtlb_all(d);
1081 return 0;
1084 static void
1085 assign_domain_same_page(struct domain *d,
1086 unsigned long mpaddr, unsigned long size,
1087 unsigned long flags)
1089 //XXX optimization
1090 unsigned long end = PAGE_ALIGN(mpaddr + size);
1091 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
1092 (void)__assign_domain_page(d, mpaddr, mpaddr, flags);
1096 int
1097 efi_mmio(unsigned long physaddr, unsigned long size)
1099 void *efi_map_start, *efi_map_end;
1100 u64 efi_desc_size;
1101 void* p;
1103 efi_map_start = __va(ia64_boot_param->efi_memmap);
1104 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
1105 efi_desc_size = ia64_boot_param->efi_memdesc_size;
1107 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
1108 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
1109 unsigned long start = md->phys_addr;
1110 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
1112 if (start <= physaddr && physaddr < end) {
1113 if ((physaddr + size) > end) {
1114 gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
1115 __func__, physaddr, size);
1116 return 0;
1119 // for io space
1120 if (md->type == EFI_MEMORY_MAPPED_IO ||
1121 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
1122 return 1;
1125 // for runtime
1126 // see efi_enter_virtual_mode(void)
1127 // in linux/arch/ia64/kernel/efi.c
1128 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
1129 !(md->attribute & EFI_MEMORY_WB)) {
1130 return 1;
1133 return 0;
1136 if (physaddr < start) {
1137 break;
1141 return 1;
1144 unsigned long
1145 assign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
1146 unsigned long phys_addr, unsigned long size,
1147 unsigned long flags)
1149 unsigned long addr = mpaddr & PAGE_MASK;
1150 unsigned long end = PAGE_ALIGN(mpaddr + size);
1152 if (size == 0) {
1153 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1154 __func__, d, mpaddr, size);
1156 if (!efi_mmio(mpaddr, size)) {
1157 #ifndef NDEBUG
1158 gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
1159 __func__, d, mpaddr, size);
1160 #endif
1161 return -EINVAL;
1164 for (phys_addr &= PAGE_MASK; addr < end;
1165 addr += PAGE_SIZE, phys_addr += PAGE_SIZE) {
1166 __assign_domain_page(d, addr, phys_addr, flags);
1169 return mpaddr;
1172 unsigned long
1173 assign_domain_mach_page(struct domain *d,
1174 unsigned long mpaddr, unsigned long size,
1175 unsigned long flags)
1177 BUG_ON(flags & ASSIGN_pgc_allocated);
1178 assign_domain_same_page(d, mpaddr, size, flags);
1179 return mpaddr;
1182 static void
1183 adjust_page_count_info(struct page_info* page)
1185 struct domain* d = page_get_owner(page);
1186 BUG_ON((page->count_info & PGC_count_mask) != 1);
1187 if (d != NULL) {
1188 int ret = get_page(page, d);
1189 BUG_ON(ret == 0);
1190 } else {
1191 u64 x, nx, y;
1193 y = *((u64*)&page->count_info);
1194 do {
1195 x = y;
1196 nx = x + 1;
1198 BUG_ON((x >> 32) != 0);
1199 BUG_ON((nx & PGC_count_mask) != 2);
1200 y = cmpxchg((u64*)&page->count_info, x, nx);
1201 } while (unlikely(y != x));
1205 static void
1206 domain_put_page(struct domain* d, unsigned long mpaddr,
1207 volatile pte_t* ptep, pte_t old_pte, int clear_PGC_allocate)
1209 unsigned long mfn = pte_pfn(old_pte);
1210 struct page_info* page = mfn_to_page(mfn);
1212 if (pte_pgc_allocated(old_pte)) {
1213 if (page_get_owner(page) == d || page_get_owner(page) == NULL) {
1214 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1215 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1216 } else {
1217 BUG();
1220 if (likely(clear_PGC_allocate)) {
1221 if (!test_and_clear_bit(_PGC_allocated, &page->count_info))
1222 BUG();
1223 /* put_page() is done by domain_page_flush_and_put() */
1224 } else {
1225 // In this case, the page reference count mustn't be touched.
1226 // domain_page_flush_and_put() decrements it, so we increment
1227 // it in advance. This is the slow path.
1228 //
1229 // guest_remove_page(): owner = d, count_info = 1
1230 // memory_exchange(): owner = NULL, count_info = 1
1231 adjust_page_count_info(page);
1234 domain_page_flush_and_put(d, mpaddr, ptep, old_pte, page);
1237 // The caller must get_page(mfn_to_page(mfn)) before calling.
1238 // The caller must call set_gpfn_from_mfn() beforehand if necessary;
1239 // because the set_gpfn_from_mfn() result must be visible before the pte xchg,
1240 // the caller must use a memory barrier. NOTE: xchg has acquire semantics.
1241 // flags: ASSIGN_xxx
1242 static void
1243 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
1244 unsigned long mfn, unsigned long flags)
1246 struct mm_struct *mm = &d->arch.mm;
1247 volatile pte_t* pte;
1248 pte_t old_pte;
1249 pte_t npte;
1250 unsigned long prot = flags_to_prot(flags);
1252 pte = lookup_alloc_domain_pte(d, mpaddr);
1254 // update pte
1255 npte = pfn_pte(mfn, __pgprot(prot));
1256 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
1257 if (pte_mem(old_pte)) {
1258 unsigned long old_mfn = pte_pfn(old_pte);
1260 // The mfn == old_mfn case can happen when a domain maps a granted page
1261 // twice with the same pseudo-physical address.
1262 // It's nonsense, but allowed.
1263 // __gnttab_map_grant_ref()
1264 // => create_host_mapping()
1265 // => assign_domain_page_replace()
1266 if (mfn != old_mfn) {
1267 domain_put_page(d, mpaddr, pte, old_pte, 1);
1270 perfc_incr(assign_domain_page_replace);
1273 // caller must get_page(new_page) before
1274 // Only steal_page() calls this function.
1275 static int
1276 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
1277 struct page_info* old_page,
1278 struct page_info* new_page,
1279 unsigned long flags, int clear_PGC_allocate)
1281 struct mm_struct *mm = &d->arch.mm;
1282 volatile pte_t* pte;
1283 unsigned long old_mfn;
1284 unsigned long old_prot;
1285 pte_t old_pte;
1286 unsigned long new_mfn;
1287 unsigned long new_prot;
1288 pte_t new_pte;
1289 pte_t ret_pte;
1291 BUG_ON((flags & ASSIGN_pgc_allocated) == 0);
1292 pte = lookup_alloc_domain_pte(d, mpaddr);
1294 again:
1295 old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
1296 old_mfn = page_to_mfn(old_page);
1297 old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
1298 if (!pte_present(old_pte)) {
1299 gdprintk(XENLOG_INFO,
1300 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
1301 __func__, pte_val(old_pte), old_prot, old_mfn);
1302 return -EINVAL;
1305 new_prot = flags_to_prot(flags);
1306 new_mfn = page_to_mfn(new_page);
1307 new_pte = pfn_pte(new_mfn, __pgprot(new_prot));
1309 // update pte
1310 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1311 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1312 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1313 goto again;
1316 gdprintk(XENLOG_INFO,
1317 "%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
1318 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1319 __func__,
1320 pte_val(old_pte), old_prot, old_mfn,
1321 pte_val(ret_pte), pte_pfn(ret_pte));
1322 return -EINVAL;
1325 BUG_ON(!pte_mem(old_pte));
1326 BUG_ON(!pte_pgc_allocated(old_pte));
1327 BUG_ON(page_get_owner(old_page) != d);
1328 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1329 BUG_ON(old_mfn == new_mfn);
1331 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1332 if (likely(clear_PGC_allocate)) {
1333 if (!test_and_clear_bit(_PGC_allocated, &old_page->count_info))
1334 BUG();
1335 } else {
1336 int ret;
1337 // adjust for count_info for domain_page_flush_and_put()
1338 // This is slow path.
1339 BUG_ON(!test_bit(_PGC_allocated, &old_page->count_info));
1340 BUG_ON(d == NULL);
1341 ret = get_page(old_page, d);
1342 BUG_ON(ret == 0);
1345 domain_page_flush_and_put(d, mpaddr, pte, old_pte, old_page);
1346 perfc_incr(assign_domain_pge_cmpxchg_rel);
1347 return 0;
1350 static void
1351 zap_domain_page_one(struct domain *d, unsigned long mpaddr,
1352 int clear_PGC_allocate, unsigned long mfn)
1354 struct mm_struct *mm = &d->arch.mm;
1355 volatile pte_t *pte;
1356 pte_t old_pte;
1357 struct page_info *page;
1359 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1360 if (pte == NULL)
1361 return;
1362 if (pte_none(*pte))
1363 return;
1365 if (mfn == INVALID_MFN) {
1366 // clear pte
1367 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1368 mfn = pte_pfn(old_pte);
1369 } else {
1370 unsigned long old_arflags;
1371 pte_t new_pte;
1372 pte_t ret_pte;
1374 again:
1375 // memory_exchange() calls guest_physmap_remove_page() with
1376 // a stolen page, i.e. page owner = NULL.
1377 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
1378 page_get_owner(mfn_to_page(mfn)) != NULL);
1379 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1380 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1381 new_pte = __pte(0);
1383 // update pte
1384 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1385 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1386 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1387 goto again;
1390 gdprintk(XENLOG_INFO, "%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1391 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1392 __func__,
1393 pte_val(old_pte), old_arflags, mfn,
1394 pte_val(ret_pte), pte_pfn(ret_pte));
1395 return;
1397 BUG_ON(mfn != pte_pfn(ret_pte));
1400 page = mfn_to_page(mfn);
1401 BUG_ON((page->count_info & PGC_count_mask) == 0);
1403 BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
1404 domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
1405 perfc_incr(zap_dcomain_page_one);
1408 unsigned long
1409 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1410 unsigned int extent_order)
1412 if (extent_order != 0) {
1413 //XXX
1414 return -ENOSYS;
1417 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 1, INVALID_MFN);
1418 perfc_incr(dom0vp_zap_physmap);
1419 return 0;
1422 static unsigned long
1423 __dom0vp_add_physmap(struct domain* d, unsigned long gpfn,
1424 unsigned long mfn_or_gmfn,
1425 unsigned long flags, domid_t domid, int is_gmfn)
1427 int error = -EINVAL;
1428 struct domain* rd;
1429 unsigned long mfn;
1431 /* Not allowed by a domain. */
1432 if (flags & (ASSIGN_nocache | ASSIGN_pgc_allocated))
1433 return -EINVAL;
1435 rd = get_domain_by_id(domid);
1436 if (unlikely(rd == NULL)) {
1437 switch (domid) {
1438 case DOMID_XEN:
1439 rd = dom_xen;
1440 break;
1441 case DOMID_IO:
1442 rd = dom_io;
1443 break;
1444 default:
1445 gdprintk(XENLOG_INFO, "d 0x%p domid %d "
1446 "gpfn 0x%lx mfn_or_gmfn 0x%lx flags 0x%lx domid %d\n",
1447 d, d->domain_id, gpfn, mfn_or_gmfn, flags, domid);
1448 return -ESRCH;
1450 BUG_ON(rd == NULL);
1451 get_knownalive_domain(rd);
1454 if (unlikely(rd == d))
1455 goto out1;
1456 /*
1457 * DOMID_XEN and DOMID_IO don't have their own p2m table.
1458 * It can be considered that their p2m conversion is p==m.
1459 */
1460 if (likely(is_gmfn && domid != DOMID_XEN && domid != DOMID_IO))
1461 mfn = gmfn_to_mfn(rd, mfn_or_gmfn);
1462 else
1463 mfn = mfn_or_gmfn;
1464 if (unlikely(!mfn_valid(mfn) || get_page(mfn_to_page(mfn), rd) == 0))
1465 goto out1;
1467 error = 0;
1468 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1469 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1470 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1471 //don't update p2m table because this page belongs to rd, not d.
1472 perfc_incr(dom0vp_add_physmap);
1473 out1:
1474 put_domain(rd);
1475 return error;
1478 unsigned long
1479 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1480 unsigned long flags, domid_t domid)
1482 return __dom0vp_add_physmap(d, gpfn, mfn, flags, domid, 0);
1485 unsigned long
1486 dom0vp_add_physmap_with_gmfn(struct domain* d, unsigned long gpfn,
1487 unsigned long gmfn, unsigned long flags,
1488 domid_t domid)
1490 return __dom0vp_add_physmap(d, gpfn, gmfn, flags, domid, 1);
1493 #ifdef CONFIG_XEN_IA64_EXPOSE_P2M
1494 static struct page_info* p2m_pte_zero_page = NULL;
1496 /* This must be called before dom0 p2m table allocation */
1497 void __init
1498 expose_p2m_init(void)
1500 pte_t* pte;
1502 /*
1503 * Initialise our DOMID_P2M domain.
1504 * This domain owns m2p table pages.
1505 */
1506 dom_p2m = alloc_domain(DOMID_P2M);
1507 BUG_ON(dom_p2m == NULL);
1508 dom_p2m->max_pages = ~0U;
1510 pte = pte_alloc_one_kernel(NULL, 0);
1511 BUG_ON(pte == NULL);
1512 smp_mb();// make contents of the page visible.
1513 p2m_pte_zero_page = virt_to_page(pte);
1516 static int
1517 expose_p2m_page(struct domain* d, unsigned long mpaddr, struct page_info* page)
1519 int ret = get_page(page, dom_p2m);
1520 BUG_ON(ret != 1);
1521 return __assign_domain_page(d, mpaddr, page_to_maddr(page),
1522 ASSIGN_readonly);
1525 // It is possible to optimize the loop, but this isn't performance critical.
1526 unsigned long
1527 dom0vp_expose_p2m(struct domain* d,
1528 unsigned long conv_start_gpfn,
1529 unsigned long assign_start_gpfn,
1530 unsigned long expose_size, unsigned long granule_pfn)
1532 unsigned long expose_num_pfn = expose_size >> PAGE_SHIFT;
1533 unsigned long i;
1534 volatile pte_t* conv_pte;
1535 volatile pte_t* assign_pte;
1537 if ((expose_size % PAGE_SIZE) != 0 ||
1538 (granule_pfn % PTRS_PER_PTE) != 0 ||
1539 (expose_num_pfn % PTRS_PER_PTE) != 0 ||
1540 (conv_start_gpfn % granule_pfn) != 0 ||
1541 (assign_start_gpfn % granule_pfn) != 0 ||
1542 (expose_num_pfn % granule_pfn) != 0) {
1543 gdprintk(XENLOG_INFO,
1544 "%s conv_start_gpfn 0x%016lx assign_start_gpfn 0x%016lx "
1545 "expose_size 0x%016lx granulte_pfn 0x%016lx\n", __func__,
1546 conv_start_gpfn, assign_start_gpfn, expose_size, granule_pfn);
1547 return -EINVAL;
1550 if (granule_pfn != PTRS_PER_PTE) {
1551 gdprintk(XENLOG_INFO,
1552 "%s granule_pfn 0x%016lx PTRS_PER_PTE 0x%016lx\n",
1553 __func__, granule_pfn, PTRS_PER_PTE);
1554 return -ENOSYS;
1557 // allocate pgd, pmd.
1558 i = conv_start_gpfn;
1559 while (i < expose_num_pfn) {
1560 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1561 PAGE_SHIFT);
1562 if (conv_pte == NULL) {
1563 i++;
1564 continue;
1567 assign_pte = lookup_alloc_domain_pte(d, (assign_start_gpfn <<
1568 PAGE_SHIFT) + i * sizeof(pte_t));
1569 if (assign_pte == NULL) {
1570 gdprintk(XENLOG_INFO, "%s failed to allocate pte page\n", __func__);
1571 return -ENOMEM;
1574 // skip to next pte page
1575 i += PTRS_PER_PTE;
1576 i &= ~(PTRS_PER_PTE - 1);
1579 // expose pte page
1580 i = 0;
1581 while (i < expose_num_pfn) {
1582 conv_pte = lookup_noalloc_domain_pte(d, (conv_start_gpfn + i) <<
1583 PAGE_SHIFT);
1584 if (conv_pte == NULL) {
1585 i++;
1586 continue;
1589 if (expose_p2m_page(d, (assign_start_gpfn << PAGE_SHIFT) +
1590 i * sizeof(pte_t), virt_to_page(conv_pte)) < 0) {
1591 gdprintk(XENLOG_INFO, "%s failed to assign page\n", __func__);
1592 return -EAGAIN;
1595 // skip to next pte page
1596 i += PTRS_PER_PTE;
1597 i &= ~(PTRS_PER_PTE - 1);
1600 // expose p2m_pte_zero_page
1601 for (i = 0; i < (expose_num_pfn + PTRS_PER_PTE - 1) / PTRS_PER_PTE; i++) {
1602 assign_pte = lookup_noalloc_domain_pte(d, (assign_start_gpfn + i) <<
1603 PAGE_SHIFT);
1604 if (assign_pte == NULL || pte_present(*assign_pte))
1605 continue;
1607 if (expose_p2m_page(d, (assign_start_gpfn + i) << PAGE_SHIFT,
1608 p2m_pte_zero_page) < 0) {
1609 gdprintk(XENLOG_INFO, "%s failed to assign zero-pte page\n", __func__);
1610 return -EAGAIN;
1614 return 0;
1616 #endif
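/*
 * Illustrative sketch (not compiled): how a guest that has issued
 * dom0vp_expose_p2m() above can use the exposed area.  Its own p2m pte
 * pages are mapped read-only starting at assign_start_gpfn, one pte_t per
 * gpfn, so a gpfn->mfn lookup becomes an array read.  The names p2m_base
 * and sketch_gpfn_to_mfn are hypothetical; the real consumer is the guest
 * kernel's paravirtualized code, not this file.
 */
#if 0
static unsigned long
sketch_gpfn_to_mfn(volatile pte_t *p2m_base, unsigned long gpfn)
{
    /* gpfn is relative to the conv_start_gpfn passed to dom0vp_expose_p2m() */
    pte_t entry = p2m_base[gpfn];
    if (!pte_present(entry))
        return INVALID_MFN;   /* unpopulated slot backed by p2m_pte_zero_page */
    return (pte_val(entry) & _PFN_MASK) >> PAGE_SHIFT;
}
#endif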
1618 // grant table host mapping
1619 // mpaddr: host_addr: pseudo physical address
1620 // mfn: frame: machine page frame
1621 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1622 int
1623 create_grant_host_mapping(unsigned long gpaddr,
1624 unsigned long mfn, unsigned int flags)
1626 struct domain* d = current->domain;
1627 struct page_info* page;
1628 int ret;
1630 if (flags & (GNTMAP_device_map |
1631 GNTMAP_application_map | GNTMAP_contains_pte)) {
1632 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1633 return GNTST_general_error;
1636 BUG_ON(!mfn_valid(mfn));
1637 page = mfn_to_page(mfn);
1638 ret = get_page(page, page_get_owner(page));
1639 BUG_ON(ret == 0);
1640 assign_domain_page_replace(d, gpaddr, mfn,
1641 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1642 ASSIGN_tlb_track |
1643 #endif
1644 ((flags & GNTMAP_readonly) ?
1645 ASSIGN_readonly : ASSIGN_writable));
1646 perfc_incr(create_grant_host_mapping);
1647 return GNTST_okay;
1650 // grant table host unmapping
1651 int
1652 replace_grant_host_mapping(unsigned long gpaddr,
1653 unsigned long mfn, unsigned long new_gpaddr, unsigned int flags)
1655 struct domain* d = current->domain;
1656 unsigned long gpfn = gpaddr >> PAGE_SHIFT;
1657 volatile pte_t* pte;
1658 unsigned long cur_arflags;
1659 pte_t cur_pte;
1660 pte_t new_pte = __pte(0);
1661 pte_t old_pte;
1662 struct page_info* page = mfn_to_page(mfn);
1663 struct page_info* new_page = NULL;
1664 volatile pte_t* new_page_pte = NULL;
1666 if (new_gpaddr) {
1667 new_page_pte = lookup_noalloc_domain_pte_none(d, new_gpaddr);
1668 if (likely(new_page_pte != NULL)) {
1669 new_pte = ptep_get_and_clear(&d->arch.mm,
1670 new_gpaddr, new_page_pte);
1671 if (likely(pte_present(new_pte))) {
1672 unsigned long new_page_mfn;
1673 struct domain* page_owner;
1675 new_page_mfn = pte_pfn(new_pte);
1676 new_page = mfn_to_page(new_page_mfn);
1677 page_owner = page_get_owner(new_page);
1678 if (unlikely(page_owner == NULL)) {
1679 gdprintk(XENLOG_INFO,
1680 "%s: page_owner == NULL "
1681 "gpaddr 0x%lx mfn 0x%lx "
1682 "new_gpaddr 0x%lx mfn 0x%lx\n",
1683 __func__, gpaddr, mfn, new_gpaddr, new_page_mfn);
1684 new_page = NULL; /* prevent domain_put_page() */
1685 goto out;
1688 /*
1689 * domain_put_page(clear_PGC_allocate = 0)
1690 * doesn't decrement the refcount of a page with
1691 * pte_pgc_allocated() = 1. Be careful.
1692 */
1693 if (unlikely(!pte_pgc_allocated(new_pte))) {
1694 /* domain_put_page() decrements page refcount. adjust it. */
1695 if (get_page(new_page, page_owner)) {
1696 gdprintk(XENLOG_INFO,
1697 "%s: get_page() failed. "
1698 "gpaddr 0x%lx mfn 0x%lx "
1699 "new_gpaddr 0x%lx mfn 0x%lx\n",
1700 __func__, gpaddr, mfn,
1701 new_gpaddr, new_page_mfn);
1702 goto out;
1705 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 0);
1706 } else
1707 new_pte = __pte(0);
1711 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1712 gdprintk(XENLOG_INFO, "%s: flags 0x%x\n", __func__, flags);
1713 return GNTST_general_error;
1716 pte = lookup_noalloc_domain_pte(d, gpaddr);
1717 if (pte == NULL) {
1718 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx\n",
1719 __func__, gpaddr, mfn);
1720 goto out;
1723 again:
1724 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1725 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1726 if (!pte_present(cur_pte) ||
1727 (page_get_owner(page) == d && get_gpfn_from_mfn(mfn) == gpfn)) {
1728 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1729 __func__, gpaddr, mfn, pte_val(cur_pte));
1730 goto out;
1733 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1734 if (unlikely(!pte_present(old_pte))) {
1735 gdprintk(XENLOG_INFO, "%s: gpaddr 0x%lx mfn 0x%lx"
1736 " cur_pte 0x%lx old_pte 0x%lx\n",
1737 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1738 goto out;
1740 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1741 if (pte_pfn(old_pte) == mfn) {
1742 goto again;
1744 gdprintk(XENLOG_INFO, "%s gpaddr 0x%lx mfn 0x%lx cur_pte "
1745 "0x%lx old_pte 0x%lx\n",
1746 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1747 goto out;
1749 BUG_ON(pte_pfn(old_pte) != mfn);
1751 /* try_to_clear_PGC_allocate(d, page) is not needed. */
1752 BUG_ON(page_get_owner(page) == d &&
1753 get_gpfn_from_mfn(mfn) == gpfn);
1754 BUG_ON(pte_pgc_allocated(old_pte));
1755 domain_page_flush_and_put(d, gpaddr, pte, old_pte, page);
1757 perfc_incr(replace_grant_host_mapping);
1758 return GNTST_okay;
1760 out:
1761 if (new_page)
1762 domain_put_page(d, new_gpaddr, new_page_pte, new_pte, 1);
1763 return GNTST_general_error;
1766 // heavily depends on the struct page layout.
1767 // gnttab_transfer() calls steal_page() with memflags = 0
1768 // For grant table transfer, we must fill the page.
1769 // memory_exchange() calls steal_page() with memflags = MEMF_no_refcount
1770 // For memory exchange, we don't have to fill the page because
1771 // memory_exchange() does it.
1772 int
1773 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1775 #if 0 /* if big endian */
1776 # error "implement big endian version of steal_page()"
1777 #endif
1778 u32 _d, _nd;
1779 u64 x, nx, y;
1781 if (page_get_owner(page) != d) {
1782 gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
1783 __func__, d, page_get_owner(page));
1784 return -1;
1787 if (!(memflags & MEMF_no_refcount)) {
1788 unsigned long gpfn;
1789 struct page_info *new;
1790 unsigned long new_mfn;
1791 int ret;
1793 new = alloc_domheap_page(d);
1794 if (new == NULL) {
1795 gdprintk(XENLOG_INFO, "alloc_domheap_page() failed\n");
1796 return -1;
1798 // zero out pages for security reasons
1799 clear_page(page_to_virt(new));
1800 // assign_domain_page_cmpxchg_rel() has release semantics
1801 // so smp_mb() isn't needed.
1803 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1804 if (gpfn == INVALID_M2P_ENTRY) {
1805 free_domheap_page(new);
1806 return -1;
1808 new_mfn = page_to_mfn(new);
1809 set_gpfn_from_mfn(new_mfn, gpfn);
1810 // smp_mb() isn't needed because assign_domain_pge_cmpxchg_rel()
1811 // has release semantics.
1813 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1814 ASSIGN_writable |
1815 ASSIGN_pgc_allocated, 0);
1816 if (ret < 0) {
1817 gdprintk(XENLOG_INFO, "assign_domain_page_cmpxchg_rel failed %d\n",
1818 ret);
1819 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1820 free_domheap_page(new);
1821 return -1;
1823 perfc_incr(steal_page_refcount);
1826 spin_lock(&d->page_alloc_lock);
1828 /*
1829 * The tricky bit: atomically release ownership while there is just one
1830 * benign reference to the page (PGC_allocated). If that reference
1831 * disappears then the deallocation routine will safely spin.
1832 */
1833 _d = pickle_domptr(d);
1834 y = *((u64*)&page->count_info);
1835 do {
1836 x = y;
1837 nx = x & 0xffffffff;
1838 // page->count_info: untouched
1839 // page->u.inuse._domain = 0;
1840 _nd = x >> 32;
1842 if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
1843 (1 | PGC_allocated))) ||
1844 unlikely(_nd != _d)) {
1845 struct domain* nd = unpickle_domptr(_nd);
1846 if (nd == NULL) {
1847 gdprintk(XENLOG_INFO, "gnttab_transfer: "
1848 "Bad page %p: ed=%p(%u) 0x%x, "
1849 "sd=%p 0x%x,"
1850 " caf=%016lx, taf=%" PRtype_info
1851 " memflags 0x%x\n",
1852 (void *) page_to_mfn(page),
1853 d, d->domain_id, _d,
1854 nd, _nd,
1855 x,
1856 page->u.inuse.type_info,
1857 memflags);
1858 } else {
1859 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
1860 "Bad page %p: ed=%p(%u) 0x%x, "
1861 "sd=%p(%u) 0x%x,"
1862 " caf=%016lx, taf=%" PRtype_info
1863 " memflags 0x%x\n",
1864 (void *) page_to_mfn(page),
1865 d, d->domain_id, _d,
1866 nd, nd->domain_id, _nd,
1867 x,
1868 page->u.inuse.type_info,
1869 memflags);
1871 spin_unlock(&d->page_alloc_lock);
1872 return -1;
1875 y = cmpxchg((u64*)&page->count_info, x, nx);
1876 } while (unlikely(y != x));
1878 /*
1879 * Unlink from 'd'. At least one reference remains (now anonymous), so
1880 * no one else is spinning to try to delete this page from 'd'.
1881 */
1882 if ( !(memflags & MEMF_no_refcount) )
1883 d->tot_pages--;
1884 list_del(&page->list);
1886 spin_unlock(&d->page_alloc_lock);
1887 perfc_incr(steal_page);
1888 return 0;
1891 void
1892 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1893 unsigned long mfn)
1894 {
1895 BUG_ON(!mfn_valid(mfn));
1896 BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
1897 set_gpfn_from_mfn(mfn, gpfn);
1898 smp_mb();
1899 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
1900 ASSIGN_writable | ASSIGN_pgc_allocated);
1902 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1904 perfc_incr(guest_physmap_add_page);
1905 }
1907 void
1908 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1909 unsigned long mfn)
1910 {
1911 BUG_ON(mfn == 0);//XXX
1912 zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
1913 perfc_incr(guest_physmap_remove_page);
1914 }
1916 static void
1917 domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
1918 volatile pte_t* ptep, pte_t old_pte,
1919 struct page_info* page)
1921 #ifdef CONFIG_XEN_IA64_TLB_TRACK
1922 struct tlb_track_entry* entry;
1923 #endif
1925 if (shadow_mode_enabled(d))
1926 shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
1928 #ifndef CONFIG_XEN_IA64_TLB_TRACK
1929 //XXX sledgehammer: without TLB tracking the whole vTLB is flushed;
1930 // ideally only the affected range would be flushed.
1931 domain_flush_vtlb_all(d);
1932 put_page(page);
1933 #else
1934 switch (tlb_track_search_and_remove(d->arch.tlb_track,
1935 ptep, old_pte, &entry)) {
1936 case TLB_TRACK_NOT_TRACKED:
1937 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_TRACKED\n", __func__);
1938 /* This page was zapped from this domain either by a memory
1939 * decrease, an exchange or dom0vp_zap_physmap (i.e. the page is
1940 * being returned to Xen, e.g. by the balloon driver or for DMA
1941 * page allocation), or because a page mapped from a foreign
1942 * domain is being unmapped.
1943 * In the former case the page is about to be freed, so freeing
1944 * could be deferred and batched. In the latter case the page is
1945 * merely unmapped, so it must be flushed; as an optimisation the
1946 * page could be queued and the vTLB flushed only once, in which
1947 * case the caller would have to call dfree_flush() explicitly.
1948 * (The overall flush policy is summarised in the sketch that
1949 * follows this function.) */
1950 domain_flush_vtlb_all(d);
1951 put_page(page);
1952 break;
1953 case TLB_TRACK_NOT_FOUND:
1954 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_NOT_FOUND\n", __func__);
1955 /* This page was zapped from this domain by a grant
1956 * table page unmap.
1957 * Luckily the domain that mapped this page never
1958 * accessed it, so no vTLB flush is needed.
1959 * Most likely the page was used only for DMA.
1960 */
1961 /* do nothing */
1962 put_page(page);
1963 break;
1964 case TLB_TRACK_FOUND:
1965 // dprintk(XENLOG_WARNING, "%s TLB_TRACK_FOUND\n", __func__);
1966 /* This page was zapped from this domain by a grant
1967 * table page unmap.
1968 * Fortunately the page was accessed via only one virtual
1969 * address, so flushing that single entry is enough.
1970 */
1971 domain_flush_vtlb_track_entry(d, entry);
1972 tlb_track_free_entry(d->arch.tlb_track, entry);
1973 put_page(page);
1974 break;
1975 case TLB_TRACK_MANY:
1976 gdprintk(XENLOG_INFO, "%s TLB_TRACK_MANY\n", __func__);
1977 /* This page was zapped from this domain by a grant
1978 * table page unmap.
1979 * Unfortunately the page was accessed via many virtual
1980 * addresses (or too many times via a single one), so
1981 * tracking of its virtual addresses was abandoned and
1982 * a full vTLB flush is necessary.
1983 */
1984 domain_flush_vtlb_all(d);
1985 put_page(page);
1986 break;
1987 case TLB_TRACK_AGAIN:
1988 gdprintk(XENLOG_ERR, "%s TLB_TRACK_AGAIN\n", __func__);
1989 BUG();
1990 break;
1992 #endif
1993 perfc_incr(domain_page_flush_and_put);
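/*
 * Illustrative sketch (not part of mm.c): the flush policy encoded by the
 * switch in domain_page_flush_and_put() above, written out as a small pure
 * function.  The enum values mirror the TLB_TRACK_* cases; the "action"
 * names are assumptions used only for this example.
 */
#include <stdio.h>

enum track_result { NOT_TRACKED, NOT_FOUND, FOUND, MANY, AGAIN };
enum flush_action { FLUSH_ALL, FLUSH_NONE, FLUSH_ONE_ENTRY, FLUSH_BUG };

static enum flush_action flush_policy(enum track_result r)
{
    switch (r) {
    case NOT_TRACKED: return FLUSH_ALL;       /* no record: be conservative  */
    case NOT_FOUND:   return FLUSH_NONE;      /* mapped but never accessed   */
    case FOUND:       return FLUSH_ONE_ENTRY; /* one known virtual address   */
    case MANY:        return FLUSH_ALL;       /* too many addresses to track */
    case AGAIN:       return FLUSH_BUG;       /* should not happen here      */
    }
    return FLUSH_BUG;
}

int main(void)
{
    printf("%d %d %d\n", flush_policy(NOT_TRACKED), flush_policy(NOT_FOUND),
           flush_policy(FOUND));
    return 0;
}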
1996 int
1997 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1999 volatile pte_t * pte;
2001 pte = lookup_noalloc_domain_pte(d, mpaddr);
2002 if(pte != NULL && !pte_none(*pte))
2003 return 1;
2004 return 0;
2007 /* Flush cache of domain d. */
2008 void domain_cache_flush (struct domain *d, int sync_only)
2010 struct mm_struct *mm = &d->arch.mm;
2011 volatile pgd_t *pgd = mm->pgd;
2012 unsigned long maddr;
2013 int i, j, k, l;
2014 int nbr_page = 0;
2015 void (*flush_func)(unsigned long start, unsigned long end);
2016 extern void flush_dcache_range (unsigned long, unsigned long);
2018 if (sync_only)
2019 flush_func = &flush_icache_range;
2020 else
2021 flush_func = &flush_dcache_range;
2023 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
2024 volatile pud_t *pud;
2025 if (!pgd_present(*pgd)) // acquire semantics
2026 continue;
2027 pud = pud_offset(pgd, 0);
2028 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
2029 volatile pmd_t *pmd;
2030 if (!pud_present(*pud)) // acquire semantics
2031 continue;
2032 pmd = pmd_offset(pud, 0);
2033 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
2034 volatile pte_t *pte;
2035 if (!pmd_present(*pmd)) // acquire semantics
2036 continue;
2037 pte = pte_offset_map(pmd, 0);
2038 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
2039 if (!pte_present(*pte)) // acquire semantics
2040 continue;
2041 /* Convert PTE to maddr. */
2042 maddr = __va_ul (pte_val(*pte)
2043 & _PAGE_PPN_MASK);
2044 (*flush_func)(maddr, maddr + PAGE_SIZE);
2045 nbr_page++;
2050 //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
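/*
 * Illustrative sketch (not part of mm.c): the 4-level walk performed by
 * domain_cache_flush() above, reduced to a toy tree of nullable pointers.
 * Table sizes, the leaf payload and flush_range() are assumptions made so
 * the example stays self-contained; the real code derives the machine
 * address from the PTE's PPN field instead.
 */
#include <stdio.h>

#define ENTRIES 4              /* stand-in for PTRS_PER_PGD/PUD/PMD/PTE */
#define TOY_PAGE_SIZE 0x1000UL

typedef struct { unsigned long maddr; int present; } toy_pte;
typedef struct { toy_pte     e[ENTRIES]; } toy_ptetab;
typedef struct { toy_ptetab *e[ENTRIES]; } toy_pmd;
typedef struct { toy_pmd    *e[ENTRIES]; } toy_pud;
typedef struct { toy_pud    *e[ENTRIES]; } toy_pgd;

static void flush_range(unsigned long start, unsigned long end)
{
    printf("flush [%#lx, %#lx)\n", start, end);
}

static int walk_and_flush(toy_pgd *pgd)
{
    int nbr_page = 0;
    for (int i = 0; i < ENTRIES; i++) {
        toy_pud *pud = pgd->e[i];
        if (!pud) continue;                           /* !pgd_present() */
        for (int j = 0; j < ENTRIES; j++) {
            toy_pmd *pmd = pud->e[j];
            if (!pmd) continue;                       /* !pud_present() */
            for (int k = 0; k < ENTRIES; k++) {
                toy_ptetab *tab = pmd->e[k];
                if (!tab) continue;                   /* !pmd_present() */
                for (int l = 0; l < ENTRIES; l++) {
                    if (!tab->e[l].present) continue; /* !pte_present() */
                    flush_range(tab->e[l].maddr,
                                tab->e[l].maddr + TOY_PAGE_SIZE);
                    nbr_page++;
                }
            }
        }
    }
    return nbr_page;
}

int main(void)
{
    static toy_ptetab tab = { .e = { [1] = { 0x4000, 1 } } };
    static toy_pmd pmd = { .e = { [0] = &tab } };
    static toy_pud pud = { .e = { [2] = &pmd } };
    static toy_pgd pgd = { .e = { [3] = &pud } };
    printf("flushed %d page(s)\n", walk_and_flush(&pgd));
    return 0;
}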
2053 #ifdef VERBOSE
2054 #define MEM_LOG(_f, _a...) \
2055 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
2056 current->domain->domain_id , __LINE__ , ## _a )
2057 #else
2058 #define MEM_LOG(_f, _a...) ((void)0)
2059 #endif
2061 static void free_page_type(struct page_info *page, u32 type)
2065 static int alloc_page_type(struct page_info *page, u32 type)
2067 return 1;
2070 static int opt_p2m_xenheap;
2071 boolean_param("p2m_xenheap", opt_p2m_xenheap);
2073 void *pgtable_quicklist_alloc(void)
2075 void *p;
2077 BUG_ON(dom_p2m == NULL);
2078 if (!opt_p2m_xenheap) {
2079 struct page_info *page = alloc_domheap_page(dom_p2m);
2080 if (page == NULL)
2081 return NULL;
2082 p = page_to_virt(page);
2083 clear_page(p);
2084 return p;
2086 p = alloc_xenheap_pages(0);
2087 if (p) {
2088 clear_page(p);
2089 /*
2090 * This page should be read-only. At the moment the third argument
2091 * has no effect; it should be 1 once read-only sharing is supported.
2092 */
2093 share_xen_page_with_guest(virt_to_page(p), dom_p2m, 0);
2095 return p;
2098 void pgtable_quicklist_free(void *pgtable_entry)
2100 struct page_info* page = virt_to_page(pgtable_entry);
2102 BUG_ON(page_get_owner(page) != dom_p2m);
2103 BUG_ON(page->count_info != (1 | PGC_allocated));
2105 put_page(page);
2106 if (opt_p2m_xenheap)
2107 free_xenheap_page(pgtable_entry);
2110 void put_page_type(struct page_info *page)
2112 u64 nx, x, y = page->u.inuse.type_info;
2114 again:
2115 do {
2116 x = y;
2117 nx = x - 1;
2119 ASSERT((x & PGT_count_mask) != 0);
2121 /*
2122 * The page should always be validated while a reference is held. The
2123 * exception is during domain destruction, when we forcibly invalidate
2124 * page-table pages if we detect a referential loop.
2125 * See domain.c:relinquish_list().
2126 */
2127 ASSERT((x & PGT_validated) || page_get_owner(page)->is_dying);
2129 if ( unlikely((nx & PGT_count_mask) == 0) )
2131 /* Record TLB information for flush later. Races are harmless. */
2132 page->tlbflush_timestamp = tlbflush_current_time();
2134 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2135 likely(nx & PGT_validated) )
2137 /*
2138 * Page-table pages must be unvalidated when count is zero. The
2139 * 'free' is safe because the refcnt is non-zero and validated
2140 * bit is clear => other ops will spin or fail.
2141 */
2142 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
2143 x & ~PGT_validated)) != x) )
2144 goto again;
2145 /* We cleared the 'valid bit' so we do the clean up. */
2146 free_page_type(page, x);
2147 /* Carry on, but with the 'valid bit' now clear. */
2148 x &= ~PGT_validated;
2149 nx &= ~PGT_validated;
2153 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
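/*
 * Illustrative sketch (not part of mm.c): the shape of the put_page_type()
 * loop above, using C11 atomics.  The bit layout (count in the low 16 bits,
 * a VALIDATED flag above it) and free_type() are simplified assumptions for
 * this example; the real type_info word also encodes the page type and
 * further flags.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define COUNT_MASK 0xffffu
#define VALIDATED  (1u << 16)

static _Atomic uint32_t type_info = VALIDATED | 1;

static void free_type(uint32_t old) { printf("tearing down type %#x\n", old); }

static void put_type(void)
{
    uint32_t y = atomic_load(&type_info);
    uint32_t x, nx;

again:
    do {
        x = y;
        nx = x - 1;
        if ((nx & COUNT_MASK) == 0 && (nx & VALIDATED)) {
            /* Last reference: clear the validated bit first, so anyone who
             * takes a new reference sees it clear and waits or revalidates. */
            y = x;
            if (!atomic_compare_exchange_strong(&type_info, &y, x & ~VALIDATED))
                goto again;            /* someone raced with us: start over  */
            free_type(x);
            x &= ~VALIDATED;
            nx &= ~VALIDATED;
            y = x;                     /* expected value for the final CAS   */
        }
    } while (!atomic_compare_exchange_weak_explicit(&type_info, &y, nx,
                                                    memory_order_release,
                                                    memory_order_relaxed));
}

int main(void)
{
    put_type();
    printf("type_info now %#x\n", (unsigned)atomic_load(&type_info));
    return 0;
}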
2157 int get_page_type(struct page_info *page, u32 type)
2159 u64 nx, x, y = page->u.inuse.type_info;
2161 ASSERT(!(type & ~PGT_type_mask));
2163 again:
2164 do {
2165 x = y;
2166 nx = x + 1;
2167 if ( unlikely((nx & PGT_count_mask) == 0) )
2169 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2170 return 0;
2172 else if ( unlikely((x & PGT_count_mask) == 0) )
2174 if ( (x & PGT_type_mask) != type )
2176 /*
2177 * On type change we check to flush stale TLB entries. This
2178 * may be unnecessary (e.g., page was GDT/LDT) but those
2179 * circumstances should be very rare.
2180 */
2181 cpumask_t mask =
2182 page_get_owner(page)->domain_dirty_cpumask;
2183 tlbflush_filter(mask, page->tlbflush_timestamp);
2185 if ( unlikely(!cpus_empty(mask)) )
2187 perfc_incr(need_flush_tlb_flush);
2188 flush_tlb_mask(mask);
2191 /* We lose existing type, back pointer, and validity. */
2192 nx &= ~(PGT_type_mask | PGT_validated);
2193 nx |= type;
2195 /* No special validation needed for writable pages. */
2196 /* Page tables and GDT/LDT need to be scanned for validity. */
2197 if ( type == PGT_writable_page )
2198 nx |= PGT_validated;
2201 else if ( unlikely((x & PGT_type_mask) != type) )
2203 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
2204 (type != PGT_l1_page_table) )
2205 MEM_LOG("Bad type (saw %08lx != exp %08x) "
2206 "for mfn %016lx (pfn %016lx)",
2207 x, type, page_to_mfn(page),
2208 get_gpfn_from_mfn(page_to_mfn(page)));
2209 return 0;
2211 else if ( unlikely(!(x & PGT_validated)) )
2213 /* Someone else is updating validation of this page. Wait... */
2214 while ( (y = page->u.inuse.type_info) == x )
2215 cpu_relax();
2216 goto again;
2219 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
2221 if ( unlikely(!(nx & PGT_validated)) )
2223 /* Try to validate page type; drop the new reference on failure. */
2224 if ( unlikely(!alloc_page_type(page, type)) )
2226 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
2227 ": caf=%08x taf=%" PRtype_info,
2228 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2229 type, page->count_info, page->u.inuse.type_info);
2230 /* No one else can get a reference. We hold the only ref. */
2231 page->u.inuse.type_info = 0;
2232 return 0;
2235 /* No one else is updating simultaneously. */
2236 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2239 return 1;
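/*
 * Illustrative sketch (not part of mm.c): the acquire side that pairs with
 * the sketch after put_page_type() above.  It shows the two behaviours the
 * loop in get_page_type() relies on: bumping the count with acquire
 * semantics, and spinning when another CPU took the first reference but has
 * not finished validation yet.  The bit layout and validate_type() are
 * assumptions for this example only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define COUNT_MASK 0xffffu
#define VALIDATED  (1u << 16)

static _Atomic uint32_t type_info;   /* starts at 0: no references, no type */

static bool validate_type(void) { return true; } /* stand-in for alloc_page_type() */

static bool get_type(void)
{
    uint32_t y = atomic_load(&type_info);
    uint32_t x, nx;

again:
    do {
        x = y;
        nx = x + 1;
        if ((nx & COUNT_MASK) == 0)
            return false;                       /* count overflow */
        if ((x & COUNT_MASK) != 0 && !(x & VALIDATED)) {
            /* Someone else holds the first reference and is still
             * validating: wait for the word to change, then retry. */
            while ((y = atomic_load(&type_info)) == x)
                ;
            goto again;
        }
    } while (!atomic_compare_exchange_weak_explicit(&type_info, &y, nx,
                                                    memory_order_acquire,
                                                    memory_order_relaxed));

    if (!(nx & VALIDATED)) {
        /* We took the first reference, so validation is our job. */
        if (!validate_type()) {
            atomic_store(&type_info, 0);        /* drop our only reference */
            return false;
        }
        atomic_fetch_or(&type_info, VALIDATED);
    }
    return true;
}

int main(void)
{
    printf("first get: %d, info=%#x\n", get_type(),
           (unsigned)atomic_load(&type_info));
    printf("second get: %d, info=%#x\n", get_type(),
           (unsigned)atomic_load(&type_info));
    return 0;
}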
2242 int memory_is_conventional_ram(paddr_t p)
2244 return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
2248 long
2249 arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2251 switch (op) {
2252 case XENMEM_add_to_physmap:
2254 struct xen_add_to_physmap xatp;
2255 unsigned long prev_mfn, mfn = 0, gpfn;
2256 struct domain *d;
2258 if (copy_from_guest(&xatp, arg, 1))
2259 return -EFAULT;
2261 if (xatp.domid == DOMID_SELF) {
2262 d = get_current_domain();
2264 else if (!IS_PRIV(current->domain))
2265 return -EPERM;
2266 else if ((d = get_domain_by_id(xatp.domid)) == NULL)
2267 return -ESRCH;
2269 /* This hypercall is used for VT-i domains only. */
2270 if (!VMX_DOMAIN(d->vcpu[0])) {
2271 put_domain(d);
2272 return -ENOSYS;
2275 switch (xatp.space) {
2276 case XENMAPSPACE_shared_info:
2277 if (xatp.idx == 0)
2278 mfn = virt_to_mfn(d->shared_info);
2279 break;
2280 case XENMAPSPACE_grant_table:
2281 spin_lock(&d->grant_table->lock);
2283 if ((xatp.idx >= nr_grant_frames(d->grant_table)) &&
2284 (xatp.idx < max_nr_grant_frames))
2285 gnttab_grow_table(d, xatp.idx + 1);
2287 if (xatp.idx < nr_grant_frames(d->grant_table))
2288 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
2290 spin_unlock(&d->grant_table->lock);
2291 break;
2292 default:
2293 break;
2296 if (mfn == 0) {
2297 put_domain(d);
2298 return -EINVAL;
2301 LOCK_BIGLOCK(d);
2303 /* Check remapping necessity */
2304 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2305 if (mfn == prev_mfn)
2306 goto out;
2308 /* Remove previously mapped page if it was present. */
2309 if (prev_mfn && mfn_valid(prev_mfn)) {
2310 if (is_xen_heap_frame(mfn_to_page(prev_mfn)))
2311 /* Xen heap frames are simply unhooked from this phys slot. */
2312 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2313 else
2314 /* Normal domain memory is freed, to avoid leaking memory. */
2315 guest_remove_page(d, xatp.gpfn);
2318 /* Unmap from old location, if any. */
2319 gpfn = get_gpfn_from_mfn(mfn);
2320 if (gpfn != INVALID_M2P_ENTRY)
2321 guest_physmap_remove_page(d, gpfn, mfn);
2323 /* Map at new location. */
2324 guest_physmap_add_page(d, xatp.gpfn, mfn);
2326 out:
2327 UNLOCK_BIGLOCK(d);
2329 put_domain(d);
2331 break;
2334 case XENMEM_machine_memory_map:
2336 struct xen_memory_map memmap;
2337 struct xen_ia64_memmap_info memmap_info;
2338 XEN_GUEST_HANDLE(char) buffer;
2340 if (!IS_PRIV(current->domain))
2341 return -EINVAL;
2342 if (copy_from_guest(&memmap, arg, 1))
2343 return -EFAULT;
2344 if (memmap.nr_entries <
2345 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size)
2346 return -EINVAL;
2348 memmap.nr_entries =
2349 sizeof(memmap_info) + ia64_boot_param->efi_memmap_size;
2350 memset(&memmap_info, 0, sizeof(memmap_info));
2351 memmap_info.efi_memmap_size = ia64_boot_param->efi_memmap_size;
2352 memmap_info.efi_memdesc_size = ia64_boot_param->efi_memdesc_size;
2353 memmap_info.efi_memdesc_version = ia64_boot_param->efi_memdesc_version;
2355 buffer = guest_handle_cast(memmap.buffer, char);
2356 if (copy_to_guest(buffer, (char*)&memmap_info, sizeof(memmap_info)) ||
2357 copy_to_guest_offset(buffer, sizeof(memmap_info),
2358 (char*)__va(ia64_boot_param->efi_memmap),
2359 ia64_boot_param->efi_memmap_size) ||
2360 copy_to_guest(arg, &memmap, 1))
2361 return -EFAULT;
2362 return 0;
2365 default:
2366 return -ENOSYS;
2369 return 0;
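/*
 * Illustrative sketch (not part of mm.c): the remapping order used by the
 * XENMEM_add_to_physmap case above - evict whatever currently occupies the
 * target gpfn, unhook the frame from its old gpfn, then install it at the
 * new gpfn.  The toy p2m[]/m2p[] arrays and helper names are assumptions
 * for this example; ownership, locking and the Xen-heap special case are
 * left out.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_FRAMES 16
#define INVALID ((uint64_t)-1)

static uint64_t p2m[NR_FRAMES]; /* gpfn -> mfn */
static uint64_t m2p[NR_FRAMES]; /* mfn -> gpfn */

static void map(uint64_t gpfn, uint64_t mfn)   { p2m[gpfn] = mfn; m2p[mfn] = gpfn; }
static void unmap(uint64_t gpfn, uint64_t mfn) { p2m[gpfn] = INVALID; m2p[mfn] = INVALID; }

static void add_to_physmap(uint64_t gpfn, uint64_t mfn)
{
    uint64_t prev_mfn, old_gpfn;

    if (p2m[gpfn] == mfn)
        return;                                /* already mapped where asked */

    prev_mfn = p2m[gpfn];                      /* evict the current occupant */
    if (prev_mfn != INVALID)
        unmap(gpfn, prev_mfn);

    old_gpfn = m2p[mfn];                       /* unhook from the old slot   */
    if (old_gpfn != INVALID)
        unmap(old_gpfn, mfn);

    map(gpfn, mfn);                            /* install at the new slot    */
}

int main(void)
{
    for (int i = 0; i < NR_FRAMES; i++) { p2m[i] = INVALID; m2p[i] = INVALID; }
    map(2, 5);                /* mfn 5 initially lives at gpfn 2 */
    add_to_physmap(7, 5);
    printf("p2m[7]=%lld p2m[2]=%lld m2p[5]=%lld\n",
           (long long)p2m[7], (long long)p2m[2], (long long)m2p[5]);
    return 0;
}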
2372 /*
2373 * Local variables:
2374 * mode: C
2375 * c-set-style: "BSD"
2376 * c-basic-offset: 4
2377 * tab-width: 4
2378 * indent-tabs-mode: nil
2379 * End:
2380 */