ia64/xen-unstable

view xen/arch/ia64/xen/mm.c @ 10447:056622b0f8f7

[IA64] memory exchange: update comment

update comment. s/steal_page_for_grant_transfer/steal_page/g

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author awilliam@xenbuild.aw
date Fri Jun 23 15:13:54 2006 -0600 (2006-06-23)
parents be0a536b70cc
children 39562deee9b8
line source
1 /*
2 * Copyright (C) 2005 Intel Co
3 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
4 *
5 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
6 *
7 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
8 * VA Linux Systems Japan K.K.
9 * dom0 vp model support
10 */
12 /*
13 * NOTES on SMP
14 *
15 * * shared structures
16 * There are some structures which are accessed by CPUs concurrently.
17 * Here is the list of shared structures and the operations
18 * which read/write them.
19 *
20 * - struct page_info
21 * This is a Xen global resource. This structure can be
22 * accessed by any CPU.
23 *
24 * operations on this structure:
25 * - get_page() and its variant
26 * - put_page() and its variant
27 *
28 * - vTLB
29 * vcpu->arch.{d, i}tlb: Software tlb cache. These are per VCPU data.
30 * DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
31 *
32 * domain_flush_vtlb_range() and domain_flush_vtlb_all()
33 * write vcpu->arch.{d, i}tlb and the VHPT table of a vcpu which isn't current.
34 * So there are potential races when reading/writing the VHPT and vcpu->arch.{d, i}tlb.
35 * Please note that the VHPT is read by the hardware page table walker.
36 *
37 * operations on this structure:
38 * - global tlb purge
39 * vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush()
40 * I.e. callers of domain_flush_vtlb_range() and domain_flush_vtlb_all()
41 * These functions invalidate VHPT entry and vcpu->arch.{i, d}tlb
42 *
43 * - tlb insert and fc
44 * vcpu_itc_i()
45 * vcpu_itc_d()
46 * ia64_do_page_fault()
47 * vcpu_fc()
48 * These functions set VHPT entry and vcpu->arch.{i, d}tlb.
49 * Actually vcpu_itc_no_srlz() does.
50 *
51 * - the P2M table
52 * domain->mm and pgd, pud, pmd, pte table page.
53 * This structure is used to convert a domain pseudo physical address
54 * to a machine address. This is a per-domain resource.
55 *
56 * operations on this structure:
57 * - populate the P2M table tree
58 * lookup_alloc_domain_pte() and its variants.
59 * - set p2m entry
60 * assign_new_domain_page() and its variants.
61 * assign_domain_page() and its variants.
62 * - xchg p2m entry
63 * assign_domain_page_replace()
64 * - cmpxchg p2m entry
65 * assign_domain_page_cmpxchg_rel()
66 * destroy_grant_host_mapping()
67 * steal_page()
68 * zap_domain_page_one()
69 * - read p2m entry
70 * lookup_alloc_domain_pte() and its variants.
71 *
72 * - the M2P table
73 * mpt_table (or machine_to_phys_mapping)
74 * This is a table which converts from machine address to pseudo physical
75 * address. This is a global structure.
76 *
77 * operations on this structure:
78 * - set m2p entry
79 * set_gpfn_from_mfn()
80 * - zap m2p entry
81 * set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY)
82 * - get m2p entry
83 * get_gpfn_from_mfn()
84 *
85 *
86 * * avoiding races
87 * The resources which are shared by CPUs must be accessed carefully
88 * to avoid races.
89 * IA64 has weak memory ordering, so attention must be paid
90 * when accessing shared structures. [SDM vol2 PartII chap. 2]
91 *
92 * - struct page_info memory ordering
93 * get_page() has acquire semantics.
94 * put_page() has release semantics.
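 *
 *   For illustration only (not part of this file): the intended usage
 *   pattern around a domain page access:
 *
 *       if (get_page(page, d)) {      // acquire: the access below cannot
 *           // ... read/write the page ... be reordered before this
 *           put_page(page);           // release: the access completes
 *       }                             // before the refcount drop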
95 *
96 * - populating the p2m table
97 * pgd, pud, pmd are append only.
98 *
99 * - races when updating the P2M tables and the M2P table
100 * A P2M entry is shared by more than one vcpu,
101 * so it must be accessed with atomic operations.
102 * I.e. xchg or cmpxchg must be used to update the p2m entry.
103 * NOTE: When creating/destroying a domain, we don't need to take care of
104 * this race.
105 *
106 * The M2P table is the inverse of the P2M table.
107 * I.e. P2M(M2P(m)) = m and M2P(P2M(p)) = p
108 * The M2P table and P2M table must be updated consistently.
109 * Here is the update sequence
110 *
111 * xchg or cmpxchg case
112 * - set_gpfn_from_mfn(new_mfn, gpfn)
113 * - memory barrier
114 * - atomic update of the p2m entry (xchg or cmpxchg the p2m entry)
115 * get old_mfn entry as a result.
116 * - memory barrier
117 * - set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY)
118 *
119 * Here the memory barrier can be achieved by release semantics (sketch below).
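 *
 *   For illustration only (not part of this file): a minimal sketch of the
 *   sequence above, using names from this file, roughly as done by
 *   guest_physmap_add_page() together with assign_domain_page_replace():
 *
 *       set_gpfn_from_mfn(new_mfn, gpfn);              // M2P: new entry
 *       smp_mb();                                      // memory barrier
 *       old_pte = ptep_xchg(mm, mpaddr, pte, new_pte); // P2M: atomic swap
 *       old_mfn = pte_pfn(old_pte);
 *       set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY); // M2P: zap old entry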
120 *
121 * - races between global tlb purge and tlb insert
122 * This is a race between reading/writing vcpu->arch.{d, i}tlb or a VHPT entry.
123 * When a vcpu is about to insert a tlb entry, another vcpu may purge the tlb
124 * cache globally. Neither tlb insert (vcpu_itc_no_srlz()) nor global tlb
125 * purge (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can update
126 * vcpu->arch.{d, i}tlb, the VHPT and the mTLB atomically, so there is a race.
127 *
128 * The race is resolved by checking the vcpu->arch.{d, i}tlb.p bit:
129 * after inserting a tlb entry, check the p bit and retry the insert if it
130 * was cleared. Thus when a global tlb purge and a tlb insert are issued
131 * simultaneously, the global tlb purge takes effect after the insert (sketch below).
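 *
 *   For illustration only (not part of this file): the shape of the
 *   insert-side check described above (argument lists are elided and the
 *   exact placement of the check in the real code is not reproduced here):
 *
 *       again:
 *           vcpu_itc_no_srlz(vcpu, ...);   // fill VHPT and vcpu->arch.dtlb
 *           if (!vcpu->arch.dtlb.p)        // purged concurrently?
 *               goto again;                // the entry is stale: insert again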
132 *
133 * - races between p2m entry update and tlb insert
134 * This is a race between reading/writing the p2m entry.
135 * reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
136 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
137 * steal_page(), zap_domain_page_one()
138 *
139 * For example, vcpu_itc_i() is about to insert a tlb entry by calling
140 * vcpu_itc_no_srlz() after reading the p2m entry.
141 * At the same time, the p2m entry may be replaced by xchg or cmpxchg and
142 * the tlb cache of the page flushed.
143 * There is a possibility that the p2m entry no longer points to the
144 * old page, but the tlb cache still points to the old page.
145 * This can be detected, similarly to a sequence lock, using the p2m entry
146 * itself: the reader remembers the value of the p2m entry it read and
147 * inserts the tlb entry, then reads the p2m entry again. If the new value
148 * differs from the value it used, it retries (sketch below).
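 *
 *   For illustration only (not part of this file): the reader-side pattern,
 *   where entry is a struct p2m_entry and p2m_entry_unchanged() is a
 *   hypothetical name for the re-read-and-compare step:
 *
 *       again:
 *           pteval = lookup_domain_mpa(d, mpaddr, &entry); // remember it
 *           // ... insert the tlb entry derived from pteval ...
 *           if (!p2m_entry_unchanged(&entry))              // value changed?
 *               goto again;                                // then retry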
149 *
150 * - races between referencing page and p2m entry update
151 * This is a race between reading/writing the p2m entry.
152 * reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
153 * efi_emulate_get_time()
154 * writer: assign_domain_page_cmpxchg_rel(), destroy_grant_host_mapping(),
155 * steal_page(), zap_domain_page_one()
156 *
157 * A page which is assigned to a domain can be de-assigned by another vcpu.
158 * So before reading/writing a domain page, the page's reference count
159 * must be incremented.
160 * vcpu_get_domain_bundle(), vmx_get_domain_bundle() and
161 * efi_emulate_get_time() do this.
162 *
163 */
165 #include <xen/config.h>
166 #include <xen/sched.h>
167 #include <xen/domain.h>
168 #include <asm/xentypes.h>
169 #include <asm/mm.h>
170 #include <asm/pgalloc.h>
171 #include <asm/vhpt.h>
172 #include <asm/vcpu.h>
173 #include <linux/efi.h>
175 #ifndef CONFIG_XEN_IA64_DOM0_VP
176 #define CONFIG_DOMAIN0_CONTIGUOUS
177 #else
178 static void domain_page_flush(struct domain* d, unsigned long mpaddr,
179 unsigned long old_mfn, unsigned long new_mfn);
180 #endif
182 static struct domain *dom_xen, *dom_io;
184 // the following is stolen from arch_init_memory() @ xen/arch/x86/mm.c
185 void
186 alloc_dom_xen_and_dom_io(void)
187 {
188 /*
189 * Initialise our DOMID_XEN domain.
190 * Any Xen-heap pages that we will allow to be mapped will have
191 * their domain field set to dom_xen.
192 */
193 dom_xen = alloc_domain(DOMID_XEN);
194 BUG_ON(dom_xen == NULL);
196 /*
197 * Initialise our DOMID_IO domain.
198 * This domain owns I/O pages that are within the range of the page_info
199 * array. Mappings occur at the priv of the caller.
200 */
201 dom_io = alloc_domain(DOMID_IO);
202 BUG_ON(dom_io == NULL);
203 }
205 // heavily depends on the struct page_info layout.
206 // if (page_get_owner(page) == d &&
207 // test_and_clear_bit(_PGC_allocated, &page->count_info)) {
208 // put_page(page);
209 // }
210 static void
211 try_to_clear_PGC_allocate(struct domain* d, struct page_info* page)
212 {
213 u32 _d, _nd;
214 u64 x, nx, y;
216 _d = pickle_domptr(d);
217 y = *((u64*)&page->count_info);
218 do {
219 x = y;
220 _nd = x >> 32;
221 nx = x - 1;
222 __clear_bit(_PGC_allocated, &nx);
224 if (unlikely(!(x & PGC_allocated)) || unlikely(_nd != _d)) {
225 struct domain* nd = unpickle_domptr(_nd);
226 if (nd == NULL) {
227 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
228 "sd=%p 0x%x,"
229 " caf=%016lx, taf=%" PRtype_info "\n",
230 (void *) page_to_mfn(page),
231 d, d->domain_id, _d,
232 nd, _nd,
233 x,
234 page->u.inuse.type_info);
235 }
236 break;
237 }
239 BUG_ON((nx & PGC_count_mask) < 1);
240 y = cmpxchg((u64*)&page->count_info, x, nx);
241 } while (unlikely(y != x));
242 }
244 static void
245 relinquish_pte(struct domain* d, pte_t* pte)
246 {
247 unsigned long mfn = pte_pfn(*pte);
248 struct page_info* page;
250 // vmx domains use bits[58:56] to distinguish an io region from memory.
251 // see vmx_build_physmap_table() in vmx_init.c
252 if (!pte_mem(*pte))
253 return;
255 // domain might map IO space or acpi table pages. check it.
256 if (!mfn_valid(mfn))
257 return;
258 page = mfn_to_page(mfn);
259 // struct page_info corresponding to mfn may exist or not depending
260 // on CONFIG_VIRTUAL_FRAME_TABLE.
261 // This check is too simplistic.
262 // The right way is to check whether this page belongs to an io area or the acpi tables.
263 if (page_get_owner(page) == NULL) {
264 BUG_ON(page->count_info != 0);
265 return;
266 }
268 #ifdef CONFIG_XEN_IA64_DOM0_VP
269 if (page_get_owner(page) == d) {
270 BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
271 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
272 }
273 #endif
274 try_to_clear_PGC_allocate(d, page);
275 put_page(page);
276 }
278 static void
279 relinquish_pmd(struct domain* d, pmd_t* pmd, unsigned long offset)
280 {
281 unsigned long i;
282 pte_t* pte = pte_offset_map(pmd, offset);
284 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
285 if (!pte_present(*pte))
286 continue;
288 relinquish_pte(d, pte);
289 }
290 pte_free_kernel(pte_offset_map(pmd, offset));
291 }
293 static void
294 relinquish_pud(struct domain* d, pud_t *pud, unsigned long offset)
295 {
296 unsigned long i;
297 pmd_t *pmd = pmd_offset(pud, offset);
299 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
300 if (!pmd_present(*pmd))
301 continue;
303 relinquish_pmd(d, pmd, offset + (i << PMD_SHIFT));
304 }
305 pmd_free(pmd_offset(pud, offset));
306 }
308 static void
309 relinquish_pgd(struct domain* d, pgd_t *pgd, unsigned long offset)
310 {
311 unsigned long i;
312 pud_t *pud = pud_offset(pgd, offset);
314 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
315 if (!pud_present(*pud))
316 continue;
318 relinquish_pud(d, pud, offset + (i << PUD_SHIFT));
319 }
320 pud_free(pud_offset(pgd, offset));
321 }
323 void
324 relinquish_mm(struct domain* d)
325 {
326 struct mm_struct* mm = &d->arch.mm;
327 unsigned long i;
328 pgd_t* pgd;
330 if (mm->pgd == NULL)
331 return;
333 pgd = pgd_offset(mm, 0);
334 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
335 if (!pgd_present(*pgd))
336 continue;
338 relinquish_pgd(d, pgd, i << PGDIR_SHIFT);
339 }
340 pgd_free(mm->pgd);
341 mm->pgd = NULL;
342 }
344 // stolen from share_xen_page_with_guest() in xen/arch/x86/mm.c
345 void
346 share_xen_page_with_guest(struct page_info *page,
347 struct domain *d, int readonly)
348 {
349 if ( page_get_owner(page) == d )
350 return;
352 #if 1
353 if (readonly) {
354 printk("%s:%d readonly is not supported yet\n", __func__, __LINE__);
355 }
356 #endif
358 // alloc_xenheap_pages() doesn't initialize page owner.
359 //BUG_ON(page_get_owner(page) != NULL);
361 spin_lock(&d->page_alloc_lock);
363 #ifndef __ia64__
364 /* The incremented type count pins as writable or read-only. */
365 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
366 page->u.inuse.type_info |= PGT_validated | 1;
367 #endif
369 page_set_owner(page, d);
370 wmb(); /* install valid domain ptr before updating refcnt. */
371 ASSERT(page->count_info == 0);
372 page->count_info |= PGC_allocated | 1;
374 if ( unlikely(d->xenheap_pages++ == 0) )
375 get_knownalive_domain(d);
376 list_add_tail(&page->list, &d->xenpage_list);
378 // grant_table_destroy() releases these pages,
379 // but it doesn't clear their m2p entries, so stale entries
380 // might remain. Such a stale entry is cleared here.
381 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
383 spin_unlock(&d->page_alloc_lock);
384 }
386 void
387 share_xen_page_with_privileged_guests(struct page_info *page, int readonly)
388 {
389 share_xen_page_with_guest(page, dom_xen, readonly);
390 }
392 unsigned long
393 gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
394 {
395 unsigned long pte;
397 #ifndef CONFIG_XEN_IA64_DOM0_VP
398 if (d == dom0)
399 return(gpfn);
400 #endif
401 pte = lookup_domain_mpa(d,gpfn << PAGE_SHIFT, NULL);
402 if (!pte) {
403 panic("gmfn_to_mfn_foreign: bad gpfn. spinning...\n");
404 }
405 return ((pte & _PFN_MASK) >> PAGE_SHIFT);
406 }
408 // given a domain virtual address, pte and page size, extract the metaphysical
409 // address, convert the pte to a physical address for the (possibly different)
410 // Xen PAGE_SIZE and return the modified pte. (NOTE: TLB insert should use
411 // PAGE_SIZE!)
412 u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__, u64* logps,
413 struct p2m_entry* entry)
414 {
415 struct domain *d = current->domain;
416 ia64_itir_t itir = {.itir = itir__};
417 u64 mask, mpaddr, pteval2;
418 u64 arflags;
419 u64 arflags2;
421 pteval &= ((1UL << 53) - 1);// ignore [63:53] bits
423 // FIXME address had better be pre-validated on insert
424 mask = ~itir_mask(itir.itir);
425 mpaddr = (((pteval & ~_PAGE_ED) & _PAGE_PPN_MASK) & ~mask) |
426 (address & mask);
427 #ifdef CONFIG_XEN_IA64_DOM0_VP
428 if (itir.ps > PAGE_SHIFT) {
429 itir.ps = PAGE_SHIFT;
430 }
431 #endif
432 *logps = itir.ps;
433 #ifndef CONFIG_XEN_IA64_DOM0_VP
434 if (d == dom0) {
435 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
436 /*
437 printk("translate_domain_pte: out-of-bounds dom0 mpaddr 0x%lx! itc=%lx...\n",
438 mpaddr, ia64_get_itc());
439 */
440 }
441 }
442 else if ((mpaddr >> PAGE_SHIFT) > d->max_pages) {
443 /* Address beyond the limit. However the grant table is
444 also beyond the limit. Display a message if not in the
445 grant table. */
446 if (mpaddr >= IA64_GRANT_TABLE_PADDR
447 && mpaddr < (IA64_GRANT_TABLE_PADDR
448 + (ORDER_GRANT_FRAMES << PAGE_SHIFT)))
449 printf("translate_domain_pte: bad mpa=0x%lx (> 0x%lx),"
450 "vadr=0x%lx,pteval=0x%lx,itir=0x%lx\n",
451 mpaddr, (unsigned long)d->max_pages<<PAGE_SHIFT,
452 address, pteval, itir.itir);
453 }
454 #endif
455 pteval2 = lookup_domain_mpa(d, mpaddr, entry);
456 arflags = pteval & _PAGE_AR_MASK;
457 arflags2 = pteval2 & _PAGE_AR_MASK;
458 if (arflags != _PAGE_AR_R && arflags2 == _PAGE_AR_R) {
459 #if 0
460 DPRINTK("%s:%d "
461 "pteval 0x%lx arflag 0x%lx address 0x%lx itir 0x%lx "
462 "pteval2 0x%lx arflags2 0x%lx mpaddr 0x%lx\n",
463 __func__, __LINE__,
464 pteval, arflags, address, itir__,
465 pteval2, arflags2, mpaddr);
466 #endif
467 pteval = (pteval & ~_PAGE_AR_MASK) | _PAGE_AR_R;
468 }
470 pteval2 &= _PAGE_PPN_MASK; // ignore non-addr bits
471 pteval2 |= (pteval & _PAGE_ED);
472 pteval2 |= _PAGE_PL_2; // force PL0->2 (PL3 is unaffected)
473 pteval2 = (pteval & ~_PAGE_PPN_MASK) | pteval2;
474 /*
475 * Don't let non-dom0 domains map uncached addresses. This can
476 * happen when domU tries to touch i/o port space. Also prevents
477 * possible address aliasing issues.
478 */
479 if (d != dom0)
480 pteval2 &= ~_PAGE_MA_MASK;
482 return pteval2;
483 }
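/*
 * For illustration only (not part of this file): a worked example of the
 * mpaddr computation above, assuming itir_mask() yields the frame-bit mask
 * for the given page size.  With a 64KB domain mapping (itir.ps == 16),
 * mask == ~itir_mask(itir.itir) == 0xffff.  If the pte's ppn field gives
 * frame base 0xABC0000 and the faulting address ends in 0x4567, then
 *     mpaddr = (0xABC0000 & ~0xffff) | (0x4567 & 0xffff) = 0xABC4567
 * i.e. the page-frame bits come from the pte and the in-page offset comes
 * from the faulting address.  (The numbers are made up for the example.)
 */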
485 // given a current domain metaphysical address, return the physical address
486 unsigned long translate_domain_mpaddr(unsigned long mpaddr,
487 struct p2m_entry* entry)
488 {
489 unsigned long pteval;
491 #ifndef CONFIG_XEN_IA64_DOM0_VP
492 if (current->domain == dom0) {
493 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
494 printk("translate_domain_mpaddr: out-of-bounds dom0 mpaddr 0x%lx! continuing...\n",
495 mpaddr);
496 }
497 }
498 #endif
499 pteval = lookup_domain_mpa(current->domain, mpaddr, entry);
500 return ((pteval & _PAGE_PPN_MASK) | (mpaddr & ~PAGE_MASK));
501 }
503 //XXX Should !xxx_present() be used instead of !xxx_none()?
504 // __assign_new_domain_page(), assign_new_domain_page() and
505 // assign_new_domain0_page() are used only during domain creation.
506 // Their accesses aren't racy, so the returned pte_t doesn't need
507 // a volatile qualifier.
508 static pte_t*
509 __lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
510 {
511 struct mm_struct *mm = &d->arch.mm;
512 pgd_t *pgd;
513 pud_t *pud;
514 pmd_t *pmd;
516 BUG_ON(mm->pgd == NULL);
517 pgd = pgd_offset(mm, mpaddr);
518 if (pgd_none(*pgd)) {
519 pgd_populate(mm, pgd, pud_alloc_one(mm,mpaddr));
520 }
522 pud = pud_offset(pgd, mpaddr);
523 if (pud_none(*pud)) {
524 pud_populate(mm, pud, pmd_alloc_one(mm,mpaddr));
525 }
527 pmd = pmd_offset(pud, mpaddr);
528 if (pmd_none(*pmd)) {
529 pmd_populate_kernel(mm, pmd, pte_alloc_one_kernel(mm, mpaddr));
530 }
532 return pte_offset_map(pmd, mpaddr);
533 }
535 //XXX Should !xxx_present() be used instead of !xxx_none()?
536 // pud, pmd and pte pages are zero-cleared when they are allocated.
537 // Their contents must be visible before population, so the
538 // cmpxchg must have release semantics.
539 static volatile pte_t*
540 lookup_alloc_domain_pte(struct domain* d, unsigned long mpaddr)
541 {
542 struct mm_struct *mm = &d->arch.mm;
543 pgd_t *pgd;
544 pud_t *pud;
545 pmd_t *pmd;
547 BUG_ON(mm->pgd == NULL);
549 pgd = pgd_offset(mm, mpaddr);
550 again_pgd:
551 if (unlikely(pgd_none(*pgd))) {
552 pud_t *old_pud = NULL;
553 pud = pud_alloc_one(mm, mpaddr);
554 if (unlikely(!pgd_cmpxchg_rel(mm, pgd, old_pud, pud))) {
555 pud_free(pud);
556 goto again_pgd;
557 }
558 }
560 pud = pud_offset(pgd, mpaddr);
561 again_pud:
562 if (unlikely(pud_none(*pud))) {
563 pmd_t* old_pmd = NULL;
564 pmd = pmd_alloc_one(mm, mpaddr);
565 if (unlikely(!pud_cmpxchg_rel(mm, pud, old_pmd, pmd))) {
566 pmd_free(pmd);
567 goto again_pud;
568 }
569 }
571 pmd = pmd_offset(pud, mpaddr);
572 again_pmd:
573 if (unlikely(pmd_none(*pmd))) {
574 pte_t* old_pte = NULL;
575 pte_t* pte = pte_alloc_one_kernel(mm, mpaddr);
576 if (unlikely(!pmd_cmpxchg_kernel_rel(mm, pmd, old_pte, pte))) {
577 pte_free_kernel(pte);
578 goto again_pmd;
579 }
580 }
582 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
583 }
585 //XXX Should xxx_none() be used instead of !xxx_present()?
586 static volatile pte_t*
587 lookup_noalloc_domain_pte(struct domain* d, unsigned long mpaddr)
588 {
589 struct mm_struct *mm = &d->arch.mm;
590 pgd_t *pgd;
591 pud_t *pud;
592 pmd_t *pmd;
594 BUG_ON(mm->pgd == NULL);
595 pgd = pgd_offset(mm, mpaddr);
596 if (unlikely(!pgd_present(*pgd)))
597 return NULL;
599 pud = pud_offset(pgd, mpaddr);
600 if (unlikely(!pud_present(*pud)))
601 return NULL;
603 pmd = pmd_offset(pud, mpaddr);
604 if (unlikely(!pmd_present(*pmd)))
605 return NULL;
607 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
608 }
610 #ifdef CONFIG_XEN_IA64_DOM0_VP
611 static volatile pte_t*
612 lookup_noalloc_domain_pte_none(struct domain* d, unsigned long mpaddr)
613 {
614 struct mm_struct *mm = &d->arch.mm;
615 pgd_t *pgd;
616 pud_t *pud;
617 pmd_t *pmd;
619 BUG_ON(mm->pgd == NULL);
620 pgd = pgd_offset(mm, mpaddr);
621 if (unlikely(pgd_none(*pgd)))
622 return NULL;
624 pud = pud_offset(pgd, mpaddr);
625 if (unlikely(pud_none(*pud)))
626 return NULL;
628 pmd = pmd_offset(pud, mpaddr);
629 if (unlikely(pmd_none(*pmd)))
630 return NULL;
632 return (volatile pte_t*)pte_offset_map(pmd, mpaddr);
633 }
635 unsigned long
636 ____lookup_domain_mpa(struct domain *d, unsigned long mpaddr)
637 {
638 volatile pte_t *pte;
640 pte = lookup_noalloc_domain_pte(d, mpaddr);
641 if (pte == NULL)
642 return INVALID_MFN;
644 if (pte_present(*pte))
645 return (pte->pte & _PFN_MASK);
646 else if (VMX_DOMAIN(d->vcpu[0]))
647 return GPFN_INV_MASK;
648 return INVALID_MFN;
649 }
650 #endif
652 unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr,
653 struct p2m_entry* entry)
654 {
655 volatile pte_t *pte;
657 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
658 if (d == dom0) {
659 pte_t pteval;
660 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
661 //printk("lookup_domain_mpa: bad dom0 mpaddr 0x%lx!\n",mpaddr);
662 //printk("lookup_domain_mpa: start=0x%lx,end=0x%lx!\n",dom0_start,dom0_start+dom0_size);
663 }
664 pteval = pfn_pte(mpaddr >> PAGE_SHIFT,
665 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX));
666 return pte_val(pteval);
667 }
668 #endif
669 pte = lookup_noalloc_domain_pte(d, mpaddr);
670 if (pte != NULL) {
671 pte_t tmp_pte = *pte;// pte is volatile. copy the value.
672 if (pte_present(tmp_pte)) {
673 //printk("lookup_domain_page: found mapping for %lx, pte=%lx\n",mpaddr,pte_val(*pte));
674 if (entry != NULL)
675 p2m_entry_set(entry, pte, tmp_pte);
676 return pte_val(tmp_pte);
677 } else if (VMX_DOMAIN(d->vcpu[0]))
678 return GPFN_INV_MASK;
679 }
681 printk("%s: d 0x%p id %d current 0x%p id %d\n",
682 __func__, d, d->domain_id, current, current->vcpu_id);
683 if ((mpaddr >> PAGE_SHIFT) < d->max_pages)
684 printk("%s: non-allocated mpa 0x%lx (< 0x%lx)\n", __func__,
685 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
686 else
687 printk("%s: bad mpa 0x%lx (=> 0x%lx)\n", __func__,
688 mpaddr, (unsigned long)d->max_pages << PAGE_SHIFT);
690 if (entry != NULL)
691 p2m_entry_set(entry, NULL, __pte(0));
692 //XXX This is a workaround until emulation of memory accesses to a region
693 // where no memory or device is attached is implemented.
694 return pte_val(pfn_pte(0, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
695 }
697 // FIXME: ONLY USE FOR DOMAIN PAGE_SIZE == PAGE_SIZE
698 #if 1
699 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
700 {
701 unsigned long pte = lookup_domain_mpa(d, mpaddr, NULL);
702 unsigned long imva;
704 pte &= _PAGE_PPN_MASK;
705 imva = (unsigned long) __va(pte);
706 imva |= mpaddr & ~PAGE_MASK;
707 return (void*)imva;
708 }
709 #else
710 void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr)
711 {
712 unsigned long imva = __gpa_to_mpa(d, mpaddr);
714 return (void *)__va(imva);
715 }
716 #endif
718 /* Allocate a new page for domain and map it to the specified metaphysical
719 address. */
720 static struct page_info *
721 __assign_new_domain_page(struct domain *d, unsigned long mpaddr, pte_t* pte)
722 {
723 struct page_info *p = NULL;
724 unsigned long maddr;
725 int ret;
727 BUG_ON(!pte_none(*pte));
729 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
730 if (d == dom0) {
731 #if 0
732 if (mpaddr < dom0_start || mpaddr >= dom0_start + dom0_size) {
733 /* FIXME: is it true ?
734 dom0 memory is not contiguous! */
735 panic("assign_new_domain_page: bad domain0 "
736 "mpaddr=%lx, start=%lx, end=%lx!\n",
737 mpaddr, dom0_start, dom0_start+dom0_size);
738 }
739 #endif
740 p = mfn_to_page((mpaddr >> PAGE_SHIFT));
741 return p;
742 }
743 #endif
745 p = alloc_domheap_page(d);
746 if (unlikely(!p)) {
747 printf("assign_new_domain_page: Can't alloc!!!! Aaaargh!\n");
748 return(p);
749 }
751 // zero out pages for security reasons
752 clear_page(page_to_virt(p));
753 maddr = page_to_maddr (p);
754 if (unlikely(maddr > __get_cpu_var(vhpt_paddr)
755 && maddr < __get_cpu_var(vhpt_pend))) {
756 /* FIXME: how can this happen ?
757 vhpt is allocated by alloc_domheap_page. */
758 printf("assign_new_domain_page: reassigned vhpt page %lx!!\n",
759 maddr);
760 }
762 ret = get_page(p, d);
763 BUG_ON(ret == 0);
764 set_gpfn_from_mfn(page_to_mfn(p), mpaddr >> PAGE_SHIFT);
765 // clear_page() and set_gpfn_from_mfn() become visible before set_pte_rel()
766 // because set_pte_rel() has release semantics
767 set_pte_rel(pte,
768 pfn_pte(maddr >> PAGE_SHIFT,
769 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | _PAGE_AR_RWX)));
771 smp_mb();
772 return p;
773 }
775 struct page_info *
776 assign_new_domain_page(struct domain *d, unsigned long mpaddr)
777 {
778 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
779 pte_t dummy_pte = __pte(0);
780 return __assign_new_domain_page(d, mpaddr, &dummy_pte);
781 #else
782 struct page_info *p = NULL;
783 pte_t *pte;
785 pte = __lookup_alloc_domain_pte(d, mpaddr);
786 if (pte_none(*pte))
787 p = __assign_new_domain_page(d, mpaddr, pte);
789 return p;
790 #endif
791 }
793 void
794 assign_new_domain0_page(struct domain *d, unsigned long mpaddr)
795 {
796 #ifndef CONFIG_DOMAIN0_CONTIGUOUS
797 pte_t *pte;
799 BUG_ON(d != dom0);
800 pte = __lookup_alloc_domain_pte(d, mpaddr);
801 if (pte_none(*pte)) {
802 struct page_info *p = __assign_new_domain_page(d, mpaddr, pte);
803 if (p == NULL) {
804 panic("%s: can't allocate page for dom0", __func__);
805 }
806 }
807 #endif
808 }
810 /* map a physical address to the specified metaphysical addr */
811 // flags: currently only ASSIGN_readonly
812 // This is called by assign_domain_mmio_page().
813 // So access to the pte is racy.
814 void
815 __assign_domain_page(struct domain *d,
816 unsigned long mpaddr, unsigned long physaddr,
817 unsigned long flags)
818 {
819 volatile pte_t *pte;
820 pte_t old_pte;
821 pte_t new_pte;
822 pte_t ret_pte;
823 unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
825 pte = lookup_alloc_domain_pte(d, mpaddr);
827 old_pte = __pte(0);
828 new_pte = pfn_pte(physaddr >> PAGE_SHIFT,
829 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
830 ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
831 if (pte_val(ret_pte) == pte_val(old_pte))
832 smp_mb();
833 }
835 /* get_page() and map a physical address to the specified metaphysical addr */
836 void
837 assign_domain_page(struct domain *d,
838 unsigned long mpaddr, unsigned long physaddr)
839 {
840 struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
841 int ret;
843 BUG_ON((physaddr & GPFN_IO_MASK) != GPFN_MEM);
844 ret = get_page(page, d);
845 BUG_ON(ret == 0);
846 set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
847 // because __assign_domain_page() uses set_pte_rel() which has
848 // release semantics, smp_mb() isn't needed.
849 __assign_domain_page(d, mpaddr, physaddr, ASSIGN_writable);
850 }
852 #ifdef CONFIG_XEN_IA64_DOM0_VP
853 static void
854 assign_domain_same_page(struct domain *d,
855 unsigned long mpaddr, unsigned long size,
856 unsigned long flags)
857 {
858 //XXX optimization
859 unsigned long end = PAGE_ALIGN(mpaddr + size);
860 for (mpaddr &= PAGE_MASK; mpaddr < end; mpaddr += PAGE_SIZE) {
861 __assign_domain_page(d, mpaddr, mpaddr, flags);
862 }
863 }
865 int
866 efi_mmio(unsigned long physaddr, unsigned long size)
867 {
868 void *efi_map_start, *efi_map_end;
869 u64 efi_desc_size;
870 void* p;
872 efi_map_start = __va(ia64_boot_param->efi_memmap);
873 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
874 efi_desc_size = ia64_boot_param->efi_memdesc_size;
876 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
877 efi_memory_desc_t* md = (efi_memory_desc_t *)p;
878 unsigned long start = md->phys_addr;
879 unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
881 if (start <= physaddr && physaddr < end) {
882 if ((physaddr + size) > end) {
883 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
884 __func__, __LINE__, physaddr, size);
885 return 0;
886 }
888 // for io space
889 if (md->type == EFI_MEMORY_MAPPED_IO ||
890 md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
891 return 1;
892 }
894 // for runtime
895 // see efi_enter_virtual_mode(void)
896 // in linux/arch/ia64/kernel/efi.c
897 if ((md->attribute & EFI_MEMORY_RUNTIME) &&
898 !(md->attribute & EFI_MEMORY_WB)) {
899 return 1;
900 }
902 DPRINTK("%s:%d physaddr 0x%lx size = 0x%lx\n",
903 __func__, __LINE__, physaddr, size);
904 return 0;
905 }
907 if (physaddr < start) {
908 break;
909 }
910 }
912 return 1;
913 }
915 unsigned long
916 assign_domain_mmio_page(struct domain *d,
917 unsigned long mpaddr, unsigned long size)
918 {
919 if (size == 0) {
920 DPRINTK("%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
921 __func__, d, mpaddr, size);
922 }
923 if (!efi_mmio(mpaddr, size)) {
924 DPRINTK("%s:%d domain %p mpaddr 0x%lx size = 0x%lx\n",
925 __func__, __LINE__, d, mpaddr, size);
926 return -EINVAL;
927 }
928 assign_domain_same_page(d, mpaddr, size, ASSIGN_writable);
929 return mpaddr;
930 }
932 unsigned long
933 assign_domain_mach_page(struct domain *d,
934 unsigned long mpaddr, unsigned long size,
935 unsigned long flags)
936 {
937 assign_domain_same_page(d, mpaddr, size, flags);
938 return mpaddr;
939 }
941 // The caller must get_page(mfn_to_page(mfn)) before calling.
942 // The caller must call set_gpfn_from_mfn() beforehand if necessary and
943 // must use a memory barrier, because the set_gpfn_from_mfn() result must
944 // be visible before the pte xchg. NOTE: xchg has acquire semantics.
945 // flags: currently only ASSIGN_readonly
946 static void
947 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
948 unsigned long mfn, unsigned long flags)
949 {
950 struct mm_struct *mm = &d->arch.mm;
951 volatile pte_t* pte;
952 pte_t old_pte;
953 pte_t npte;
954 unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
955 pte = lookup_alloc_domain_pte(d, mpaddr);
957 // update pte
958 npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
959 old_pte = ptep_xchg(mm, mpaddr, pte, npte);
960 if (pte_mem(old_pte)) {
961 unsigned long old_mfn = pte_pfn(old_pte);
963 // The mfn == old_mfn case can happen when a domain maps a granted page
964 // twice with the same pseudo physical address.
965 // It's nonsense, but it is allowed.
966 // __gnttab_map_grant_ref()
967 // => create_host_mapping()
968 // => assign_domain_page_replace()
969 if (mfn != old_mfn) {
970 struct page_info* old_page = mfn_to_page(old_mfn);
972 if (page_get_owner(old_page) == d) {
973 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
974 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
975 }
977 domain_page_flush(d, mpaddr, old_mfn, mfn);
979 try_to_clear_PGC_allocate(d, old_page);
980 put_page(old_page);
981 }
982 }
983 }
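// For illustration only (not part of this file): the caller-side contract
// described in the comment above assign_domain_page_replace(), as followed
// by guest_physmap_add_page() further below:
//
//     ret = get_page(mfn_to_page(mfn), d);    // 1. take a reference first
//     BUG_ON(ret == 0);
//     set_gpfn_from_mfn(mfn, gpfn);           // 2. M2P entry before the xchg
//     smp_mb();                               // 3. make it visible
//     assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);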
985 // The caller must get_page(new_page) beforehand.
986 // Only steal_page() calls this function.
987 static int
988 assign_domain_page_cmpxchg_rel(struct domain* d, unsigned long mpaddr,
989 struct page_info* old_page,
990 struct page_info* new_page,
991 unsigned long flags)
992 {
993 struct mm_struct *mm = &d->arch.mm;
994 volatile pte_t* pte;
995 unsigned long old_mfn;
996 unsigned long old_arflags;
997 pte_t old_pte;
998 unsigned long new_mfn;
999 unsigned long new_arflags;
1000 pte_t new_pte;
1001 pte_t ret_pte;
1003 pte = lookup_alloc_domain_pte(d, mpaddr);
1005 again:
1006 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1007 old_mfn = page_to_mfn(old_page);
1008 old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
1009 if (!pte_present(old_pte)) {
1010 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
1011 __func__, pte_val(old_pte), old_arflags, old_mfn);
1012 return -EINVAL;
1015 new_arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
1016 new_mfn = page_to_mfn(new_page);
1017 new_pte = pfn_pte(new_mfn,
1018 __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_arflags));
1020 // update pte
1021 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1022 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1023 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1024 goto again;
1027 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
1028 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1029 __func__,
1030 pte_val(old_pte), old_arflags, old_mfn,
1031 pte_val(ret_pte), pte_pfn(ret_pte));
1032 return -EINVAL;
1035 BUG_ON(!pte_mem(old_pte));
1036 BUG_ON(page_get_owner(old_page) != d);
1037 BUG_ON(get_gpfn_from_mfn(old_mfn) != (mpaddr >> PAGE_SHIFT));
1038 BUG_ON(old_mfn == new_mfn);
1040 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
1042 domain_page_flush(d, mpaddr, old_mfn, new_mfn);
1043 put_page(old_page);
1044 return 0;
1047 static void
1048 zap_domain_page_one(struct domain *d, unsigned long mpaddr, unsigned long mfn)
1050 struct mm_struct *mm = &d->arch.mm;
1051 volatile pte_t *pte;
1052 pte_t old_pte;
1053 struct page_info *page;
1055 pte = lookup_noalloc_domain_pte_none(d, mpaddr);
1056 if (pte == NULL)
1057 return;
1058 if (pte_none(*pte))
1059 return;
1061 if (mfn == INVALID_MFN) {
1062 // clear pte
1063 old_pte = ptep_get_and_clear(mm, mpaddr, pte);
1064 mfn = pte_pfn(old_pte);
1065 } else {
1066 unsigned long old_arflags;
1067 pte_t new_pte;
1068 pte_t ret_pte;
1070 again:
1071 BUG_ON(page_get_owner(mfn_to_page(mfn)) != d);
1072 old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1073 old_pte = pfn_pte(mfn, __pgprot(old_arflags));
1074 new_pte = __pte(0);
1076 // update pte
1077 ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
1078 if (unlikely(pte_val(old_pte) != pte_val(ret_pte))) {
1079 if (pte_pfn(old_pte) == pte_pfn(ret_pte)) {
1080 goto again;
1083 DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx mfn 0x%lx "
1084 "ret_pte 0x%lx ret_mfn 0x%lx\n",
1085 __func__,
1086 pte_val(old_pte), old_arflags, mfn,
1087 pte_val(ret_pte), pte_pfn(ret_pte));
1088 return;
1090 BUG_ON(mfn != pte_pfn(ret_pte));
1093 page = mfn_to_page(mfn);
1094 BUG_ON((page->count_info & PGC_count_mask) == 0);
1096 if (page_get_owner(page) == d) {
1097 BUG_ON(get_gpfn_from_mfn(mfn) != (mpaddr >> PAGE_SHIFT));
1098 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1101 domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
1103 try_to_clear_PGC_allocate(d, page);
1104 put_page(page);
1107 unsigned long
1108 dom0vp_zap_physmap(struct domain *d, unsigned long gpfn,
1109 unsigned int extent_order)
1111 if (extent_order != 0) {
1112 //XXX
1113 return -ENOSYS;
1116 zap_domain_page_one(d, gpfn << PAGE_SHIFT, INVALID_MFN);
1117 return 0;
1120 unsigned long
1121 dom0vp_add_physmap(struct domain* d, unsigned long gpfn, unsigned long mfn,
1122 unsigned long flags, domid_t domid)
1124 int error = 0;
1125 struct domain* rd;
1127 rd = find_domain_by_id(domid);
1128 if (unlikely(rd == NULL)) {
1129 switch (domid) {
1130 case DOMID_XEN:
1131 rd = dom_xen;
1132 break;
1133 case DOMID_IO:
1134 rd = dom_io;
1135 break;
1136 default:
1137 DPRINTK("d 0x%p domid %d "
1138 "pgfn 0x%lx mfn 0x%lx flags 0x%lx domid %d\n",
1139 d, d->domain_id, gpfn, mfn, flags, domid);
1140 return -ESRCH;
1142 BUG_ON(rd == NULL);
1143 get_knownalive_domain(rd);
1146 if (unlikely(rd == d)) {
1147 error = -EINVAL;
1148 goto out1;
1150 BUG_ON(!mfn_valid(mfn));
1151 if (unlikely(get_page(mfn_to_page(mfn), rd) == 0)) {
1152 error = -EINVAL;
1153 goto out1;
1155 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1156 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1157 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, flags);
1158 //don't update p2m table because this page belongs to rd, not d.
1159 out1:
1160 put_domain(rd);
1161 return error;
1164 // grant table host mapping
1165 // mpaddr: host_addr: pseudo physical address
1166 // mfn: frame: machine page frame
1167 // flags: GNTMAP_readonly | GNTMAP_application_map | GNTMAP_contains_pte
1168 int
1169 create_grant_host_mapping(unsigned long gpaddr,
1170 unsigned long mfn, unsigned int flags)
1172 struct domain* d = current->domain;
1173 struct page_info* page;
1174 int ret;
1176 if (flags & (GNTMAP_device_map |
1177 GNTMAP_application_map | GNTMAP_contains_pte)) {
1178 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1179 return GNTST_general_error;
1182 BUG_ON(!mfn_valid(mfn));
1183 page = mfn_to_page(mfn);
1184 ret = get_page(page, page_get_owner(page));
1185 BUG_ON(ret == 0);
1186 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1187 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1188 assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
1189 ASSIGN_readonly: ASSIGN_writable);
1190 return GNTST_okay;
1193 // grant table host unmapping
1194 int
1195 destroy_grant_host_mapping(unsigned long gpaddr,
1196 unsigned long mfn, unsigned int flags)
1198 struct domain* d = current->domain;
1199 volatile pte_t* pte;
1200 unsigned long cur_arflags;
1201 pte_t cur_pte;
1202 pte_t new_pte;
1203 pte_t old_pte;
1204 struct page_info* page;
1206 if (flags & (GNTMAP_application_map | GNTMAP_contains_pte)) {
1207 DPRINTK("%s: flags 0x%x\n", __func__, flags);
1208 return GNTST_general_error;
1211 pte = lookup_noalloc_domain_pte(d, gpaddr);
1212 if (pte == NULL) {
1213 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx\n", __func__, gpaddr, mfn);
1214 return GNTST_general_error;
1217 again:
1218 cur_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
1219 cur_pte = pfn_pte(mfn, __pgprot(cur_arflags));
1220 if (!pte_present(cur_pte)) {
1221 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx\n",
1222 __func__, gpaddr, mfn, pte_val(cur_pte));
1223 return GNTST_general_error;
1225 new_pte = __pte(0);
1227 old_pte = ptep_cmpxchg_rel(&d->arch.mm, gpaddr, pte, cur_pte, new_pte);
1228 if (unlikely(!pte_present(old_pte))) {
1229 DPRINTK("%s: gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1230 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1231 return GNTST_general_error;
1233 if (unlikely(pte_val(cur_pte) != pte_val(old_pte))) {
1234 if (pte_pfn(old_pte) == mfn) {
1235 goto again;
1237 DPRINTK("%s gpaddr 0x%lx mfn 0x%lx cur_pte 0x%lx old_pte 0x%lx\n",
1238 __func__, gpaddr, mfn, pte_val(cur_pte), pte_val(old_pte));
1239 return GNTST_general_error;
1241 BUG_ON(pte_pfn(old_pte) != mfn);
1243 domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
1245 page = mfn_to_page(mfn);
1246 BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
1247 put_page(page);
1249 return GNTST_okay;
1252 // heavily depends on the struct page layout.
1253 int
1254 steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
1256 #if 0 /* if big endian */
1257 # error "implement big endian version of steal_page()"
1258 #endif
1259 u32 _d, _nd;
1260 u64 x, nx, y;
1261 unsigned long gpfn;
1262 struct page_info *new;
1263 unsigned long new_mfn;
1264 int ret;
1265 new = alloc_domheap_page(d);
1266 if (new == NULL) {
1267 DPRINTK("alloc_domheap_page() failed\n");
1268 return -1;
1270 // zero out pages for security reasons
1271 clear_page(page_to_virt(new));
1272 // assign_domain_page_cmpxchg_rel() has release semantics
1273 // so smp_mb() isn't needed.
1275 ret = get_page(new, d);
1276 BUG_ON(ret == 0);
1278 gpfn = get_gpfn_from_mfn(page_to_mfn(page));
1279 if (gpfn == INVALID_M2P_ENTRY) {
1280 free_domheap_page(new);
1281 return -1;
1283 new_mfn = page_to_mfn(new);
1284 set_gpfn_from_mfn(new_mfn, gpfn);
1285 // smp_mb() isn't needed because assign_domain_page_cmpxchg_rel()
1286 // has release semantics.
1288 ret = assign_domain_page_cmpxchg_rel(d, gpfn << PAGE_SHIFT, page, new,
1289 ASSIGN_writable);
1290 if (ret < 0) {
1291 DPRINTK("assign_domain_page_cmpxchg_rel failed %d\n", ret);
1292 set_gpfn_from_mfn(new_mfn, INVALID_M2P_ENTRY);
1293 free_domheap_page(new);
1294 return -1;
1297 spin_lock(&d->page_alloc_lock);
1299 /*
1300 * The tricky bit: atomically release ownership while there is just one
1301 * benign reference to the page (PGC_allocated). If that reference
1302 * disappears then the deallocation routine will safely spin.
1303 */
1304 _d = pickle_domptr(d);
1305 y = *((u64*)&page->count_info);
1306 do {
1307 x = y;
1308 nx = x & 0xffffffff;
1309 // page->count_info: untouched
1310 // page->u.inuse._domain = 0;
1311 _nd = x >> 32;
1313 if (unlikely((x & (PGC_count_mask | PGC_allocated)) !=
1314 (1 | PGC_allocated)) ||
1315 unlikely(_nd != _d)) {
1316 struct domain* nd = unpickle_domptr(_nd);
1317 if (nd == NULL) {
1318 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1319 "sd=%p 0x%x,"
1320 " caf=%016lx, taf=%" PRtype_info "\n",
1321 (void *) page_to_mfn(page),
1322 d, d->domain_id, _d,
1323 nd, _nd,
1324 x,
1325 page->u.inuse.type_info);
1326 } else {
1327 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u) 0x%x, "
1328 "sd=%p(%u) 0x%x,"
1329 " caf=%016lx, taf=%" PRtype_info "\n",
1330 (void *) page_to_mfn(page),
1331 d, d->domain_id, _d,
1332 nd, nd->domain_id, _nd,
1333 x,
1334 page->u.inuse.type_info);
1336 spin_unlock(&d->page_alloc_lock);
1337 return -1;
1340 y = cmpxchg((u64*)&page->count_info, x, nx);
1341 } while (unlikely(y != x));
1343 /*
1344 * Unlink from 'd'. At least one reference remains (now anonymous), so
1345 * no one else is spinning to try to delete this page from 'd'.
1346 */
1347 if ( !(memflags & MEMF_no_refcount) )
1348 d->tot_pages--;
1349 list_del(&page->list);
1351 spin_unlock(&d->page_alloc_lock);
1352 return 0;
1355 void
1356 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
1357 unsigned long mfn)
1359 int ret;
1361 BUG_ON(!mfn_valid(mfn));
1362 ret = get_page(mfn_to_page(mfn), d);
1363 BUG_ON(ret == 0);
1364 BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
1365 get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
1366 set_gpfn_from_mfn(mfn, gpfn);
1367 smp_mb();
1368 assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn, ASSIGN_writable);
1370 //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
1373 void
1374 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
1375 unsigned long mfn)
1377 BUG_ON(mfn == 0);//XXX
1378 zap_domain_page_one(d, gpfn << PAGE_SHIFT, mfn);
1381 //XXX sledgehammer.
1382 // A finer range should be flushed.
1383 void
1384 domain_page_flush(struct domain* d, unsigned long mpaddr,
1385 unsigned long old_mfn, unsigned long new_mfn)
1387 domain_flush_vtlb_all();
1390 int
1391 domain_page_mapped(struct domain* d, unsigned long mpaddr)
1393 volatile pte_t * pte;
1395 pte = lookup_noalloc_domain_pte(d, mpaddr);
1396 if(pte != NULL && !pte_none(*pte))
1397 return 1;
1398 return 0;
1400 #endif
1402 /* Flush cache of domain d. */
1403 void domain_cache_flush (struct domain *d, int sync_only)
1405 struct mm_struct *mm = &d->arch.mm;
1406 pgd_t *pgd = mm->pgd;
1407 unsigned long maddr;
1408 int i,j,k, l;
1409 int nbr_page = 0;
1410 void (*flush_func)(unsigned long start, unsigned long end);
1411 extern void flush_dcache_range (unsigned long, unsigned long);
1413 if (sync_only)
1414 flush_func = &flush_icache_range;
1415 else
1416 flush_func = &flush_dcache_range;
1418 #ifdef CONFIG_DOMAIN0_CONTIGUOUS
1419 if (d == dom0) {
1420 /* This is not fully correct (because of holes), but it should
1421 be enough for now. */
1422 (*flush_func)(__va_ul (dom0_start),
1423 __va_ul (dom0_start + dom0_size));
1424 return;
1426 #endif
1427 for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
1428 pud_t *pud;
1429 if (!pgd_present(*pgd))
1430 continue;
1431 pud = pud_offset(pgd, 0);
1432 for (j = 0; j < PTRS_PER_PUD; pud++, j++) {
1433 pmd_t *pmd;
1434 if (!pud_present(*pud))
1435 continue;
1436 pmd = pmd_offset(pud, 0);
1437 for (k = 0; k < PTRS_PER_PMD; pmd++, k++) {
1438 pte_t *pte;
1439 if (!pmd_present(*pmd))
1440 continue;
1441 pte = pte_offset_map(pmd, 0);
1442 for (l = 0; l < PTRS_PER_PTE; pte++, l++) {
1443 if (!pte_present(*pte))
1444 continue;
1445 /* Convert PTE to maddr. */
1446 maddr = __va_ul (pte_val(*pte)
1447 & _PAGE_PPN_MASK);
1448 (*flush_func)(maddr, maddr+ PAGE_SIZE);
1449 nbr_page++;
1454 //printf ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
1457 #ifdef VERBOSE
1458 #define MEM_LOG(_f, _a...) \
1459 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
1460 current->domain->domain_id , __LINE__ , ## _a )
1461 #else
1462 #define MEM_LOG(_f, _a...) ((void)0)
1463 #endif
1465 static void free_page_type(struct page_info *page, u32 type)
1469 static int alloc_page_type(struct page_info *page, u32 type)
1471 return 1;
1474 unsigned long __get_free_pages(unsigned int mask, unsigned int order)
1476 void *p = alloc_xenheap_pages(order);
1478 memset(p,0,PAGE_SIZE<<order);
1479 return (unsigned long)p;
1482 void __free_pages(struct page_info *page, unsigned int order)
1484 if (order) BUG();
1485 free_xenheap_page(page);
1488 void *pgtable_quicklist_alloc(void)
1490 void *p;
1491 p = alloc_xenheap_pages(0);
1492 if (p)
1493 clear_page(p);
1494 return p;
1497 void pgtable_quicklist_free(void *pgtable_entry)
1499 free_xenheap_page(pgtable_entry);
1502 void cleanup_writable_pagetable(struct domain *d)
1504 return;
1507 void put_page_type(struct page_info *page)
1509 u32 nx, x, y = page->u.inuse.type_info;
1511 again:
1512 do {
1513 x = y;
1514 nx = x - 1;
1516 ASSERT((x & PGT_count_mask) != 0);
1518 /*
1519 * The page should always be validated while a reference is held. The
1520 * exception is during domain destruction, when we forcibly invalidate
1521 * page-table pages if we detect a referential loop.
1522 * See domain.c:relinquish_list().
1523 */
1524 ASSERT((x & PGT_validated) ||
1525 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1527 if ( unlikely((nx & PGT_count_mask) == 0) )
1529 /* Record TLB information for flush later. Races are harmless. */
1530 page->tlbflush_timestamp = tlbflush_current_time();
1532 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1533 likely(nx & PGT_validated) )
1535 /*
1536 * Page-table pages must be unvalidated when count is zero. The
1537 * 'free' is safe because the refcnt is non-zero and validated
1538 * bit is clear => other ops will spin or fail.
1539 */
1540 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1541 x & ~PGT_validated)) != x) )
1542 goto again;
1543 /* We cleared the 'valid bit' so we do the clean up. */
1544 free_page_type(page, x);
1545 /* Carry on, but with the 'valid bit' now clear. */
1546 x &= ~PGT_validated;
1547 nx &= ~PGT_validated;
1550 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1551 (PGT_pinned | 1)) &&
1552 ((nx & PGT_type_mask) != PGT_writable_page)) )
1554 /* Page is now only pinned. Make the back pointer mutable again. */
1555 nx |= PGT_va_mutable;
1558 while ( unlikely((y = cmpxchg_rel(&page->u.inuse.type_info, x, nx)) != x) );
1562 int get_page_type(struct page_info *page, u32 type)
1564 u32 nx, x, y = page->u.inuse.type_info;
1566 again:
1567 do {
1568 x = y;
1569 nx = x + 1;
1570 if ( unlikely((nx & PGT_count_mask) == 0) )
1572 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1573 return 0;
1575 else if ( unlikely((x & PGT_count_mask) == 0) )
1577 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1579 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1581 /*
1582 * On type change we check to flush stale TLB
1583 * entries. This may be unnecessary (e.g., page
1584 * was GDT/LDT) but those circumstances should be
1585 * very rare.
1586 */
1587 cpumask_t mask =
1588 page_get_owner(page)->domain_dirty_cpumask;
1589 tlbflush_filter(mask, page->tlbflush_timestamp);
1591 if ( unlikely(!cpus_empty(mask)) )
1593 perfc_incrc(need_flush_tlb_flush);
1594 flush_tlb_mask(mask);
1598 /* We lose existing type, back pointer, and validity. */
1599 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1600 nx |= type;
1602 /* No special validation needed for writable pages. */
1603 /* Page tables and GDT/LDT need to be scanned for validity. */
1604 if ( type == PGT_writable_page )
1605 nx |= PGT_validated;
1608 else
1610 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1612 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1614 if ( current->domain == page_get_owner(page) )
1616 /*
1617 * This ensures functions like set_gdt() see up-to-date
1618 * type info without needing to clean up writable p.t.
1619 * state on the fast path.
1620 */
1621 LOCK_BIGLOCK(current->domain);
1622 cleanup_writable_pagetable(current->domain);
1623 y = page->u.inuse.type_info;
1624 UNLOCK_BIGLOCK(current->domain);
1625 /* Can we make progress now? */
1626 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1627 ((y & PGT_count_mask) == 0) )
1628 goto again;
1630 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1631 ((type & PGT_type_mask) != PGT_l1_page_table) )
1632 MEM_LOG("Bad type (saw %08x != exp %08x) "
1633 "for mfn %016lx (pfn %016lx)",
1634 x, type, page_to_mfn(page),
1635 get_gpfn_from_mfn(page_to_mfn(page)));
1636 return 0;
1638 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1640 /* The va backpointer is mutable, hence we update it. */
1641 nx &= ~PGT_va_mask;
1642 nx |= type; /* we know the actual type is correct */
1644 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1645 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1647 #ifdef CONFIG_X86_PAE
1648 /* We use backptr as extra typing. Cannot be unknown. */
1649 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1650 return 0;
1651 #endif
1652 /* This table is possibly mapped at multiple locations. */
1653 nx &= ~PGT_va_mask;
1654 nx |= PGT_va_unknown;
1657 if ( unlikely(!(x & PGT_validated)) )
1659 /* Someone else is updating validation of this page. Wait... */
1660 while ( (y = page->u.inuse.type_info) == x )
1661 cpu_relax();
1662 goto again;
1666 while ( unlikely((y = cmpxchg_acq(&page->u.inuse.type_info, x, nx)) != x) );
1668 if ( unlikely(!(nx & PGT_validated)) )
1670 /* Try to validate page type; drop the new reference on failure. */
1671 if ( unlikely(!alloc_page_type(page, type)) )
1673 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
1674 ": caf=%08x taf=%" PRtype_info,
1675 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1676 type, page->count_info, page->u.inuse.type_info);
1677 /* No one else can get a reference. We hold the only ref. */
1678 page->u.inuse.type_info = 0;
1679 return 0;
1682 /* No one else is updating simultaneously. */
1683 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1686 return 1;
1689 /*
1690 * Local variables:
1691 * mode: C
1692 * c-set-style: "BSD"
1693 * c-basic-offset: 4
1694 * tab-width: 4
1695 * indent-tabs-mode: nil
1696 * End:
1697 */