ia64/xen-unstable: xen/arch/ia64/xen/tlb_track.c @ 16785:af3550f53874

[IA64] domheap: Don't pin xenheap down. Now it's unnecessary.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

author   Alex Williamson <alex.williamson@hp.com>
date     Thu Jan 17 12:05:43 2008 -0700 (2008-01-17)
parents  ea0b50ca4999
children 57febe0264e1

/******************************************************************************
 * tlb_track.c
 *
 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                    VA Linux Systems Japan K.K.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

#include <asm/tlb_track.h>
#include <asm/p2m_entry.h>
#include <asm/vmx_mm_def.h> /* for IA64_RR_SHIFT */
#include <asm/vmx_vcpu.h>   /* for VRN7 */
#include <asm/vcpu.h>       /* for PSCB() */

#define CONFIG_TLB_TRACK_DEBUG
#ifdef CONFIG_TLB_TRACK_DEBUG
# define tlb_track_printd(fmt, ...)                     \
    dprintk(XENLOG_DEBUG, fmt, ##__VA_ARGS__)
#else
# define tlb_track_printd(fmt, ...) do { } while (0)
#endif

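/*
 * Grow the pool of tracking entries by one domheap page: the new entries
 * are linked onto tlb_track->free_list and num_entries/num_free are updated.
 * Fails with -ENOMEM once tlb_track->limit is reached or when no domheap
 * page is available.
 */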
static int
tlb_track_allocate_entries(struct tlb_track* tlb_track)
{
    struct page_info* entry_page;
    struct tlb_track_entry* track_entries;
    unsigned int allocated;
    unsigned long i;

    BUG_ON(tlb_track->num_free > 0);
    if (tlb_track->num_entries >= tlb_track->limit) {
        dprintk(XENLOG_WARNING, "%s: num_entries %d limit %d\n",
                __func__, tlb_track->num_entries, tlb_track->limit);
        return -ENOMEM;
    }
    entry_page = alloc_domheap_page(NULL);
    if (entry_page == NULL) {
        dprintk(XENLOG_WARNING,
                "%s: domheap page failed. num_entries %d limit %d\n",
                __func__, tlb_track->num_entries, tlb_track->limit);
        return -ENOMEM;
    }

    list_add(&entry_page->list, &tlb_track->page_list);
    track_entries = (struct tlb_track_entry*)page_to_virt(entry_page);
    allocated = PAGE_SIZE / sizeof(track_entries[0]);
    tlb_track->num_entries += allocated;
    tlb_track->num_free += allocated;
    for (i = 0; i < allocated; i++) {
        list_add(&track_entries[i].list, &tlb_track->free_list);
        // tlb_track_printd("track_entries[%ld] 0x%p\n", i, &track_entries[i]);
    }
    tlb_track_printd("allocated %d num_entries %d num_free %d\n",
                     allocated, tlb_track->num_entries, tlb_track->num_free);
    return 0;
}

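/*
 * Allocate and initialise the per-domain TLB tracking state: the entry
 * free list, an initial page of entries, and a one-page hash table keyed
 * by p2m entry pointer. The structure is published in d->arch.tlb_track
 * only after it is fully initialised.
 */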
int
tlb_track_create(struct domain* d)
{
    struct tlb_track* tlb_track = NULL;
    struct page_info* hash_page = NULL;
    unsigned int hash_size;
    unsigned int hash_shift;
    unsigned int i;

    tlb_track = xmalloc(struct tlb_track);
    if (tlb_track == NULL)
        goto out;

    hash_page = alloc_domheap_page(NULL);
    if (hash_page == NULL)
        goto out;

    spin_lock_init(&tlb_track->free_list_lock);
    INIT_LIST_HEAD(&tlb_track->free_list);
    tlb_track->limit = TLB_TRACK_LIMIT_ENTRIES;
    tlb_track->num_entries = 0;
    tlb_track->num_free = 0;
    INIT_LIST_HEAD(&tlb_track->page_list);
    if (tlb_track_allocate_entries(tlb_track) < 0)
        goto out;

    spin_lock_init(&tlb_track->hash_lock);
    /* XXX hash size optimization */
    hash_size = PAGE_SIZE / sizeof(tlb_track->hash[0]);
    for (hash_shift = 0; (1 << (hash_shift + 1)) < hash_size; hash_shift++)
        /* nothing */;
    tlb_track->hash_size = (1 << hash_shift);
    tlb_track->hash_shift = hash_shift;
    tlb_track->hash_mask = (1 << hash_shift) - 1;
    tlb_track->hash = page_to_virt(hash_page);
    for (i = 0; i < tlb_track->hash_size; i++)
        INIT_LIST_HEAD(&tlb_track->hash[i]);

    smp_mb(); /* make initialization visible before use. */
    d->arch.tlb_track = tlb_track;
    dprintk(XENLOG_DEBUG, "hash 0x%p hash_size %d\n",
            tlb_track->hash, tlb_track->hash_size);

    return 0;

 out:
    if (hash_page != NULL)
        free_domheap_page(hash_page);

    if (tlb_track != NULL)
        xfree(tlb_track);

    return -ENOMEM;
}

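/*
 * Tear down the tracking state of a dying domain: every entry page and
 * the hash page are returned to the domheap and the tlb_track structure
 * is freed. All entries must already have been returned to the free list
 * (num_free == num_entries).
 */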
void
tlb_track_destroy(struct domain* d)
{
    struct tlb_track* tlb_track = d->arch.tlb_track;
    struct page_info* page;
    struct page_info* next;

    spin_lock(&tlb_track->free_list_lock);
    BUG_ON(tlb_track->num_free != tlb_track->num_entries);

    list_for_each_entry_safe(page, next, &tlb_track->page_list, list) {
        list_del(&page->list);
        free_domheap_page(page);
    }

    free_domheap_page(virt_to_page(tlb_track->hash));
    xfree(tlb_track);
    // d->tlb_track = NULL;
}

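/*
 * Take an entry from the free list, growing the pool on demand.
 * Returns NULL if the pool limit has been reached and no entry is
 * available.
 */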
static struct tlb_track_entry*
tlb_track_get_entry(struct tlb_track* tlb_track)
{
    struct tlb_track_entry* entry = NULL;
    spin_lock(&tlb_track->free_list_lock);
    if (tlb_track->num_free == 0)
        (void)tlb_track_allocate_entries(tlb_track);

    if (tlb_track->num_free > 0) {
        BUG_ON(list_empty(&tlb_track->free_list));
        entry = list_entry(tlb_track->free_list.next,
                           struct tlb_track_entry, list);
        tlb_track->num_free--;
        list_del(&entry->list);
    }
    spin_unlock(&tlb_track->free_list_lock);
    return entry;
}

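/* Return an entry to the free list. */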
void
tlb_track_free_entry(struct tlb_track* tlb_track,
                     struct tlb_track_entry* entry)
{
    spin_lock(&tlb_track->free_list_lock);
    list_add(&entry->list, &tlb_track->free_list);
    tlb_track->num_free++;
    spin_unlock(&tlb_track->free_list_lock);
}

#include <linux/hash.h>
/* XXX hash function. */
static struct list_head*
tlb_track_hash_head(struct tlb_track* tlb_track, volatile pte_t* ptep)
{
    unsigned long hash = hash_long((unsigned long)ptep, tlb_track->hash_shift);
    BUG_ON(hash >= tlb_track->hash_size);
    BUG_ON((hash & tlb_track->hash_mask) != hash);
    return &tlb_track->hash[hash];
}

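/*
 * Return 1 if another thread zapped the p2m entry between the read of
 * old_pte and the cmpxchg: either the pfn changed or some bit outside
 * _PFN_MASK and _PAGE_TLB_TRACK_MASK differs.
 */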
static int
tlb_track_pte_zapped(pte_t old_pte, pte_t ret_pte)
{
    if (pte_pfn(old_pte) != pte_pfn(ret_pte) ||
        (pte_val(old_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK)) !=
        (pte_val(ret_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK))) {
        /* Other thread zapped the p2m entry. */
        return 1;
    }
    return 0;
}

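/*
 * Record that the current vcpu inserted a translation for (vaddr, rid)
 * through the p2m entry at ptep. The _PAGE_TLB_INSERTED bit (or
 * _PAGE_TLB_INSERTED_MANY when per-entry tracking is given up) is set on
 * the pte with cmpxchg, and the inserting pCPU/vCPU are added to the
 * entry's dirty masks. Returns TLB_TRACK_FOUND when an existing entry
 * matched, TLB_TRACK_NOT_FOUND when no entry matched (a new one is
 * installed if available), TLB_TRACK_MANY when the page has fallen into
 * full-flush mode, and TLB_TRACK_AGAIN when a racing update requires the
 * caller to retry.
 */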
static TLB_TRACK_RET_T
tlb_track_insert_or_dirty(struct tlb_track* tlb_track, struct mm_struct* mm,
                          volatile pte_t* ptep, pte_t old_pte,
                          unsigned long vaddr, unsigned long rid)
{
    unsigned long mfn = pte_pfn(old_pte);
    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
    struct tlb_track_entry* entry;
    struct tlb_track_entry* new_entry = NULL;
    unsigned long bit_to_be_set = _PAGE_TLB_INSERTED;
    pte_t new_pte;
    pte_t ret_pte;

    struct vcpu* v = current;
    TLB_TRACK_RET_T ret = TLB_TRACK_NOT_FOUND;

#if 0 /* this is done at vcpu_tlb_track_insert_or_dirty() */
    perfc_incr(tlb_track_iod);
    if (!pte_tlb_tracking(old_pte)) {
        perfc_incr(tlb_track_iod_not_tracked);
        return TLB_TRACK_NOT_TRACKED;
    }
#endif
    if (pte_tlb_inserted_many(old_pte)) {
        perfc_incr(tlb_track_iod_tracked_many);
        return TLB_TRACK_MANY;
    }

    /* vaddr must be normalized so that it is in vrn7 and page aligned. */
    BUG_ON((vaddr >> IA64_RR_SHIFT) != VRN7);
    BUG_ON((vaddr & ~PAGE_MASK) != 0);
#if 0
    tlb_track_printd("\n"
                     "\tmfn 0x%016lx\n"
                     "\told_pte 0x%016lx ptep 0x%p\n"
                     "\tptep_val 0x%016lx vaddr 0x%016lx rid %ld\n"
                     "\ttlb_track 0x%p head 0x%p\n",
                     mfn,
                     pte_val(old_pte), ptep, pte_val(*ptep),
                     vaddr, rid,
                     tlb_track, head);
#endif

 again:
    /*
     * The zapping side may zap the p2m entry and then remove the tlb track
     * entry non-atomically, so we may see a stale tlb track entry here.
     * p2m_entry_retry() handles such a case.
     * Alternatively, another thread may zap the p2m entry, remove the tlb
     * track entry and insert a new tlb track entry.
     */
    spin_lock(&tlb_track->hash_lock);
    list_for_each_entry(entry, head, list) {
        if (entry->ptep != ptep)
            continue;

        if (pte_pfn(entry->pte_val) == mfn) {
            // tlb_track_entry_printf(entry);
            if (entry->vaddr == vaddr && entry->rid == rid) {
                // tlb_track_printd("TLB_TRACK_FOUND\n");
                ret = TLB_TRACK_FOUND;
                perfc_incr(tlb_track_iod_found);
#ifdef CONFIG_TLB_TRACK_CNT
                entry->cnt++;
                if (entry->cnt > TLB_TRACK_CNT_FORCE_MANY) {
                    /*
                     * Heuristics:
                     * If a page is used to transfer data by a device
                     * channel, it is usually unmapped after only a few
                     * accesses (one or two tlb inserts) once the real
                     * device I/O completes, i.e. within a short period.
                     * This page, however, has been accessed many times,
                     * so we guess it is used as an I/O ring and tracking
                     * this entry is probably useless.
                     */
                    // tlb_track_entry_printf(entry);
                    // tlb_track_printd("cnt = %ld\n", entry->cnt);
                    perfc_incr(tlb_track_iod_force_many);
                    goto force_many;
                }
#endif
                goto found;
            } else {
#ifdef CONFIG_TLB_TRACK_CNT
            force_many:
#endif
                if (!pte_tlb_inserted(old_pte)) {
                    printk("%s:%d racy update\n", __func__, __LINE__);
                    old_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED);
                }
                new_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED_MANY);
                ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
                if (pte_val(ret_pte) != pte_val(old_pte)) {
                    // tlb_track_printd("TLB_TRACK_AGAIN\n");
                    ret = TLB_TRACK_AGAIN;
                    perfc_incr(tlb_track_iod_again);
                } else {
                    // tlb_track_printd("TLB_TRACK_MANY del entry 0x%p\n",
                    //                  entry);
                    ret = TLB_TRACK_MANY;
                    list_del(&entry->list);
                    // tlb_track_entry_printf(entry);
                    perfc_incr(tlb_track_iod_tracked_many_del);
                }
                goto out;
            }
        }

        /*
         * Another thread changed the p2m entry, removed the old tlb track
         * entry and inserted a new one after we read old_pte but before we
         * took the spinlock.
         */
        // tlb_track_printd("TLB_TRACK_AGAIN\n");
        ret = TLB_TRACK_AGAIN;
        perfc_incr(tlb_track_iod_again);
        goto out;
    }

    entry = NULL; // prevent freeing entry.
    if (pte_tlb_inserted(old_pte)) {
        /* Another thread removed the tlb_track_entry after we read old_pte
           but before we took the spin lock. */
        ret = TLB_TRACK_AGAIN;
        perfc_incr(tlb_track_iod_again);
        goto out;
    }
    if (new_entry == NULL && bit_to_be_set == _PAGE_TLB_INSERTED) {
        spin_unlock(&tlb_track->hash_lock);
        new_entry = tlb_track_get_entry(tlb_track);
        if (new_entry == NULL) {
            tlb_track_printd("get_entry failed\n");
            /* An entry can't be allocated;
               fall back to full-flush mode. */
            bit_to_be_set |= _PAGE_TLB_INSERTED_MANY;
            perfc_incr(tlb_track_iod_new_failed);
        }
        // tlb_track_printd("new_entry 0x%p\n", new_entry);
        perfc_incr(tlb_track_iod_new_entry);
        goto again;
    }

    BUG_ON(pte_tlb_inserted_many(old_pte));
    new_pte = __pte(pte_val(old_pte) | bit_to_be_set);
    ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
    if (pte_val(old_pte) != pte_val(ret_pte)) {
        if (tlb_track_pte_zapped(old_pte, ret_pte)) {
            // tlb_track_printd("zapped TLB_TRACK_AGAIN\n");
            ret = TLB_TRACK_AGAIN;
            perfc_incr(tlb_track_iod_again);
            goto out;
        }

        /* Another thread set _PAGE_TLB_INSERTED and/or _PAGE_TLB_INSERTED_MANY */
        if (pte_tlb_inserted_many(ret_pte)) {
            /* Another thread already set _PAGE_TLB_INSERTED_MANY and
               removed the entry. */
            // tlb_track_printd("inserted TLB_TRACK_MANY\n");
            BUG_ON(!pte_tlb_inserted(ret_pte));
            ret = TLB_TRACK_MANY;
            perfc_incr(tlb_track_iod_new_many);
            goto out;
        }
        BUG_ON(pte_tlb_inserted(ret_pte));
        BUG();
    }
    if (new_entry) {
        // tlb_track_printd("inserting new_entry 0x%p\n", new_entry);
        entry = new_entry;
        new_entry = NULL;

        entry->ptep = ptep;
        entry->pte_val = old_pte;
        entry->vaddr = vaddr;
        entry->rid = rid;
        cpus_clear(entry->pcpu_dirty_mask);
        vcpus_clear(entry->vcpu_dirty_mask);
        list_add(&entry->list, head);

#ifdef CONFIG_TLB_TRACK_CNT
        entry->cnt = 0;
#endif
        perfc_incr(tlb_track_iod_insert);
        // tlb_track_entry_printf(entry);
    } else {
        goto out;
    }

 found:
    BUG_ON(v->processor >= NR_CPUS);
    cpu_set(v->processor, entry->pcpu_dirty_mask);
    BUG_ON(v->vcpu_id >= MAX_VIRT_CPUS);
    vcpu_set(v->vcpu_id, entry->vcpu_dirty_mask);
    perfc_incr(tlb_track_iod_dirtied);

 out:
    spin_unlock(&tlb_track->hash_lock);
    if (ret == TLB_TRACK_MANY && entry != NULL)
        tlb_track_free_entry(tlb_track, entry);
    if (new_entry != NULL)
        tlb_track_free_entry(tlb_track, new_entry);
    return ret;
}

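/*
 * Per-vcpu wrapper: look up the region id for the faulting address,
 * normalize the address into vrn7, and ask the p2m layer to retry the
 * lookup when the tracking attempt raced with a concurrent update.
 */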
void
__vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
                                 struct p2m_entry* entry)
{
    unsigned long vrn = vaddr >> IA64_RR_SHIFT;
    unsigned long rid = PSCB(vcpu, rrs[vrn]);
    TLB_TRACK_RET_T ret;

    /* Normalize into vrn7.
       For Linux dom0, vrn7 is the most common case. */
    vaddr |= VRN7 << VRN_SHIFT;
    vaddr &= PAGE_MASK;
    ret = tlb_track_insert_or_dirty(vcpu->domain->arch.tlb_track,
                                    &vcpu->domain->arch.mm,
                                    entry->ptep, entry->used,
                                    vaddr, rid);
    if (ret == TLB_TRACK_AGAIN)
        p2m_entry_set_retry(entry);
}

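/*
 * Called from the p2m zapping side: given the pte value that was just
 * zapped, look up and unlink the tracking entry for (ptep, mfn) so the
 * caller can flush exactly the recorded insertion. Returns
 * TLB_TRACK_FOUND with *entryp set when a single tracked insertion was
 * found, and TLB_TRACK_NOT_TRACKED / TLB_TRACK_NOT_FOUND / TLB_TRACK_MANY
 * otherwise.
 */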
TLB_TRACK_RET_T
tlb_track_search_and_remove(struct tlb_track* tlb_track,
                            volatile pte_t* ptep, pte_t old_pte,
                            struct tlb_track_entry** entryp)
{
    unsigned long mfn = pte_pfn(old_pte);
    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
    struct tlb_track_entry* entry;

    perfc_incr(tlb_track_sar);
    if (!pte_tlb_tracking(old_pte)) {
        perfc_incr(tlb_track_sar_not_tracked);
        return TLB_TRACK_NOT_TRACKED;
    }
    if (!pte_tlb_inserted(old_pte)) {
        BUG_ON(pte_tlb_inserted_many(old_pte));
        perfc_incr(tlb_track_sar_not_found);
        return TLB_TRACK_NOT_FOUND;
    }
    if (pte_tlb_inserted_many(old_pte)) {
        BUG_ON(!pte_tlb_inserted(old_pte));
        perfc_incr(tlb_track_sar_many);
        return TLB_TRACK_MANY;
    }

    spin_lock(&tlb_track->hash_lock);
    list_for_each_entry(entry, head, list) {
        if (entry->ptep != ptep)
            continue;

        if (pte_pfn(entry->pte_val) == mfn) {
            /*
             * PARANOIA
             * We're here after zapping the p2m entry. However, another
             * pCPU may in theory update the same p2m entry with the same
             * mfn at the same time. In such a case we can't determine
             * whether this entry is for us or for the racy p2m update.
             * Such racy behaviour by a guest domain doesn't make sense,
             * but it is allowed. Take the very pessimistic path: leave
             * this entry to be found later and do a full flush this time.
             *
             * NOTE: Updating the tlb tracking hash is protected by the
             * spin lock, and setting the _PAGE_TLB_INSERTED and
             * _PAGE_TLB_INSERTED_MANY bits is serialized by the same
             * spin lock.
             * See tlb_track_insert_or_dirty().
             */
            pte_t current_pte = *ptep;
            if (unlikely(pte_pfn(current_pte) == mfn &&
                         pte_tlb_tracking(current_pte) &&
                         pte_tlb_inserted(current_pte))) {
                BUG_ON(pte_tlb_inserted_many(current_pte));
                spin_unlock(&tlb_track->hash_lock);
                perfc_incr(tlb_track_sar_many);
                return TLB_TRACK_MANY;
            }

            list_del(&entry->list);
            spin_unlock(&tlb_track->hash_lock);
            *entryp = entry;
            perfc_incr(tlb_track_sar_found);
            // tlb_track_entry_printf(entry);
#ifdef CONFIG_TLB_TRACK_CNT
            // tlb_track_printd("cnt = %ld\n", entry->cnt);
#endif
            return TLB_TRACK_FOUND;
        }
        BUG();
    }
    BUG();
    spin_unlock(&tlb_track->hash_lock);
    return TLB_TRACK_NOT_TRACKED;
}

/* for debug */
void
__tlb_track_entry_printf(const char* func, int line,
                         const struct tlb_track_entry* entry)
{
    char pcpumask_buf[NR_CPUS + 1];
    char vcpumask_buf[MAX_VIRT_CPUS + 1];
    cpumask_scnprintf(pcpumask_buf, sizeof(pcpumask_buf),
                      entry->pcpu_dirty_mask);
    vcpumask_scnprintf(vcpumask_buf, sizeof(vcpumask_buf),
                       entry->vcpu_dirty_mask);
    printk("%s:%d\n"
           "\tmfn 0x%016lx\n"
           "\told_pte 0x%016lx ptep 0x%p\n"
           "\tpte_val 0x%016lx vaddr 0x%016lx rid %ld\n"
           "\tpcpu_dirty_mask %s vcpu_dirty_mask %s\n"
           "\tentry 0x%p\n",
           func, line,
           pte_pfn(entry->pte_val),
           pte_val(entry->pte_val), entry->ptep, pte_val(*entry->ptep),
           entry->vaddr, entry->rid,
           pcpumask_buf, vcpumask_buf,
           entry);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */