ia64/xen-unstable

xen/drivers/passthrough/vtd/iommu.c @ 19418:abb87a8387ac

vtd: only enable Interrupt Remapping if Queued Invalidation is also enabled.

If Queued Invalidation is not supported or not enabled, we should not
enable Interrupt Remapping even if the hardware supports it, because
Interrupt Remapping relies on Queued Invalidation to invalidate the
Interrupt Remapping cache.
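
The gate added by this changeset sits in intel_vtd_setup() (around lines
1786-1790 of the listing below): after the per-engine capability checks may
have cleared iommu_qinval, Interrupt Remapping is forced off as well,
essentially:

    if ( !iommu_qinval && iommu_intremap )
    {
        iommu_intremap = 0;
        gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
            "since Queued Invalidation isn't supported or enabled.\n");
    }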

Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Mar 20 09:10:55 2009 +0000 (2009-03-20)
parents f02a528d2e56
children 115c164721dc
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
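/*
 * Lazily assign a VT-d domain id: if the domain has none yet (id 0 is
 * reserved at setup time), claim the first free bit in domid_bitmap under
 * domid_bitmap_lock and record it in the hvm_iommu state, then program the
 * id into the high dword of the context entry above bit DID_HIGH_OFFSET.
 */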
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
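/*
 * Cache flushing for VT-d structures: when any IOMMU lacks coherent
 * page-walk (ecap_coherent() clear; iommus_incoherent is set in
 * iommu_alloc()), CPU writes to root/context/page-table entries must be
 * pushed out with cacheline flushes before the hardware can observe them.
 */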
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
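/*
 * Walk (and, if 'alloc' is set, build) the domain's DMA page table from the
 * top level down to level 2, returning the machine address of the page that
 * holds the leaf 4K PTEs for 'addr', or 0 on failure.  The caller must hold
 * hd->mapping_lock.
 */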
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * higher-level table entries always have read/write set; only the
200 * last-level PTEs control the actual read/write permission
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
233 s_time_t start_time;
235 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
236 return;
237 val = iommu->gcmd | DMA_GCMD_WBF;
239 spin_lock_irqsave(&iommu->register_lock, flag);
240 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
242 /* Make sure hardware complete it */
243 start_time = NOW();
244 for ( ; ; )
245 {
246 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
247 if ( !(val & DMA_GSTS_WBFS) )
248 break;
249 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
250 panic("%s: DMAR hardware is malfunctional,"
251 " please disable IOMMU\n", __func__);
252 cpu_relax();
253 }
254 spin_unlock_irqrestore(&iommu->register_lock, flag);
255 }
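/*
 * Register-based context-cache invalidation: encode the granularity
 * (global / domain / device) into DMAR_CCMD_REG, set the ICC bit, and poll
 * until hardware clears ICC again.
 */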
257 /* return value determines whether the caller needs a write-buffer flush */
258 static int flush_context_reg(
259 void *_iommu,
260 u16 did, u16 source_id, u8 function_mask, u64 type,
261 int non_present_entry_flush)
262 {
263 struct iommu *iommu = (struct iommu *) _iommu;
264 u64 val = 0;
265 unsigned long flag;
266 s_time_t start_time;
268 /*
269 * In the non-present entry flush case: if the hardware does not cache
270 * non-present entries we do nothing; if it does (caching mode), we flush
271 * the entries of domain 0, since that domain id is used to tag any
272 * cached non-present entries.
273 */
274 if ( non_present_entry_flush )
275 {
276 if ( !cap_caching_mode(iommu->cap) )
277 return 1;
278 else
279 did = 0;
280 }
282 /* use register invalidation */
283 switch ( type )
284 {
285 case DMA_CCMD_GLOBAL_INVL:
286 val = DMA_CCMD_GLOBAL_INVL;
287 break;
288 case DMA_CCMD_DOMAIN_INVL:
289 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
290 break;
291 case DMA_CCMD_DEVICE_INVL:
292 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
293 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
294 break;
295 default:
296 BUG();
297 }
298 val |= DMA_CCMD_ICC;
300 spin_lock_irqsave(&iommu->register_lock, flag);
301 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
303 /* Make sure hardware complete it */
304 start_time = NOW();
305 for ( ; ; )
306 {
307 val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
308 if ( !(val & DMA_CCMD_ICC) )
309 break;
310 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
311 panic("%s: DMAR hardware is malfunctional,"
312 " please disable IOMMU\n", __func__);
313 cpu_relax();
314 }
315 spin_unlock_irqrestore(&iommu->register_lock, flag);
316 /* flush context entry will implicitly flush write buffer */
317 return 0;
318 }
320 static int inline iommu_flush_context_global(
321 struct iommu *iommu, int non_present_entry_flush)
322 {
323 struct iommu_flush *flush = iommu_get_flush(iommu);
324 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
325 non_present_entry_flush);
326 }
328 static int inline iommu_flush_context_domain(
329 struct iommu *iommu, u16 did, int non_present_entry_flush)
330 {
331 struct iommu_flush *flush = iommu_get_flush(iommu);
332 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
333 non_present_entry_flush);
334 }
336 static int inline iommu_flush_context_device(
337 struct iommu *iommu, u16 did, u16 source_id,
338 u8 function_mask, int non_present_entry_flush)
339 {
340 struct iommu_flush *flush = iommu_get_flush(iommu);
341 return flush->context(iommu, did, source_id, function_mask,
342 DMA_CCMD_DEVICE_INVL,
343 non_present_entry_flush);
344 }
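/*
 * Register-based IOTLB invalidation, using the IOTLB register pair at
 * ecap_iotlb_offset(): optionally write the address/mask to the IVA
 * register, then set IVT with the requested granularity in the IOTLB
 * register and poll until the hardware clears IVT.
 */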
346 /* return value determines whether the caller needs a write-buffer flush */
347 static int flush_iotlb_reg(void *_iommu, u16 did,
348 u64 addr, unsigned int size_order, u64 type,
349 int non_present_entry_flush)
350 {
351 struct iommu *iommu = (struct iommu *) _iommu;
352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
353 u64 val = 0, val_iva = 0;
354 unsigned long flag;
355 s_time_t start_time;
357 /*
358 * In the non-present entry flush case: if the hardware does not cache
359 * non-present entries we do nothing; if it does (caching mode), we flush
360 * the entries of domain 0, since that domain id is used to tag any
361 * cached non-present entries.
362 */
363 if ( non_present_entry_flush )
364 {
365 if ( !cap_caching_mode(iommu->cap) )
366 return 1;
367 else
368 did = 0;
369 }
371 /* use register invalidation */
372 switch ( type )
373 {
374 case DMA_TLB_GLOBAL_FLUSH:
375 /* a global flush does not need to set IVA_REG */
376 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
377 break;
378 case DMA_TLB_DSI_FLUSH:
379 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
380 break;
381 case DMA_TLB_PSI_FLUSH:
382 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
383 /* Note: always flush non-leaf currently */
384 val_iva = size_order | addr;
385 break;
386 default:
387 BUG();
388 }
389 /* Note: set drain read/write */
390 if ( cap_read_drain(iommu->cap) )
391 val |= DMA_TLB_READ_DRAIN;
392 if ( cap_write_drain(iommu->cap) )
393 val |= DMA_TLB_WRITE_DRAIN;
395 spin_lock_irqsave(&iommu->register_lock, flag);
396 /* Note: Only uses first TLB reg currently */
397 if ( val_iva )
398 dmar_writeq(iommu->reg, tlb_offset, val_iva);
399 dmar_writeq(iommu->reg, tlb_offset + 8, val);
401 /* Make sure hardware complete it */
402 start_time = NOW();
403 for ( ; ; )
404 {
405 val = dmar_readq(iommu->reg, tlb_offset + 8);
406 if ( !(val & DMA_TLB_IVT) )
407 break;
408 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
409 panic("%s: DMAR hardware is malfunctional,"
410 " please disable IOMMU\n", __func__);
411 cpu_relax();
412 }
413 spin_unlock_irqrestore(&iommu->register_lock, flag);
415 /* check IOTLB invalidation granularity */
416 if ( DMA_TLB_IAIG(val) == 0 )
417 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
419 /* flush iotlb entry will implicitly flush write buffer */
420 return 0;
421 }
423 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
424 int non_present_entry_flush)
425 {
426 struct iommu_flush *flush = iommu_get_flush(iommu);
427 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
428 non_present_entry_flush);
429 }
431 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
432 int non_present_entry_flush)
433 {
434 struct iommu_flush *flush = iommu_get_flush(iommu);
435 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
436 non_present_entry_flush);
437 }
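/*
 * get_alignment() returns the order of the smallest naturally aligned,
 * power-of-two region (in the given units) that covers [base, base+size).
 * E.g. base pfn 0x1000 with size 4 returns 2, since the order-2 block
 * 0x1000-0x1003 covers the range; an unaligned base can push the order
 * higher, which iommu_flush_iotlb_psi() checks against cap_max_amask_val().
 */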
439 static int inline get_alignment(u64 base, unsigned int size)
440 {
441 int t = 0;
442 u64 end;
444 end = base + size - 1;
445 while ( base != end )
446 {
447 t++;
448 base >>= 1;
449 end >>= 1;
450 }
451 return t;
452 }
454 static int inline iommu_flush_iotlb_psi(
455 struct iommu *iommu, u16 did,
456 u64 addr, unsigned int pages, int non_present_entry_flush)
457 {
458 unsigned int align;
459 struct iommu_flush *flush = iommu_get_flush(iommu);
461 ASSERT(!(addr & (~PAGE_MASK_4K)));
462 ASSERT(pages > 0);
464 /* Fallback to domain selective flush if no PSI support */
465 if ( !cap_pgsel_inv(iommu->cap) )
466 return iommu_flush_iotlb_dsi(iommu, did,
467 non_present_entry_flush);
469 /*
470 * PSI requires the number of pages to be a power of two, with the base
471 * address naturally aligned to that size
472 */
473 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
474 /* Fallback to domain selective flush if size is too big */
475 if ( align > cap_max_amask_val(iommu->cap) )
476 return iommu_flush_iotlb_dsi(iommu, did,
477 non_present_entry_flush);
479 addr >>= PAGE_SHIFT_4K + align;
480 addr <<= PAGE_SHIFT_4K + align;
482 return flush->iotlb(iommu, did, addr, align,
483 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
484 }
486 void iommu_flush_all(void)
487 {
488 struct acpi_drhd_unit *drhd;
489 struct iommu *iommu;
491 flush_all_cache();
492 for_each_drhd_unit ( drhd )
493 {
494 iommu = drhd->iommu;
495 iommu_flush_context_global(iommu, 0);
496 iommu_flush_iotlb_global(iommu, 0);
497 }
498 }
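/*
 * dma_pte_clear_one() drops the leaf PTE for 'addr', flushes the cache
 * line, and then issues a page-selective IOTLB invalidation (or a
 * write-buffer flush when the flush routine asks for one) on every IOMMU
 * that has this domain's bit set in iommu_bitmap.
 */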
500 /* clear one page's page table */
501 static void dma_pte_clear_one(struct domain *domain, u64 addr)
502 {
503 struct hvm_iommu *hd = domain_hvm_iommu(domain);
504 struct acpi_drhd_unit *drhd;
505 struct iommu *iommu;
506 struct dma_pte *page = NULL, *pte = NULL;
507 u64 pg_maddr;
509 spin_lock(&hd->mapping_lock);
510 /* get last level pte */
511 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
512 if ( pg_maddr == 0 )
513 {
514 spin_unlock(&hd->mapping_lock);
515 return;
516 }
518 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
519 pte = page + address_level_offset(addr, 1);
521 if ( !dma_pte_present(*pte) )
522 {
523 spin_unlock(&hd->mapping_lock);
524 unmap_vtd_domain_page(page);
525 return;
526 }
528 dma_clear_pte(*pte);
529 spin_unlock(&hd->mapping_lock);
530 iommu_flush_cache_entry(pte);
532 /* No need to hold pcidevs_lock here, since flushing is handled on device assign/deassign */
533 for_each_drhd_unit ( drhd )
534 {
535 iommu = drhd->iommu;
536 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
537 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
538 addr, 1, 0))
539 iommu_flush_write_buffer(iommu);
540 }
542 unmap_vtd_domain_page(page);
543 }
545 static void iommu_free_pagetable(u64 pt_maddr, int level)
546 {
547 int i;
548 struct dma_pte *pt_vaddr, *pte;
549 int next_level = level - 1;
551 if ( pt_maddr == 0 )
552 return;
554 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
556 for ( i = 0; i < PTE_NUM; i++ )
557 {
558 pte = &pt_vaddr[i];
559 if ( !dma_pte_present(*pte) )
560 continue;
562 if ( next_level >= 1 )
563 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
565 dma_clear_pte(*pte);
566 iommu_flush_cache_entry(pte);
567 }
569 unmap_vtd_domain_page(pt_vaddr);
570 free_pgtable_maddr(pt_maddr);
571 }
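/*
 * Point the IOMMU at its root table: allocate the root page if needed,
 * write its address to DMAR_RTADDR_REG, then latch it with the SRTP bit in
 * the global command register and wait for GSTS.RTPS to confirm.
 */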
573 static int iommu_set_root_entry(struct iommu *iommu)
574 {
575 u32 cmd, sts;
576 unsigned long flags;
577 s_time_t start_time;
579 spin_lock(&iommu->lock);
581 if ( iommu->root_maddr == 0 )
582 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
583 if ( iommu->root_maddr == 0 )
584 {
585 spin_unlock(&iommu->lock);
586 return -ENOMEM;
587 }
589 spin_unlock(&iommu->lock);
590 spin_lock_irqsave(&iommu->register_lock, flags);
591 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
592 cmd = iommu->gcmd | DMA_GCMD_SRTP;
593 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
595 /* Make sure hardware complete it */
596 start_time = NOW();
597 for ( ; ; )
598 {
599 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
600 if ( sts & DMA_GSTS_RTPS )
601 break;
602 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
603 panic("%s: DMAR hardware is malfunctional,"
604 " please disable IOMMU\n", __func__);
605 cpu_relax();
606 }
608 spin_unlock_irqrestore(&iommu->register_lock, flags);
610 return 0;
611 }
613 static void iommu_enable_translation(struct iommu *iommu)
614 {
615 u32 sts;
616 unsigned long flags;
617 s_time_t start_time;
619 dprintk(XENLOG_INFO VTDPREFIX,
620 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
621 spin_lock_irqsave(&iommu->register_lock, flags);
622 iommu->gcmd |= DMA_GCMD_TE;
623 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
624 /* Make sure hardware complete it */
625 start_time = NOW();
626 for ( ; ; )
627 {
628 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
629 if ( sts & DMA_GSTS_TES )
630 break;
631 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
632 panic("%s: DMAR hardware is malfunctional,"
633 " please disable IOMMU\n", __func__);
634 cpu_relax();
635 }
637 /* Disable PMRs when VT-d engine takes effect per spec definition */
638 disable_pmr(iommu);
639 spin_unlock_irqrestore(&iommu->register_lock, flags);
640 }
642 int iommu_disable_translation(struct iommu *iommu)
643 {
644 u32 sts;
645 unsigned long flags;
646 s_time_t start_time;
648 spin_lock_irqsave(&iommu->register_lock, flags);
649 iommu->gcmd &= ~ DMA_GCMD_TE;
650 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
652 /* Make sure hardware complete it */
653 start_time = NOW();
654 for ( ; ; )
655 {
656 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
657 if ( !(sts & DMA_GSTS_TES) )
658 break;
659 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
660 panic("%s: DMAR hardware is malfunctional,"
661 " please disable IOMMU\n", __func__);
662 cpu_relax();
663 }
664 spin_unlock_irqrestore(&iommu->register_lock, flags);
665 return 0;
666 }
668 static struct iommu *vector_to_iommu[NR_VECTORS];
669 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
670 u8 fault_reason, u16 source_id, u64 addr)
671 {
672 dprintk(XENLOG_WARNING VTDPREFIX,
673 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
674 "iommu->reg = %p\n",
675 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
676 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
677 fault_reason, iommu->reg);
679 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
680 if ( fault_reason < 0x20 )
681 print_vtd_entries(iommu, (source_id >> 8),
682 (source_id & 0xff), (addr >> PAGE_SHIFT));
683 #endif
685 return 0;
686 }
688 static void iommu_fault_status(u32 fault_status)
689 {
690 if ( fault_status & DMA_FSTS_PFO )
691 dprintk(XENLOG_ERR VTDPREFIX,
692 "iommu_fault_status: Fault Overflow\n");
693 if ( fault_status & DMA_FSTS_PPF )
694 dprintk(XENLOG_ERR VTDPREFIX,
695 "iommu_fault_status: Primary Pending Fault\n");
696 if ( fault_status & DMA_FSTS_AFO )
697 dprintk(XENLOG_ERR VTDPREFIX,
698 "iommu_fault_status: Advanced Fault Overflow\n");
699 if ( fault_status & DMA_FSTS_APF )
700 dprintk(XENLOG_ERR VTDPREFIX,
701 "iommu_fault_status: Advanced Pending Fault\n");
702 if ( fault_status & DMA_FSTS_IQE )
703 dprintk(XENLOG_ERR VTDPREFIX,
704 "iommu_fault_status: Invalidation Queue Error\n");
705 if ( fault_status & DMA_FSTS_ICE )
706 dprintk(XENLOG_ERR VTDPREFIX,
707 "iommu_fault_status: Invalidation Completion Error\n");
708 if ( fault_status & DMA_FSTS_ITE )
709 dprintk(XENLOG_ERR VTDPREFIX,
710 "iommu_fault_status: Invalidation Time-out Error\n");
711 }
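/*
 * Primary fault handler: read DMAR_FSTS_REG, then walk the fault recording
 * registers (16 bytes each, starting at cap_fault_reg_offset()), logging
 * and clearing each record whose F bit is set, and finally clear a pending
 * primary fault overflow condition.
 */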
713 #define PRIMARY_FAULT_REG_LEN (16)
714 static void iommu_page_fault(int vector, void *dev_id,
715 struct cpu_user_regs *regs)
716 {
717 struct iommu *iommu = dev_id;
718 int reg, fault_index;
719 u32 fault_status;
720 unsigned long flags;
722 dprintk(XENLOG_WARNING VTDPREFIX,
723 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
725 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
727 iommu_fault_status(fault_status);
729 /* FIXME: ignore advanced fault log */
730 if ( !(fault_status & DMA_FSTS_PPF) )
731 goto clear_overflow;
733 fault_index = dma_fsts_fault_record_index(fault_status);
734 reg = cap_fault_reg_offset(iommu->cap);
735 while (1)
736 {
737 u8 fault_reason;
738 u16 source_id;
739 u32 data;
740 u64 guest_addr;
741 int type;
743 /* highest 32 bits */
744 spin_lock_irqsave(&iommu->register_lock, flags);
745 data = dmar_readl(iommu->reg, reg +
746 fault_index * PRIMARY_FAULT_REG_LEN + 12);
747 if ( !(data & DMA_FRCD_F) )
748 {
749 spin_unlock_irqrestore(&iommu->register_lock, flags);
750 break;
751 }
753 fault_reason = dma_frcd_fault_reason(data);
754 type = dma_frcd_type(data);
756 data = dmar_readl(iommu->reg, reg +
757 fault_index * PRIMARY_FAULT_REG_LEN + 8);
758 source_id = dma_frcd_source_id(data);
760 guest_addr = dmar_readq(iommu->reg, reg +
761 fault_index * PRIMARY_FAULT_REG_LEN);
762 guest_addr = dma_frcd_page_addr(guest_addr);
763 /* clear the fault */
764 dmar_writel(iommu->reg, reg +
765 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
766 spin_unlock_irqrestore(&iommu->register_lock, flags);
768 iommu_page_fault_do_one(iommu, type, fault_reason,
769 source_id, guest_addr);
771 fault_index++;
772 if ( fault_index > cap_num_fault_regs(iommu->cap) )
773 fault_index = 0;
774 }
775 clear_overflow:
776 /* clear primary fault overflow */
777 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
778 if ( fault_status & DMA_FSTS_PFO )
779 {
780 spin_lock_irqsave(&iommu->register_lock, flags);
781 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
782 spin_unlock_irqrestore(&iommu->register_lock, flags);
783 }
784 }
786 static void dma_msi_unmask(unsigned int vector)
787 {
788 struct iommu *iommu = vector_to_iommu[vector];
789 unsigned long flags;
791 /* unmask it */
792 spin_lock_irqsave(&iommu->register_lock, flags);
793 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
794 spin_unlock_irqrestore(&iommu->register_lock, flags);
795 }
797 static void dma_msi_mask(unsigned int vector)
798 {
799 unsigned long flags;
800 struct iommu *iommu = vector_to_iommu[vector];
802 /* mask it */
803 spin_lock_irqsave(&iommu->register_lock, flags);
804 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
805 spin_unlock_irqrestore(&iommu->register_lock, flags);
806 }
808 static unsigned int dma_msi_startup(unsigned int vector)
809 {
810 dma_msi_unmask(vector);
811 return 0;
812 }
814 static void dma_msi_end(unsigned int vector)
815 {
816 dma_msi_unmask(vector);
817 ack_APIC_irq();
818 }
820 static void dma_msi_data_init(struct iommu *iommu, int vector)
821 {
822 u32 msi_data = 0;
823 unsigned long flags;
825 /* Fixed, edge, assert mode. Follow MSI setting */
826 msi_data |= vector & 0xff;
827 msi_data |= 1 << 14;
829 spin_lock_irqsave(&iommu->register_lock, flags);
830 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
831 spin_unlock_irqrestore(&iommu->register_lock, flags);
832 }
834 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
835 {
836 u64 msi_address;
837 unsigned long flags;
839 /* Physical, dedicated cpu. Follow MSI setting */
840 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
841 msi_address |= MSI_PHYSICAL_MODE << 2;
842 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
843 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
845 spin_lock_irqsave(&iommu->register_lock, flags);
846 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
847 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
848 spin_unlock_irqrestore(&iommu->register_lock, flags);
849 }
851 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
852 {
853 struct iommu *iommu = vector_to_iommu[vector];
854 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
855 }
857 static struct hw_interrupt_type dma_msi_type = {
858 .typename = "DMA_MSI",
859 .startup = dma_msi_startup,
860 .shutdown = dma_msi_mask,
861 .enable = dma_msi_unmask,
862 .disable = dma_msi_mask,
863 .ack = dma_msi_mask,
864 .end = dma_msi_end,
865 .set_affinity = dma_msi_set_affinity,
866 };
868 static int iommu_set_interrupt(struct iommu *iommu)
869 {
870 int vector, ret;
872 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
873 if ( vector <= 0 )
874 {
875 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
876 return -EINVAL;
877 }
879 irq_desc[vector].handler = &dma_msi_type;
880 vector_to_iommu[vector] = iommu;
881 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
882 if ( ret )
883 {
884 irq_desc[vector].handler = &no_irq_type;
885 vector_to_iommu[vector] = NULL;
886 free_irq_vector(vector);
887 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
888 return ret;
889 }
891 /* Make sure that vector is never re-used. */
892 vector_irq[vector] = NEVER_ASSIGN_IRQ;
894 return vector;
895 }
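/*
 * Per-DRHD setup: allocate the struct iommu and its Intel-specific state,
 * map the register block, cache the CAP/ECAP registers, derive the number
 * of page-table levels from the supported AGAW bits (SAGAW), and note
 * whether cacheline flushing will be needed for this engine.
 */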
897 static int iommu_alloc(struct acpi_drhd_unit *drhd)
898 {
899 struct iommu *iommu;
900 unsigned long sagaw;
901 int agaw;
903 if ( nr_iommus > MAX_IOMMUS )
904 {
905 gdprintk(XENLOG_ERR VTDPREFIX,
906 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
907 return -ENOMEM;
908 }
910 iommu = xmalloc(struct iommu);
911 if ( iommu == NULL )
912 return -ENOMEM;
913 memset(iommu, 0, sizeof(struct iommu));
915 iommu->intel = alloc_intel_iommu();
916 if ( iommu->intel == NULL )
917 {
918 xfree(iommu);
919 return -ENOMEM;
920 }
922 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
923 iommu->index = nr_iommus++;
925 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
926 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
928 /* Calculate number of pagetable levels: between 2 and 4. */
929 sagaw = cap_sagaw(iommu->cap);
930 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
931 if ( test_bit(agaw, &sagaw) )
932 break;
933 if ( agaw < 0 )
934 {
935 gdprintk(XENLOG_ERR VTDPREFIX,
936 "IOMMU: unsupported sagaw %lx\n", sagaw);
937 xfree(iommu);
938 return -ENODEV;
939 }
940 iommu->nr_pt_levels = agaw_to_level(agaw);
942 if ( !ecap_coherent(iommu->ecap) )
943 iommus_incoherent = 1;
945 spin_lock_init(&iommu->lock);
946 spin_lock_init(&iommu->register_lock);
948 drhd->iommu = iommu;
949 return 0;
950 }
952 static void iommu_free(struct acpi_drhd_unit *drhd)
953 {
954 struct iommu *iommu = drhd->iommu;
956 if ( iommu == NULL )
957 return;
959 if ( iommu->root_maddr != 0 )
960 {
961 free_pgtable_maddr(iommu->root_maddr);
962 iommu->root_maddr = 0;
963 }
965 if ( iommu->reg )
966 iounmap(iommu->reg);
968 free_intel_iommu(iommu->intel);
969 release_irq_vector(iommu->vector);
970 xfree(iommu);
972 drhd->iommu = NULL;
973 }
975 #define guestwidth_to_adjustwidth(gaw) ({ \
976 int agaw, r = (gaw - 12) % 9; \
977 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
978 if ( agaw > 64 ) \
979 agaw = 64; \
980 agaw; })
982 static int intel_iommu_domain_init(struct domain *d)
983 {
984 struct hvm_iommu *hd = domain_hvm_iommu(d);
985 struct iommu *iommu = NULL;
986 struct acpi_drhd_unit *drhd;
988 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
989 iommu = drhd->iommu;
991 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
993 if ( d->domain_id == 0 )
994 {
995 /* Set up 1:1 page table for dom0 */
996 iommu_set_dom0_mapping(d);
998 setup_dom0_devices(d);
999 setup_dom0_rmrr(d);
1001 iommu_flush_all();
1003 for_each_drhd_unit ( drhd )
1005 iommu = drhd->iommu;
1006 iommu_enable_translation(iommu);
1010 return 0;
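/*
 * Install the context entry for (bus, devfn) on one IOMMU: either
 * pass-through for dom0 (when enabled) or a pointer to the domain's page
 * tables, skipping top levels for IOMMUs that support fewer than four
 * levels; then flush the context cache followed by a DSI IOTLB flush (or a
 * write-buffer flush when the hardware requires one).
 */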
1013 static int domain_context_mapping_one(
1014 struct domain *domain,
1015 struct iommu *iommu,
1016 u8 bus, u8 devfn)
1018 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1019 struct context_entry *context, *context_entries;
1020 u64 maddr, pgd_maddr;
1021 struct pci_dev *pdev = NULL;
1022 int agaw;
1024 ASSERT(spin_is_locked(&pcidevs_lock));
1025 spin_lock(&iommu->lock);
1026 maddr = bus_to_context_maddr(iommu, bus);
1027 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1028 context = &context_entries[devfn];
1030 if ( context_present(*context) )
1032 int res = 0;
1034 pdev = pci_get_pdev(bus, devfn);
1035 if (!pdev)
1036 res = -ENODEV;
1037 else if (pdev->domain != domain)
1038 res = -EINVAL;
1039 unmap_vtd_domain_page(context_entries);
1040 spin_unlock(&iommu->lock);
1041 return res;
1044 if ( iommu_passthrough && (domain->domain_id == 0) )
1046 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1047 agaw = level_to_agaw(iommu->nr_pt_levels);
1049 else
1051 spin_lock(&hd->mapping_lock);
1053 /* Ensure we have pagetables allocated down to leaf PTE. */
1054 if ( hd->pgd_maddr == 0 )
1056 addr_to_dma_page_maddr(domain, 0, 1);
1057 if ( hd->pgd_maddr == 0 )
1059 nomem:
1060 spin_unlock(&hd->mapping_lock);
1061 spin_unlock(&iommu->lock);
1062 unmap_vtd_domain_page(context_entries);
1063 return -ENOMEM;
1067 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1068 pgd_maddr = hd->pgd_maddr;
1069 for ( agaw = level_to_agaw(4);
1070 agaw != level_to_agaw(iommu->nr_pt_levels);
1071 agaw-- )
1073 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1074 pgd_maddr = dma_pte_addr(*p);
1075 unmap_vtd_domain_page(p);
1076 if ( pgd_maddr == 0 )
1077 goto nomem;
1080 context_set_address_root(*context, pgd_maddr);
1081 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1082 spin_unlock(&hd->mapping_lock);
1085 /*
1086 * domain_id 0 is not valid on Intel's IOMMU, force domain_id to
1087 * be 1 based as required by intel's iommu hw.
1088 */
1089 context_set_domain_id(context, domain);
1090 context_set_address_width(*context, agaw);
1091 context_set_fault_enable(*context);
1092 context_set_present(*context);
1093 iommu_flush_cache_entry(context);
1094 spin_unlock(&iommu->lock);
1096 /* Context entry was previously non-present (with domid 0). */
1097 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1098 DMA_CCMD_MASK_NOBIT, 1) )
1099 iommu_flush_write_buffer(iommu);
1100 else
1101 iommu_flush_iotlb_dsi(iommu, 0, 1);
1103 set_bit(iommu->index, &hd->iommu_bitmap);
1105 unmap_vtd_domain_page(context_entries);
1107 return 0;
1110 #define PCI_BASE_CLASS_BRIDGE 0x06
1111 #define PCI_CLASS_BRIDGE_PCI 0x0604
1113 enum {
1114 DEV_TYPE_PCIe_ENDPOINT,
1115 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1116 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1117 DEV_TYPE_PCI,
1118 };
1120 int pdev_type(u8 bus, u8 devfn)
1122 u16 class_device;
1123 u16 status, creg;
1124 int pos;
1125 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1127 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1128 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1130 pos = pci_find_next_cap(bus, devfn,
1131 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1132 if ( !pos )
1133 return DEV_TYPE_PCI_BRIDGE;
1134 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1135 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1136 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1139 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1140 if ( !(status & PCI_STATUS_CAP_LIST) )
1141 return DEV_TYPE_PCI;
1143 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1144 return DEV_TYPE_PCIe_ENDPOINT;
1146 return DEV_TYPE_PCI;
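/*
 * bus2bridge[] records, for every secondary bus behind a PCIe-to-PCI (or
 * PCI-to-PCI) bridge, the bridge's own bus/devfn.  Requests from
 * conventional PCI devices carry the bridge's requester id, so context
 * mappings must also be installed for the bridge (and for devfn 0 on the
 * secondary bus); see domain_context_mapping() below.
 */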
1149 #define MAX_BUSES 256
1150 static DEFINE_SPINLOCK(bus2bridge_lock);
1151 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1153 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1155 int cnt = 0;
1156 *secbus = *bus;
1158 ASSERT(spin_is_locked(&bus2bridge_lock));
1159 if ( !bus2bridge[*bus].map )
1160 return 0;
1162 while ( bus2bridge[*bus].map )
1164 *secbus = *bus;
1165 *devfn = bus2bridge[*bus].devfn;
1166 *bus = bus2bridge[*bus].bus;
1167 if ( cnt++ >= MAX_BUSES )
1168 return 0;
1171 return 1;
1174 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1176 int ret = 0;
1178 if ( *bus == 0 )
1179 /* assume integrated PCI devices in RC have valid requester-id */
1180 return 1;
1182 spin_lock(&bus2bridge_lock);
1183 ret = _find_pcie_endpoint(bus, devfn, secbus);
1184 spin_unlock(&bus2bridge_lock);
1186 return ret;
1189 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1191 struct acpi_drhd_unit *drhd;
1192 int ret = 0;
1193 u16 sec_bus, sub_bus;
1194 u32 type;
1195 u8 secbus, secdevfn;
1196 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1198 BUG_ON(!pdev);
1200 drhd = acpi_find_matched_drhd_unit(pdev);
1201 if ( !drhd )
1202 return -ENODEV;
1204 ASSERT(spin_is_locked(&pcidevs_lock));
1206 type = pdev_type(bus, devfn);
1207 switch ( type )
1209 case DEV_TYPE_PCIe_BRIDGE:
1210 break;
1212 case DEV_TYPE_PCI_BRIDGE:
1213 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1214 PCI_SECONDARY_BUS);
1215 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1216 PCI_SUBORDINATE_BUS);
1218 spin_lock(&bus2bridge_lock);
1219 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1221 bus2bridge[sec_bus].map = 1;
1222 bus2bridge[sec_bus].bus = bus;
1223 bus2bridge[sec_bus].devfn = devfn;
1225 spin_unlock(&bus2bridge_lock);
1226 break;
1228 case DEV_TYPE_PCIe_ENDPOINT:
1229 gdprintk(XENLOG_INFO VTDPREFIX,
1230 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1231 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1232 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1233 break;
1235 case DEV_TYPE_PCI:
1236 gdprintk(XENLOG_INFO VTDPREFIX,
1237 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1238 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1240 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1241 if ( ret )
1242 break;
1244 secbus = bus;
1245 secdevfn = devfn;
1246 /* dependent devices mapping */
1247 while ( bus2bridge[bus].map )
1249 secbus = bus;
1250 secdevfn = devfn;
1251 devfn = bus2bridge[bus].devfn;
1252 bus = bus2bridge[bus].bus;
1253 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1254 if ( ret )
1255 return ret;
1258 if ( (secbus != bus) && (secdevfn != 0) )
1259 /*
1260 * The source-id for transactions on non-PCIe buses seems
1261 * to originate from devfn=0 on the secondary bus behind
1262 * the bridge. Map that id as well. The id to use in
1263 * these scenarios is not particularly well documented
1264 * anywhere.
1265 */
1266 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1267 break;
1269 default:
1270 gdprintk(XENLOG_ERR VTDPREFIX,
1271 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1272 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1273 ret = -EINVAL;
1274 break;
1277 return ret;
1280 static int domain_context_unmap_one(
1281 struct domain *domain,
1282 struct iommu *iommu,
1283 u8 bus, u8 devfn)
1285 struct context_entry *context, *context_entries;
1286 u64 maddr;
1288 ASSERT(spin_is_locked(&pcidevs_lock));
1289 spin_lock(&iommu->lock);
1291 maddr = bus_to_context_maddr(iommu, bus);
1292 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1293 context = &context_entries[devfn];
1295 if ( !context_present(*context) )
1297 spin_unlock(&iommu->lock);
1298 unmap_vtd_domain_page(context_entries);
1299 return 0;
1302 context_clear_present(*context);
1303 context_clear_entry(*context);
1304 iommu_flush_cache_entry(context);
1306 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1307 (((u16)bus) << 8) | devfn,
1308 DMA_CCMD_MASK_NOBIT, 0) )
1309 iommu_flush_write_buffer(iommu);
1310 else
1311 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
1313 spin_unlock(&iommu->lock);
1314 unmap_vtd_domain_page(context_entries);
1316 return 0;
1319 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1321 struct acpi_drhd_unit *drhd;
1322 int ret = 0;
1323 u32 type;
1324 u8 secbus, secdevfn;
1325 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1327 BUG_ON(!pdev);
1329 drhd = acpi_find_matched_drhd_unit(pdev);
1330 if ( !drhd )
1331 return -ENODEV;
1333 type = pdev_type(bus, devfn);
1334 switch ( type )
1336 case DEV_TYPE_PCIe_BRIDGE:
1337 case DEV_TYPE_PCI_BRIDGE:
1338 break;
1340 case DEV_TYPE_PCIe_ENDPOINT:
1341 gdprintk(XENLOG_INFO VTDPREFIX,
1342 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1343 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1344 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1345 break;
1347 case DEV_TYPE_PCI:
1348 gdprintk(XENLOG_INFO VTDPREFIX,
1349 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1350 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1351 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1352 if ( ret )
1353 break;
1355 secbus = bus;
1356 secdevfn = devfn;
1357 /* dependent devices unmapping */
1358 while ( bus2bridge[bus].map )
1360 secbus = bus;
1361 secdevfn = devfn;
1362 devfn = bus2bridge[bus].devfn;
1363 bus = bus2bridge[bus].bus;
1364 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1365 if ( ret )
1366 return ret;
1369 if ( (secbus != bus) && (secdevfn != 0) )
1370 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1371 break;
1373 default:
1374 gdprintk(XENLOG_ERR VTDPREFIX,
1375 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1376 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1377 ret = -EINVAL;
1378 break;
1381 return ret;
1384 static int reassign_device_ownership(
1385 struct domain *source,
1386 struct domain *target,
1387 u8 bus, u8 devfn)
1389 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1390 struct pci_dev *pdev;
1391 struct acpi_drhd_unit *drhd;
1392 struct iommu *pdev_iommu;
1393 int ret, found = 0;
1395 ASSERT(spin_is_locked(&pcidevs_lock));
1396 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1398 if (!pdev)
1399 return -ENODEV;
1401 drhd = acpi_find_matched_drhd_unit(pdev);
1402 pdev_iommu = drhd->iommu;
1403 domain_context_unmap(source, bus, devfn);
1405 ret = domain_context_mapping(target, bus, devfn);
1406 if ( ret )
1407 return ret;
1409 list_move(&pdev->domain_list, &target->arch.pdev_list);
1410 pdev->domain = target;
1412 for_each_pdev ( source, pdev )
1414 drhd = acpi_find_matched_drhd_unit(pdev);
1415 if ( drhd->iommu == pdev_iommu )
1417 found = 1;
1418 break;
1422 if ( !found )
1423 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1425 return ret;
1428 void iommu_domain_teardown(struct domain *d)
1430 struct hvm_iommu *hd = domain_hvm_iommu(d);
1432 if ( list_empty(&acpi_drhd_units) )
1433 return;
1435 spin_lock(&hd->mapping_lock);
1436 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1437 hd->pgd_maddr = 0;
1438 spin_unlock(&hd->mapping_lock);
1440 iommu_domid_release(d);
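/*
 * Map one gfn->mfn translation: walk/allocate down to the leaf PTE, set the
 * address and read/write bits (plus the snoop bit when Snoop Control is in
 * use), flush the cache line, and issue a page-selective IOTLB invalidation
 * on every IOMMU serving this domain.  Dom0 is skipped entirely when DMA
 * passthrough is active.
 */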
1443 int intel_iommu_map_page(
1444 struct domain *d, unsigned long gfn, unsigned long mfn)
1446 struct hvm_iommu *hd = domain_hvm_iommu(d);
1447 struct acpi_drhd_unit *drhd;
1448 struct iommu *iommu;
1449 struct dma_pte *page = NULL, *pte = NULL;
1450 u64 pg_maddr;
1451 int pte_present;
1453 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1454 iommu = drhd->iommu;
1456 /* do nothing if dom0 and iommu supports pass thru */
1457 if ( iommu_passthrough && (d->domain_id == 0) )
1458 return 0;
1460 spin_lock(&hd->mapping_lock);
1462 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1463 if ( pg_maddr == 0 )
1465 spin_unlock(&hd->mapping_lock);
1466 return -ENOMEM;
1468 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1469 pte = page + (gfn & LEVEL_MASK);
1470 pte_present = dma_pte_present(*pte);
1471 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1472 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1474 /* Set the SNP on leaf page table if Snoop Control available */
1475 if ( iommu_snoop )
1476 dma_set_pte_snp(*pte);
1478 iommu_flush_cache_entry(pte);
1479 spin_unlock(&hd->mapping_lock);
1480 unmap_vtd_domain_page(page);
1482 /*
1483 * No need to hold pcidevs_lock here, because the necessary
1484 * flushing is done when a device is assigned or deassigned
1485 */
1486 for_each_drhd_unit ( drhd )
1488 iommu = drhd->iommu;
1490 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1491 continue;
1493 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1494 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1495 !pte_present) )
1496 iommu_flush_write_buffer(iommu);
1499 return 0;
1502 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1504 struct acpi_drhd_unit *drhd;
1505 struct iommu *iommu;
1507 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1508 iommu = drhd->iommu;
1510 /* do nothing if dom0 and iommu supports pass thru */
1511 if ( iommu_passthrough && (d->domain_id == 0) )
1512 return 0;
1514 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1516 return 0;
1519 static int iommu_prepare_rmrr_dev(struct domain *d,
1520 struct acpi_rmrr_unit *rmrr,
1521 u8 bus, u8 devfn)
1523 int ret = 0;
1524 u64 base, end;
1525 unsigned long base_pfn, end_pfn;
1527 ASSERT(spin_is_locked(&pcidevs_lock));
1528 ASSERT(rmrr->base_address < rmrr->end_address);
1530 base = rmrr->base_address & PAGE_MASK_4K;
1531 base_pfn = base >> PAGE_SHIFT_4K;
1532 end = PAGE_ALIGN_4K(rmrr->end_address);
1533 end_pfn = end >> PAGE_SHIFT_4K;
1535 while ( base_pfn < end_pfn )
1537 intel_iommu_map_page(d, base_pfn, base_pfn);
1538 base_pfn++;
1541 ret = domain_context_mapping(d, bus, devfn);
1543 return ret;
1546 static int intel_iommu_add_device(struct pci_dev *pdev)
1548 struct acpi_rmrr_unit *rmrr;
1549 u16 bdf;
1550 int ret, i;
1552 ASSERT(spin_is_locked(&pcidevs_lock));
1554 if ( !pdev->domain )
1555 return -EINVAL;
1557 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1558 if ( ret )
1560 gdprintk(XENLOG_ERR VTDPREFIX,
1561 "intel_iommu_add_device: context mapping failed\n");
1562 return ret;
1565 for_each_rmrr_device ( rmrr, bdf, i )
1567 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1569 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1570 pdev->bus, pdev->devfn);
1571 if ( ret )
1572 gdprintk(XENLOG_ERR VTDPREFIX,
1573 "intel_iommu_add_device: RMRR mapping failed\n");
1574 break;
1578 return ret;
1581 static int intel_iommu_remove_device(struct pci_dev *pdev)
1583 struct acpi_rmrr_unit *rmrr;
1584 u16 bdf;
1585 int i;
1587 if ( !pdev->domain )
1588 return -EINVAL;
1590 /* If the device belongs to dom0 and it has an RMRR, don't remove it
1591 * from dom0, because the BIOS may use the RMRR at boot time.
1592 */
1593 if ( pdev->domain->domain_id == 0 )
1595 for_each_rmrr_device ( rmrr, bdf, i )
1597 if ( PCI_BUS(bdf) == pdev->bus &&
1598 PCI_DEVFN2(bdf) == pdev->devfn )
1599 return 0;
1603 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1606 static void setup_dom0_devices(struct domain *d)
1608 struct hvm_iommu *hd;
1609 struct pci_dev *pdev;
1610 int bus, dev, func;
1611 u32 l;
1613 hd = domain_hvm_iommu(d);
1615 spin_lock(&pcidevs_lock);
1616 for ( bus = 0; bus < 256; bus++ )
1618 for ( dev = 0; dev < 32; dev++ )
1620 for ( func = 0; func < 8; func++ )
1622 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1623 /* some broken boards return 0 or ~0 if a slot is empty: */
1624 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1625 (l == 0x0000ffff) || (l == 0xffff0000) )
1626 continue;
1628 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1629 pdev->domain = d;
1630 list_add(&pdev->domain_list, &d->arch.pdev_list);
1631 domain_context_mapping(d, pdev->bus, pdev->devfn);
1635 spin_unlock(&pcidevs_lock);
1638 void clear_fault_bits(struct iommu *iommu)
1640 u64 val;
1642 val = dmar_readq(
1643 iommu->reg,
1644 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1645 dmar_writeq(
1646 iommu->reg,
1647 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1648 val);
1649 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
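/*
 * Bring up each IOMMU: program the root entry, wire up the fault-event
 * interrupt, clear stale fault bits, unmask fault reporting, and install
 * the register-based flush callbacks; then set up Queued Invalidation and
 * Interrupt Remapping where those features are still enabled (Interrupt
 * Remapping only survives the gating in intel_vtd_setup() when Queued
 * Invalidation does, per this changeset).
 */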
1652 static int init_vtd_hw(void)
1654 struct acpi_drhd_unit *drhd;
1655 struct iommu *iommu;
1656 struct iommu_flush *flush = NULL;
1657 int vector;
1658 int ret;
1660 for_each_drhd_unit ( drhd )
1662 iommu = drhd->iommu;
1663 ret = iommu_set_root_entry(iommu);
1664 if ( ret )
1666 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1667 return -EIO;
1670 vector = iommu_set_interrupt(iommu);
1671 if ( vector < 0 )
1673 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1674 return vector;
1676 dma_msi_data_init(iommu, vector);
1677 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1678 iommu->vector = vector;
1679 clear_fault_bits(iommu);
1680 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1682 /* initialize flush functions */
1683 flush = iommu_get_flush(iommu);
1684 flush->context = flush_context_reg;
1685 flush->iotlb = flush_iotlb_reg;
1688 if ( iommu_qinval )
1690 for_each_drhd_unit ( drhd )
1692 iommu = drhd->iommu;
1693 if ( qinval_setup(iommu) != 0 )
1695 dprintk(XENLOG_INFO VTDPREFIX,
1696 "Failed to enable Queued Invalidation!\n");
1697 break;
1702 if ( iommu_intremap )
1704 for_each_drhd_unit ( drhd )
1706 iommu = drhd->iommu;
1707 if ( intremap_setup(iommu) != 0 )
1709 dprintk(XENLOG_INFO VTDPREFIX,
1710 "Failed to enable Interrupt Remapping!\n");
1711 break;
1716 return 0;
1719 static void setup_dom0_rmrr(struct domain *d)
1721 struct acpi_rmrr_unit *rmrr;
1722 u16 bdf;
1723 int ret, i;
1725 spin_lock(&pcidevs_lock);
1726 for_each_rmrr_device ( rmrr, bdf, i )
1728 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1729 if ( ret )
1730 gdprintk(XENLOG_ERR VTDPREFIX,
1731 "IOMMU: mapping reserved region failed\n");
1733 spin_unlock(&pcidevs_lock);
1736 static void platform_quirks(void)
1738 u32 id;
1740 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1741 id = pci_conf_read32(0, 0, 0, 0);
1742 if ( id == 0x2a408086 )
1744 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1745 rwbf_quirk = 1;
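/*
 * Top-level VT-d initialisation: apply platform quirks, allocate one
 * struct iommu per DRHD, downgrade the global feature flags (snoop,
 * passthrough, qinval, intremap) to the common denominator of all engines,
 * disable Interrupt Remapping if Queued Invalidation is unavailable,
 * allocate the domain id bitmap with id 0 reserved, and finally program
 * the hardware via init_vtd_hw().
 */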
1749 int intel_vtd_setup(void)
1751 struct acpi_drhd_unit *drhd;
1752 struct iommu *iommu;
1754 if ( !vtd_enabled )
1755 return -ENODEV;
1757 platform_quirks();
1759 spin_lock_init(&domid_bitmap_lock);
1760 clflush_size = get_cache_line_size();
1762 /* We enable the following features only if they are supported by all VT-d
1763 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1764 * Interrupt Remapping.
1765 */
1766 for_each_drhd_unit ( drhd )
1768 if ( iommu_alloc(drhd) != 0 )
1769 goto error;
1771 iommu = drhd->iommu;
1773 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1774 iommu_snoop = 0;
1776 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1777 iommu_passthrough = 0;
1779 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1780 iommu_qinval = 0;
1782 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1783 iommu_intremap = 0;
1786 if ( !iommu_qinval && iommu_intremap )
1788 iommu_intremap = 0;
1789 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1790 "since Queued Invalidation isn't supported or enabled.\n");
1793 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1794 P(iommu_snoop, "Snoop Control");
1795 P(iommu_passthrough, "DMA Passthrough");
1796 P(iommu_qinval, "Queued Invalidation");
1797 P(iommu_intremap, "Interrupt Remapping");
1798 #undef P
1800 /* Allocate IO page directory page for the domain. */
1801 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1802 iommu = drhd->iommu;
1804 /* Allocate domain id bitmap, and set bit 0 as reserved */
1805 domid_bitmap_size = cap_ndoms(iommu->cap);
1806 domid_bitmap = xmalloc_array(unsigned long,
1807 BITS_TO_LONGS(domid_bitmap_size));
1808 if ( domid_bitmap == NULL )
1809 goto error;
1810 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1811 set_bit(0, domid_bitmap);
1813 if ( init_vtd_hw() )
1814 goto error;
1816 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1818 return 0;
1820 error:
1821 for_each_drhd_unit ( drhd )
1822 iommu_free(drhd);
1823 vtd_enabled = 0;
1824 iommu_snoop = 0;
1825 iommu_passthrough = 0;
1826 iommu_qinval = 0;
1827 iommu_intremap = 0;
1828 return -ENOMEM;
1831 /*
1832 * If the device isn't owned by dom0, it has already been
1833 * assigned to another domain, or it does not exist.
1834 */
1835 int device_assigned(u8 bus, u8 devfn)
1837 struct pci_dev *pdev;
1839 spin_lock(&pcidevs_lock);
1840 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1841 if (!pdev)
1843 spin_unlock(&pcidevs_lock);
1844 return -1;
1847 spin_unlock(&pcidevs_lock);
1848 return 0;
1851 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1853 struct acpi_rmrr_unit *rmrr;
1854 int ret = 0, i;
1855 struct pci_dev *pdev;
1856 u16 bdf;
1858 if ( list_empty(&acpi_drhd_units) )
1859 return -ENODEV;
1861 ASSERT(spin_is_locked(&pcidevs_lock));
1862 pdev = pci_get_pdev(bus, devfn);
1863 if (!pdev)
1864 return -ENODEV;
1866 if (pdev->domain != dom0)
1868 gdprintk(XENLOG_ERR VTDPREFIX,
1869 "IOMMU: assign a assigned device\n");
1870 return -EBUSY;
1873 ret = reassign_device_ownership(dom0, d, bus, devfn);
1874 if ( ret )
1875 goto done;
1877 /* Setup rmrr identity mapping */
1878 for_each_rmrr_device( rmrr, bdf, i )
1880 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1882 /* FIXME: Because USB RMRR conflicts with guest bios region,
1883 * ignore USB RMRR temporarily.
1884 */
1885 if ( is_usb_device(bus, devfn) )
1887 ret = 0;
1888 goto done;
1891 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1892 if ( ret )
1893 gdprintk(XENLOG_ERR VTDPREFIX,
1894 "IOMMU: mapping reserved region failed\n");
1895 goto done;
1899 done:
1900 return ret;
1903 static int intel_iommu_group_id(u8 bus, u8 devfn)
1905 u8 secbus;
1906 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1907 return PCI_BDF2(bus, devfn);
1908 else
1909 return -1;
1912 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1913 void iommu_suspend(void)
1915 struct acpi_drhd_unit *drhd;
1916 struct iommu *iommu;
1917 u32 i;
1919 if ( !vtd_enabled )
1920 return;
1922 iommu_flush_all();
1924 for_each_drhd_unit ( drhd )
1926 iommu = drhd->iommu;
1927 i = iommu->index;
1929 iommu_state[i][DMAR_FECTL_REG] =
1930 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1931 iommu_state[i][DMAR_FEDATA_REG] =
1932 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1933 iommu_state[i][DMAR_FEADDR_REG] =
1934 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1935 iommu_state[i][DMAR_FEUADDR_REG] =
1936 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1940 void iommu_resume(void)
1942 struct acpi_drhd_unit *drhd;
1943 struct iommu *iommu;
1944 u32 i;
1946 if ( !vtd_enabled )
1947 return;
1949 iommu_flush_all();
1951 if ( init_vtd_hw() != 0 && force_iommu )
1952 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1954 for_each_drhd_unit ( drhd )
1956 iommu = drhd->iommu;
1957 i = iommu->index;
1959 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1960 (u32) iommu_state[i][DMAR_FECTL_REG]);
1961 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1962 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1963 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1964 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1965 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1966 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1967 iommu_enable_translation(iommu);
1971 struct iommu_ops intel_iommu_ops = {
1972 .init = intel_iommu_domain_init,
1973 .add_device = intel_iommu_add_device,
1974 .remove_device = intel_iommu_remove_device,
1975 .assign_device = intel_iommu_assign_device,
1976 .teardown = iommu_domain_teardown,
1977 .map_page = intel_iommu_map_page,
1978 .unmap_page = intel_iommu_unmap_page,
1979 .reassign_device = reassign_device_ownership,
1980 .get_device_group_id = intel_iommu_group_id,
1981 .update_ire_from_apic = io_apic_write_remap_rte,
1982 .update_ire_from_msi = msi_msg_write_remap_rte,
1983 };
1985 /*
1986 * Local variables:
1987 * mode: C
1988 * c-set-style: "BSD"
1989 * c-basic-offset: 4
1990 * tab-width: 4
1991 * indent-tabs-mode: nil
1992 * End:
1993 */