ia64/xen-unstable

view xen/drivers/passthrough/vtd/iommu.c @ 19733:a69daf23602a

VT-d: define a macro for waiting for hardware completion

When setting certain VT-d registers, software must wait for the hardware
to complete the operation. There was a lot of duplicated code doing that.
This patch defines a macro for it, making the code much cleaner.

Signed-off-by: Weidong Han <weidong.han@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 09:27:18 2009 +0100 (2009-06-05)
parents 931dbe86e5f3
children 4fb8a6c993e2
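
The macro this patch introduces, IOMMU_WAIT_OP(), polls a VT-d status register
until a caller-supplied condition holds, so every "write a command, then spin
until the hardware acknowledges it" sequence in the listing below (for example
in iommu_set_root_entry(), iommu_enable_translation() and the flush helpers)
collapses into a single invocation. A minimal sketch of how such a macro can be
written follows; the timeout constant name (DMAR_OPERATION_TIMEOUT) and the
panic message are illustrative assumptions, not the exact upstream definition:

/*
 * Illustrative sketch only: poll register 'offset' with 'op' (dmar_readl or
 * dmar_readq), storing the result in 'sts', until 'cond' becomes true or an
 * assumed timeout (DMAR_OPERATION_TIMEOUT) expires.
 */
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)                 \
do {                                                                \
    s_time_t start_time = NOW();                                    \
    for ( ; ; )                                                     \
    {                                                               \
        sts = op((iommu)->reg, offset);                             \
        if ( cond )                                                 \
            break;                                                  \
        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )          \
            panic("VT-d: register operation did not complete\n");   \
        cpu_relax();                                                \
    }                                                               \
} while ( 0 )
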
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * higher-level tables always set r/w; the last-level
200 * page table controls read/write
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
234 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
235 return;
236 val = iommu->gcmd | DMA_GCMD_WBF;
238 spin_lock_irqsave(&iommu->register_lock, flag);
239 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
241 /* Make sure hardware completes it */
242 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
243 !(val & DMA_GSTS_WBFS), val);
245 spin_unlock_irqrestore(&iommu->register_lock, flag);
246 }
248 /* return value determines whether we need a write buffer flush */
249 static int flush_context_reg(
250 void *_iommu,
251 u16 did, u16 source_id, u8 function_mask, u64 type,
252 int flush_non_present_entry)
253 {
254 struct iommu *iommu = (struct iommu *) _iommu;
255 u64 val = 0;
256 unsigned long flag;
258 /*
259 * In the non-present entry flush case: if the hardware doesn't cache
260 * non-present entries we do nothing; if it does cache them, we flush the
261 * entries of domain 0 (that domain id is used to cache any non-present
262 * entries).
263 */
264 if ( flush_non_present_entry )
265 {
266 if ( !cap_caching_mode(iommu->cap) )
267 return 1;
268 else
269 did = 0;
270 }
272 /* use register invalidation */
273 switch ( type )
274 {
275 case DMA_CCMD_GLOBAL_INVL:
276 val = DMA_CCMD_GLOBAL_INVL;
277 break;
278 case DMA_CCMD_DOMAIN_INVL:
279 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
280 break;
281 case DMA_CCMD_DEVICE_INVL:
282 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
283 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
284 break;
285 default:
286 BUG();
287 }
288 val |= DMA_CCMD_ICC;
290 spin_lock_irqsave(&iommu->register_lock, flag);
291 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
293 /* Make sure hardware completes it */
294 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
295 !(val & DMA_CCMD_ICC), val);
297 spin_unlock_irqrestore(&iommu->register_lock, flag);
298 /* flush context entry will implicitly flush write buffer */
299 return 0;
300 }
302 static int inline iommu_flush_context_global(
303 struct iommu *iommu, int flush_non_present_entry)
304 {
305 struct iommu_flush *flush = iommu_get_flush(iommu);
306 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
307 flush_non_present_entry);
308 }
310 static int inline iommu_flush_context_domain(
311 struct iommu *iommu, u16 did, int flush_non_present_entry)
312 {
313 struct iommu_flush *flush = iommu_get_flush(iommu);
314 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
315 flush_non_present_entry);
316 }
318 static int inline iommu_flush_context_device(
319 struct iommu *iommu, u16 did, u16 source_id,
320 u8 function_mask, int flush_non_present_entry)
321 {
322 struct iommu_flush *flush = iommu_get_flush(iommu);
323 return flush->context(iommu, did, source_id, function_mask,
324 DMA_CCMD_DEVICE_INVL,
325 flush_non_present_entry);
326 }
328 /* return value determines whether we need a write buffer flush */
329 static int flush_iotlb_reg(void *_iommu, u16 did,
330 u64 addr, unsigned int size_order, u64 type,
331 int flush_non_present_entry, int flush_dev_iotlb)
332 {
333 struct iommu *iommu = (struct iommu *) _iommu;
334 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
335 u64 val = 0, val_iva = 0;
336 unsigned long flag;
338 /*
339 * In the non-present entry flush case: if the hardware doesn't cache
340 * non-present entries we do nothing; if it does cache them, we flush the
341 * entries of domain 0 (that domain id is used to cache any non-present
342 * entries).
343 */
344 if ( flush_non_present_entry )
345 {
346 if ( !cap_caching_mode(iommu->cap) )
347 return 1;
348 else
349 did = 0;
350 }
352 /* use register invalidation */
353 switch ( type )
354 {
355 case DMA_TLB_GLOBAL_FLUSH:
356 /* global flush doesn't need to set IVA_REG */
357 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
358 break;
359 case DMA_TLB_DSI_FLUSH:
360 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
361 break;
362 case DMA_TLB_PSI_FLUSH:
363 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
364 /* Note: always flush non-leaf currently */
365 val_iva = size_order | addr;
366 break;
367 default:
368 BUG();
369 }
370 /* Note: set drain read/write */
371 if ( cap_read_drain(iommu->cap) )
372 val |= DMA_TLB_READ_DRAIN;
373 if ( cap_write_drain(iommu->cap) )
374 val |= DMA_TLB_WRITE_DRAIN;
376 spin_lock_irqsave(&iommu->register_lock, flag);
377 /* Note: Only uses first TLB reg currently */
378 if ( val_iva )
379 dmar_writeq(iommu->reg, tlb_offset, val_iva);
380 dmar_writeq(iommu->reg, tlb_offset + 8, val);
382 /* Make sure hardware completes it */
383 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
384 !(val & DMA_TLB_IVT), val);
385 spin_unlock_irqrestore(&iommu->register_lock, flag);
387 /* check IOTLB invalidation granularity */
388 if ( DMA_TLB_IAIG(val) == 0 )
389 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
391 /* flush iotlb entry will implicitly flush write buffer */
392 return 0;
393 }
395 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
396 int flush_non_present_entry, int flush_dev_iotlb)
397 {
398 struct iommu_flush *flush = iommu_get_flush(iommu);
399 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
400 flush_non_present_entry, flush_dev_iotlb);
401 }
403 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
404 int flush_non_present_entry, int flush_dev_iotlb)
405 {
406 struct iommu_flush *flush = iommu_get_flush(iommu);
407 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
408 flush_non_present_entry, flush_dev_iotlb);
409 }
411 static int inline get_alignment(u64 base, unsigned int size)
412 {
413 int t = 0;
414 u64 end;
416 end = base + size - 1;
417 while ( base != end )
418 {
419 t++;
420 base >>= 1;
421 end >>= 1;
422 }
423 return t;
424 }
426 static int inline iommu_flush_iotlb_psi(
427 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
428 int flush_non_present_entry, int flush_dev_iotlb)
429 {
430 unsigned int align;
431 struct iommu_flush *flush = iommu_get_flush(iommu);
433 ASSERT(!(addr & (~PAGE_MASK_4K)));
434 ASSERT(pages > 0);
436 /* Fallback to domain selective flush if no PSI support */
437 if ( !cap_pgsel_inv(iommu->cap) )
438 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
440 /*
441 * PSI requires the flushed region to be 2^x pages in size and the base
442 * address to be naturally aligned to that size
443 */
444 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
445 /* Fallback to domain selective flush if size is too big */
446 if ( align > cap_max_amask_val(iommu->cap) )
447 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
449 addr >>= PAGE_SHIFT_4K + align;
450 addr <<= PAGE_SHIFT_4K + align;
452 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
453 flush_non_present_entry, flush_dev_iotlb);
454 }
456 void iommu_flush_all(void)
457 {
458 struct acpi_drhd_unit *drhd;
459 struct iommu *iommu;
460 int flush_dev_iotlb;
462 flush_all_cache();
463 for_each_drhd_unit ( drhd )
464 {
465 iommu = drhd->iommu;
466 iommu_flush_context_global(iommu, 0);
467 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
468 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
469 }
470 }
472 /* clear one page's page table */
473 static void dma_pte_clear_one(struct domain *domain, u64 addr)
474 {
475 struct hvm_iommu *hd = domain_hvm_iommu(domain);
476 struct acpi_drhd_unit *drhd;
477 struct iommu *iommu;
478 struct dma_pte *page = NULL, *pte = NULL;
479 u64 pg_maddr;
480 int flush_dev_iotlb;
482 spin_lock(&hd->mapping_lock);
483 /* get last level pte */
484 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
485 if ( pg_maddr == 0 )
486 {
487 spin_unlock(&hd->mapping_lock);
488 return;
489 }
491 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
492 pte = page + address_level_offset(addr, 1);
494 if ( !dma_pte_present(*pte) )
495 {
496 spin_unlock(&hd->mapping_lock);
497 unmap_vtd_domain_page(page);
498 return;
499 }
501 dma_clear_pte(*pte);
502 spin_unlock(&hd->mapping_lock);
503 iommu_flush_cache_entry(pte);
505 /* No need for pcidevs_lock here, since we flush when assigning/deassigning devices. */
506 for_each_drhd_unit ( drhd )
507 {
508 iommu = drhd->iommu;
509 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
510 {
511 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
512 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
513 addr, 1, 0, flush_dev_iotlb) )
514 iommu_flush_write_buffer(iommu);
515 }
516 }
518 unmap_vtd_domain_page(page);
519 }
521 static void iommu_free_pagetable(u64 pt_maddr, int level)
522 {
523 int i;
524 struct dma_pte *pt_vaddr, *pte;
525 int next_level = level - 1;
527 if ( pt_maddr == 0 )
528 return;
530 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
532 for ( i = 0; i < PTE_NUM; i++ )
533 {
534 pte = &pt_vaddr[i];
535 if ( !dma_pte_present(*pte) )
536 continue;
538 if ( next_level >= 1 )
539 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
541 dma_clear_pte(*pte);
542 iommu_flush_cache_entry(pte);
543 }
545 unmap_vtd_domain_page(pt_vaddr);
546 free_pgtable_maddr(pt_maddr);
547 }
549 static int iommu_set_root_entry(struct iommu *iommu)
550 {
551 u32 cmd, sts;
552 unsigned long flags;
554 spin_lock(&iommu->lock);
556 if ( iommu->root_maddr == 0 )
557 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
558 if ( iommu->root_maddr == 0 )
559 {
560 spin_unlock(&iommu->lock);
561 return -ENOMEM;
562 }
564 spin_unlock(&iommu->lock);
565 spin_lock_irqsave(&iommu->register_lock, flags);
566 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
567 cmd = iommu->gcmd | DMA_GCMD_SRTP;
568 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
570 /* Make sure hardware completes it */
571 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
572 (sts & DMA_GSTS_RTPS), sts);
573 spin_unlock_irqrestore(&iommu->register_lock, flags);
575 return 0;
576 }
578 static void iommu_enable_translation(struct iommu *iommu)
579 {
580 u32 sts;
581 unsigned long flags;
583 dprintk(XENLOG_INFO VTDPREFIX,
584 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
585 spin_lock_irqsave(&iommu->register_lock, flags);
586 iommu->gcmd |= DMA_GCMD_TE;
587 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
589 /* Make sure hardware completes it */
590 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
591 (sts & DMA_GSTS_TES), sts);
593 /* Disable PMRs when VT-d engine takes effect per spec definition */
594 disable_pmr(iommu);
595 spin_unlock_irqrestore(&iommu->register_lock, flags);
596 }
598 static void iommu_disable_translation(struct iommu *iommu)
599 {
600 u32 sts;
601 unsigned long flags;
603 spin_lock_irqsave(&iommu->register_lock, flags);
604 iommu->gcmd &= ~ DMA_GCMD_TE;
605 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
607 /* Make sure hardware completes it */
608 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
609 !(sts & DMA_GSTS_TES), sts);
610 spin_unlock_irqrestore(&iommu->register_lock, flags);
611 }
613 static struct iommu *vector_to_iommu[NR_VECTORS];
614 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
615 u8 fault_reason, u16 source_id, u64 addr)
616 {
617 dprintk(XENLOG_WARNING VTDPREFIX,
618 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
619 "iommu->reg = %p\n",
620 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
621 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
622 fault_reason, iommu->reg);
624 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
625 if ( fault_reason < 0x20 )
626 print_vtd_entries(iommu, (source_id >> 8),
627 (source_id & 0xff), (addr >> PAGE_SHIFT));
628 #endif
630 return 0;
631 }
633 static void iommu_fault_status(u32 fault_status)
634 {
635 if ( fault_status & DMA_FSTS_PFO )
636 dprintk(XENLOG_ERR VTDPREFIX,
637 "iommu_fault_status: Fault Overflow\n");
638 if ( fault_status & DMA_FSTS_PPF )
639 dprintk(XENLOG_ERR VTDPREFIX,
640 "iommu_fault_status: Primary Pending Fault\n");
641 if ( fault_status & DMA_FSTS_AFO )
642 dprintk(XENLOG_ERR VTDPREFIX,
643 "iommu_fault_status: Advanced Fault Overflow\n");
644 if ( fault_status & DMA_FSTS_APF )
645 dprintk(XENLOG_ERR VTDPREFIX,
646 "iommu_fault_status: Advanced Pending Fault\n");
647 if ( fault_status & DMA_FSTS_IQE )
648 dprintk(XENLOG_ERR VTDPREFIX,
649 "iommu_fault_status: Invalidation Queue Error\n");
650 if ( fault_status & DMA_FSTS_ICE )
651 dprintk(XENLOG_ERR VTDPREFIX,
652 "iommu_fault_status: Invalidation Completion Error\n");
653 if ( fault_status & DMA_FSTS_ITE )
654 dprintk(XENLOG_ERR VTDPREFIX,
655 "iommu_fault_status: Invalidation Time-out Error\n");
656 }
658 #define PRIMARY_FAULT_REG_LEN (16)
659 static void iommu_page_fault(int vector, void *dev_id,
660 struct cpu_user_regs *regs)
661 {
662 struct iommu *iommu = dev_id;
663 int reg, fault_index;
664 u32 fault_status;
665 unsigned long flags;
667 dprintk(XENLOG_WARNING VTDPREFIX,
668 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
670 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
672 iommu_fault_status(fault_status);
674 /* FIXME: ignore advanced fault log */
675 if ( !(fault_status & DMA_FSTS_PPF) )
676 goto clear_overflow;
678 fault_index = dma_fsts_fault_record_index(fault_status);
679 reg = cap_fault_reg_offset(iommu->cap);
680 while (1)
681 {
682 u8 fault_reason;
683 u16 source_id;
684 u32 data;
685 u64 guest_addr;
686 int type;
688 /* highest 32 bits */
689 spin_lock_irqsave(&iommu->register_lock, flags);
690 data = dmar_readl(iommu->reg, reg +
691 fault_index * PRIMARY_FAULT_REG_LEN + 12);
692 if ( !(data & DMA_FRCD_F) )
693 {
694 spin_unlock_irqrestore(&iommu->register_lock, flags);
695 break;
696 }
698 fault_reason = dma_frcd_fault_reason(data);
699 type = dma_frcd_type(data);
701 data = dmar_readl(iommu->reg, reg +
702 fault_index * PRIMARY_FAULT_REG_LEN + 8);
703 source_id = dma_frcd_source_id(data);
705 guest_addr = dmar_readq(iommu->reg, reg +
706 fault_index * PRIMARY_FAULT_REG_LEN);
707 guest_addr = dma_frcd_page_addr(guest_addr);
708 /* clear the fault */
709 dmar_writel(iommu->reg, reg +
710 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
711 spin_unlock_irqrestore(&iommu->register_lock, flags);
713 iommu_page_fault_do_one(iommu, type, fault_reason,
714 source_id, guest_addr);
716 fault_index++;
717 if ( fault_index > cap_num_fault_regs(iommu->cap) )
718 fault_index = 0;
719 }
720 clear_overflow:
721 /* clear primary fault overflow */
722 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
723 if ( fault_status & DMA_FSTS_PFO )
724 {
725 spin_lock_irqsave(&iommu->register_lock, flags);
726 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
727 spin_unlock_irqrestore(&iommu->register_lock, flags);
728 }
729 }
731 static void dma_msi_unmask(unsigned int vector)
732 {
733 struct iommu *iommu = vector_to_iommu[vector];
734 unsigned long flags;
736 /* unmask it */
737 spin_lock_irqsave(&iommu->register_lock, flags);
738 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
739 spin_unlock_irqrestore(&iommu->register_lock, flags);
740 }
742 static void dma_msi_mask(unsigned int vector)
743 {
744 unsigned long flags;
745 struct iommu *iommu = vector_to_iommu[vector];
747 /* mask it */
748 spin_lock_irqsave(&iommu->register_lock, flags);
749 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
750 spin_unlock_irqrestore(&iommu->register_lock, flags);
751 }
753 static unsigned int dma_msi_startup(unsigned int vector)
754 {
755 dma_msi_unmask(vector);
756 return 0;
757 }
759 static void dma_msi_end(unsigned int vector)
760 {
761 dma_msi_unmask(vector);
762 ack_APIC_irq();
763 }
765 static void dma_msi_data_init(struct iommu *iommu, int vector)
766 {
767 u32 msi_data = 0;
768 unsigned long flags;
770 /* Fixed, edge, assert mode. Follow MSI setting */
771 msi_data |= vector & 0xff;
772 msi_data |= 1 << 14;
774 spin_lock_irqsave(&iommu->register_lock, flags);
775 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
776 spin_unlock_irqrestore(&iommu->register_lock, flags);
777 }
779 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
780 {
781 u64 msi_address;
782 unsigned long flags;
784 /* Physical, dedicated cpu. Follow MSI setting */
785 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
786 msi_address |= MSI_PHYSICAL_MODE << 2;
787 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
788 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
790 spin_lock_irqsave(&iommu->register_lock, flags);
791 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
792 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
793 spin_unlock_irqrestore(&iommu->register_lock, flags);
794 }
796 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
797 {
798 struct iommu *iommu = vector_to_iommu[vector];
799 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
800 }
802 static struct hw_interrupt_type dma_msi_type = {
803 .typename = "DMA_MSI",
804 .startup = dma_msi_startup,
805 .shutdown = dma_msi_mask,
806 .enable = dma_msi_unmask,
807 .disable = dma_msi_mask,
808 .ack = dma_msi_mask,
809 .end = dma_msi_end,
810 .set_affinity = dma_msi_set_affinity,
811 };
813 static int iommu_set_interrupt(struct iommu *iommu)
814 {
815 int vector, ret;
817 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
818 if ( vector <= 0 )
819 {
820 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
821 return -EINVAL;
822 }
824 irq_desc[vector].handler = &dma_msi_type;
825 vector_to_iommu[vector] = iommu;
826 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
827 if ( ret )
828 {
829 irq_desc[vector].handler = &no_irq_type;
830 vector_to_iommu[vector] = NULL;
831 free_irq_vector(vector);
832 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
833 return ret;
834 }
836 /* Make sure that vector is never re-used. */
837 vector_irq[vector] = NEVER_ASSIGN_IRQ;
839 return vector;
840 }
842 static int iommu_alloc(struct acpi_drhd_unit *drhd)
843 {
844 struct iommu *iommu;
845 unsigned long sagaw;
846 int agaw;
848 if ( nr_iommus > MAX_IOMMUS )
849 {
850 gdprintk(XENLOG_ERR VTDPREFIX,
851 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
852 return -ENOMEM;
853 }
855 iommu = xmalloc(struct iommu);
856 if ( iommu == NULL )
857 return -ENOMEM;
858 memset(iommu, 0, sizeof(struct iommu));
860 iommu->vector = -1; /* No vector assigned yet. */
862 iommu->intel = alloc_intel_iommu();
863 if ( iommu->intel == NULL )
864 {
865 xfree(iommu);
866 return -ENOMEM;
867 }
869 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
870 iommu->index = nr_iommus++;
872 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
873 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
875 gdprintk(XENLOG_INFO VTDPREFIX,
876 "drhd->address = %"PRIx64"\n", drhd->address);
877 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
879 /* Calculate number of pagetable levels: between 2 and 4. */
880 sagaw = cap_sagaw(iommu->cap);
881 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
882 if ( test_bit(agaw, &sagaw) )
883 break;
884 if ( agaw < 0 )
885 {
886 gdprintk(XENLOG_ERR VTDPREFIX,
887 "IOMMU: unsupported sagaw %lx\n", sagaw);
888 xfree(iommu);
889 return -ENODEV;
890 }
891 iommu->nr_pt_levels = agaw_to_level(agaw);
893 if ( !ecap_coherent(iommu->ecap) )
894 iommus_incoherent = 1;
896 spin_lock_init(&iommu->lock);
897 spin_lock_init(&iommu->register_lock);
899 drhd->iommu = iommu;
900 return 0;
901 }
903 static void iommu_free(struct acpi_drhd_unit *drhd)
904 {
905 struct iommu *iommu = drhd->iommu;
907 if ( iommu == NULL )
908 return;
910 if ( iommu->root_maddr != 0 )
911 {
912 free_pgtable_maddr(iommu->root_maddr);
913 iommu->root_maddr = 0;
914 }
916 if ( iommu->reg )
917 iounmap(iommu->reg);
919 free_intel_iommu(iommu->intel);
920 release_irq_vector(iommu->vector);
921 xfree(iommu);
923 drhd->iommu = NULL;
924 }
926 #define guestwidth_to_adjustwidth(gaw) ({ \
927 int agaw, r = (gaw - 12) % 9; \
928 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
929 if ( agaw > 64 ) \
930 agaw = 64; \
931 agaw; })
933 static int intel_iommu_domain_init(struct domain *d)
934 {
935 struct hvm_iommu *hd = domain_hvm_iommu(d);
936 struct iommu *iommu = NULL;
937 struct acpi_drhd_unit *drhd;
939 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
940 iommu = drhd->iommu;
942 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
944 if ( d->domain_id == 0 )
945 {
946 /* Set up 1:1 page table for dom0 */
947 iommu_set_dom0_mapping(d);
949 setup_dom0_devices(d);
950 setup_dom0_rmrr(d);
952 iommu_flush_all();
954 for_each_drhd_unit ( drhd )
955 {
956 iommu = drhd->iommu;
957 iommu_enable_translation(iommu);
958 }
959 }
961 return 0;
962 }
964 static int domain_context_mapping_one(
965 struct domain *domain,
966 struct iommu *iommu,
967 u8 bus, u8 devfn)
968 {
969 struct hvm_iommu *hd = domain_hvm_iommu(domain);
970 struct context_entry *context, *context_entries;
971 u64 maddr, pgd_maddr;
972 struct pci_dev *pdev = NULL;
973 int agaw;
975 ASSERT(spin_is_locked(&pcidevs_lock));
976 spin_lock(&iommu->lock);
977 maddr = bus_to_context_maddr(iommu, bus);
978 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
979 context = &context_entries[devfn];
981 if ( context_present(*context) )
982 {
983 int res = 0;
985 pdev = pci_get_pdev(bus, devfn);
986 if (!pdev)
987 res = -ENODEV;
988 else if (pdev->domain != domain)
989 res = -EINVAL;
990 unmap_vtd_domain_page(context_entries);
991 spin_unlock(&iommu->lock);
992 return res;
993 }
995 if ( iommu_passthrough && (domain->domain_id == 0) )
996 {
997 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
998 agaw = level_to_agaw(iommu->nr_pt_levels);
999 }
1000 else
1002 spin_lock(&hd->mapping_lock);
1004 /* Ensure we have pagetables allocated down to leaf PTE. */
1005 if ( hd->pgd_maddr == 0 )
1007 addr_to_dma_page_maddr(domain, 0, 1);
1008 if ( hd->pgd_maddr == 0 )
1010 nomem:
1011 spin_unlock(&hd->mapping_lock);
1012 spin_unlock(&iommu->lock);
1013 unmap_vtd_domain_page(context_entries);
1014 return -ENOMEM;
1018 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1019 pgd_maddr = hd->pgd_maddr;
1020 for ( agaw = level_to_agaw(4);
1021 agaw != level_to_agaw(iommu->nr_pt_levels);
1022 agaw-- )
1024 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1025 pgd_maddr = dma_pte_addr(*p);
1026 unmap_vtd_domain_page(p);
1027 if ( pgd_maddr == 0 )
1028 goto nomem;
1031 context_set_address_root(*context, pgd_maddr);
1032 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1033 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1034 else
1035 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1037 spin_unlock(&hd->mapping_lock);
1040 /*
1041 * domain_id 0 is not valid on Intel's IOMMU; force domain ids to
1042 * be 1-based, as required by Intel's IOMMU hardware.
1043 */
1044 context_set_domain_id(context, domain);
1045 context_set_address_width(*context, agaw);
1046 context_set_fault_enable(*context);
1047 context_set_present(*context);
1048 iommu_flush_cache_entry(context);
1049 spin_unlock(&iommu->lock);
1051 /* Context entry was previously non-present (with domid 0). */
1052 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1053 DMA_CCMD_MASK_NOBIT, 1) )
1054 iommu_flush_write_buffer(iommu);
1055 else
1057 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1058 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1061 set_bit(iommu->index, &hd->iommu_bitmap);
1063 unmap_vtd_domain_page(context_entries);
1065 return 0;
1068 #define PCI_BASE_CLASS_BRIDGE 0x06
1069 #define PCI_CLASS_BRIDGE_PCI 0x0604
1071 enum {
1072 DEV_TYPE_PCIe_ENDPOINT,
1073 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1074 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1075 DEV_TYPE_PCI,
1076 };
1078 int pdev_type(u8 bus, u8 devfn)
1080 u16 class_device;
1081 u16 status, creg;
1082 int pos;
1083 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1085 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1086 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1088 pos = pci_find_next_cap(bus, devfn,
1089 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1090 if ( !pos )
1091 return DEV_TYPE_PCI_BRIDGE;
1092 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1093 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1094 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1097 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1098 if ( !(status & PCI_STATUS_CAP_LIST) )
1099 return DEV_TYPE_PCI;
1101 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1102 return DEV_TYPE_PCIe_ENDPOINT;
1104 return DEV_TYPE_PCI;
1107 #define MAX_BUSES 256
1108 static DEFINE_SPINLOCK(bus2bridge_lock);
1109 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1111 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1113 int cnt = 0;
1114 *secbus = *bus;
1116 ASSERT(spin_is_locked(&bus2bridge_lock));
1117 if ( !bus2bridge[*bus].map )
1118 return 0;
1120 while ( bus2bridge[*bus].map )
1122 *secbus = *bus;
1123 *devfn = bus2bridge[*bus].devfn;
1124 *bus = bus2bridge[*bus].bus;
1125 if ( cnt++ >= MAX_BUSES )
1126 return 0;
1129 return 1;
1132 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1134 int ret = 0;
1136 if ( *bus == 0 )
1137 /* assume integrated PCI devices in RC have valid requester-id */
1138 return 1;
1140 spin_lock(&bus2bridge_lock);
1141 ret = _find_pcie_endpoint(bus, devfn, secbus);
1142 spin_unlock(&bus2bridge_lock);
1144 return ret;
1147 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1149 struct acpi_drhd_unit *drhd;
1150 int ret = 0;
1151 u16 sec_bus, sub_bus;
1152 u32 type;
1153 u8 secbus, secdevfn;
1154 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1156 if ( pdev == NULL )
1158 /* We can reach here by setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1159 * -> domain_context_mapping().
1160 * In case a user enables VT-d but disables USB (which usually needs an
1161 * RMRR) in the BIOS, we can't discover the BDF of the USB controller in
1162 * setup_dom0_devices(), but the ACPI RMRR structures may still contain
1163 * that BDF, so pci_get_pdev() ends up returning NULL here.
1164 */
1165 gdprintk(XENLOG_WARNING VTDPREFIX,
1166 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1167 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1168 return 0;
1171 drhd = acpi_find_matched_drhd_unit(pdev);
1172 if ( !drhd )
1173 return -ENODEV;
1175 ASSERT(spin_is_locked(&pcidevs_lock));
1177 type = pdev_type(bus, devfn);
1178 switch ( type )
1180 case DEV_TYPE_PCIe_BRIDGE:
1181 break;
1183 case DEV_TYPE_PCI_BRIDGE:
1184 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1185 PCI_SECONDARY_BUS);
1186 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1187 PCI_SUBORDINATE_BUS);
1189 spin_lock(&bus2bridge_lock);
1190 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1192 bus2bridge[sec_bus].map = 1;
1193 bus2bridge[sec_bus].bus = bus;
1194 bus2bridge[sec_bus].devfn = devfn;
1196 spin_unlock(&bus2bridge_lock);
1197 break;
1199 case DEV_TYPE_PCIe_ENDPOINT:
1200 gdprintk(XENLOG_INFO VTDPREFIX,
1201 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1202 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1203 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1204 break;
1206 case DEV_TYPE_PCI:
1207 gdprintk(XENLOG_INFO VTDPREFIX,
1208 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1209 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1211 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1212 if ( ret )
1213 break;
1215 secbus = bus;
1216 secdevfn = devfn;
1217 /* dependent devices mapping */
1218 while ( bus2bridge[bus].map )
1220 secbus = bus;
1221 secdevfn = devfn;
1222 devfn = bus2bridge[bus].devfn;
1223 bus = bus2bridge[bus].bus;
1224 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1225 if ( ret )
1226 return ret;
1229 if ( (secbus != bus) && (secdevfn != 0) )
1230 /*
1231 * The source-id for transactions on non-PCIe buses seems
1232 * to originate from devfn=0 on the secondary bus behind
1233 * the bridge. Map that id as well. The id to use in
1234 * these scenarios is not particularly well documented
1235 * anywhere.
1236 */
1237 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1238 break;
1240 default:
1241 gdprintk(XENLOG_ERR VTDPREFIX,
1242 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1243 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1244 ret = -EINVAL;
1245 break;
1248 return ret;
1251 static int domain_context_unmap_one(
1252 struct domain *domain,
1253 struct iommu *iommu,
1254 u8 bus, u8 devfn)
1256 struct context_entry *context, *context_entries;
1257 u64 maddr;
1259 ASSERT(spin_is_locked(&pcidevs_lock));
1260 spin_lock(&iommu->lock);
1262 maddr = bus_to_context_maddr(iommu, bus);
1263 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1264 context = &context_entries[devfn];
1266 if ( !context_present(*context) )
1268 spin_unlock(&iommu->lock);
1269 unmap_vtd_domain_page(context_entries);
1270 return 0;
1273 context_clear_present(*context);
1274 context_clear_entry(*context);
1275 iommu_flush_cache_entry(context);
1277 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1278 (((u16)bus) << 8) | devfn,
1279 DMA_CCMD_MASK_NOBIT, 0) )
1280 iommu_flush_write_buffer(iommu);
1281 else
1283 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1284 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1287 spin_unlock(&iommu->lock);
1288 unmap_vtd_domain_page(context_entries);
1290 return 0;
1293 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1295 struct acpi_drhd_unit *drhd;
1296 int ret = 0;
1297 u32 type;
1298 u8 secbus, secdevfn;
1299 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1301 BUG_ON(!pdev);
1303 drhd = acpi_find_matched_drhd_unit(pdev);
1304 if ( !drhd )
1305 return -ENODEV;
1307 type = pdev_type(bus, devfn);
1308 switch ( type )
1310 case DEV_TYPE_PCIe_BRIDGE:
1311 case DEV_TYPE_PCI_BRIDGE:
1312 break;
1314 case DEV_TYPE_PCIe_ENDPOINT:
1315 gdprintk(XENLOG_INFO VTDPREFIX,
1316 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1317 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1318 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1319 break;
1321 case DEV_TYPE_PCI:
1322 gdprintk(XENLOG_INFO VTDPREFIX,
1323 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1324 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1325 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1326 if ( ret )
1327 break;
1329 secbus = bus;
1330 secdevfn = devfn;
1331 /* dependent devices unmapping */
1332 while ( bus2bridge[bus].map )
1334 secbus = bus;
1335 secdevfn = devfn;
1336 devfn = bus2bridge[bus].devfn;
1337 bus = bus2bridge[bus].bus;
1338 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1339 if ( ret )
1340 return ret;
1343 if ( (secbus != bus) && (secdevfn != 0) )
1344 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1345 break;
1347 default:
1348 gdprintk(XENLOG_ERR VTDPREFIX,
1349 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1350 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1351 ret = -EINVAL;
1352 break;
1355 return ret;
1358 static int reassign_device_ownership(
1359 struct domain *source,
1360 struct domain *target,
1361 u8 bus, u8 devfn)
1363 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1364 struct pci_dev *pdev;
1365 struct acpi_drhd_unit *drhd;
1366 struct iommu *pdev_iommu;
1367 int ret, found = 0;
1369 ASSERT(spin_is_locked(&pcidevs_lock));
1370 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1372 if (!pdev)
1373 return -ENODEV;
1375 drhd = acpi_find_matched_drhd_unit(pdev);
1376 pdev_iommu = drhd->iommu;
1377 domain_context_unmap(source, bus, devfn);
1379 ret = domain_context_mapping(target, bus, devfn);
1380 if ( ret )
1381 return ret;
1383 list_move(&pdev->domain_list, &target->arch.pdev_list);
1384 pdev->domain = target;
1386 for_each_pdev ( source, pdev )
1388 drhd = acpi_find_matched_drhd_unit(pdev);
1389 if ( drhd->iommu == pdev_iommu )
1391 found = 1;
1392 break;
1396 if ( !found )
1397 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1399 return ret;
1402 void iommu_domain_teardown(struct domain *d)
1404 struct hvm_iommu *hd = domain_hvm_iommu(d);
1406 if ( list_empty(&acpi_drhd_units) )
1407 return;
1409 spin_lock(&hd->mapping_lock);
1410 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1411 hd->pgd_maddr = 0;
1412 spin_unlock(&hd->mapping_lock);
1414 iommu_domid_release(d);
1417 int intel_iommu_map_page(
1418 struct domain *d, unsigned long gfn, unsigned long mfn)
1420 struct hvm_iommu *hd = domain_hvm_iommu(d);
1421 struct acpi_drhd_unit *drhd;
1422 struct iommu *iommu;
1423 struct dma_pte *page = NULL, *pte = NULL;
1424 u64 pg_maddr;
1425 int pte_present;
1426 int flush_dev_iotlb;
1428 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1429 iommu = drhd->iommu;
1431 /* do nothing for dom0 if the IOMMU supports pass-through */
1432 if ( iommu_passthrough && (d->domain_id == 0) )
1433 return 0;
1435 spin_lock(&hd->mapping_lock);
1437 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1438 if ( pg_maddr == 0 )
1440 spin_unlock(&hd->mapping_lock);
1441 return -ENOMEM;
1443 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1444 pte = page + (gfn & LEVEL_MASK);
1445 pte_present = dma_pte_present(*pte);
1446 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1447 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1449 /* Set the SNP on leaf page table if Snoop Control available */
1450 if ( iommu_snoop )
1451 dma_set_pte_snp(*pte);
1453 iommu_flush_cache_entry(pte);
1454 spin_unlock(&hd->mapping_lock);
1455 unmap_vtd_domain_page(page);
1457 /*
1458 * No need for pcidevs_lock here, because we flush
1459 * when assigning/deassigning devices.
1460 */
1461 for_each_drhd_unit ( drhd )
1463 iommu = drhd->iommu;
1465 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1466 continue;
1468 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1469 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1470 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1471 !pte_present, flush_dev_iotlb) )
1472 iommu_flush_write_buffer(iommu);
1475 return 0;
1478 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1480 struct acpi_drhd_unit *drhd;
1481 struct iommu *iommu;
1483 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1484 iommu = drhd->iommu;
1486 /* do nothing for dom0 if the IOMMU supports pass-through */
1487 if ( iommu_passthrough && (d->domain_id == 0) )
1488 return 0;
1490 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1492 return 0;
1495 static int iommu_prepare_rmrr_dev(struct domain *d,
1496 struct acpi_rmrr_unit *rmrr,
1497 u8 bus, u8 devfn)
1499 int ret = 0;
1500 u64 base, end;
1501 unsigned long base_pfn, end_pfn;
1503 ASSERT(spin_is_locked(&pcidevs_lock));
1504 ASSERT(rmrr->base_address < rmrr->end_address);
1506 base = rmrr->base_address & PAGE_MASK_4K;
1507 base_pfn = base >> PAGE_SHIFT_4K;
1508 end = PAGE_ALIGN_4K(rmrr->end_address);
1509 end_pfn = end >> PAGE_SHIFT_4K;
1511 while ( base_pfn < end_pfn )
1513 intel_iommu_map_page(d, base_pfn, base_pfn);
1514 base_pfn++;
1517 ret = domain_context_mapping(d, bus, devfn);
1519 return ret;
1522 static int intel_iommu_add_device(struct pci_dev *pdev)
1524 struct acpi_rmrr_unit *rmrr;
1525 u16 bdf;
1526 int ret, i;
1528 ASSERT(spin_is_locked(&pcidevs_lock));
1530 if ( !pdev->domain )
1531 return -EINVAL;
1533 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1534 if ( ret )
1536 gdprintk(XENLOG_ERR VTDPREFIX,
1537 "intel_iommu_add_device: context mapping failed\n");
1538 return ret;
1541 for_each_rmrr_device ( rmrr, bdf, i )
1543 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1545 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1546 pdev->bus, pdev->devfn);
1547 if ( ret )
1548 gdprintk(XENLOG_ERR VTDPREFIX,
1549 "intel_iommu_add_device: RMRR mapping failed\n");
1550 break;
1554 return ret;
1557 static int intel_iommu_remove_device(struct pci_dev *pdev)
1559 struct acpi_rmrr_unit *rmrr;
1560 u16 bdf;
1561 int i;
1563 if ( !pdev->domain )
1564 return -EINVAL;
1566 /* If the device belongs to dom0 and it has an RMRR, don't remove it
1567 * from dom0, because the BIOS may use the RMRR at boot time.
1568 */
1569 if ( pdev->domain->domain_id == 0 )
1571 for_each_rmrr_device ( rmrr, bdf, i )
1573 if ( PCI_BUS(bdf) == pdev->bus &&
1574 PCI_DEVFN2(bdf) == pdev->devfn )
1575 return 0;
1579 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1582 static void setup_dom0_devices(struct domain *d)
1584 struct hvm_iommu *hd;
1585 struct pci_dev *pdev;
1586 int bus, dev, func;
1587 u32 l;
1589 hd = domain_hvm_iommu(d);
1591 spin_lock(&pcidevs_lock);
1592 for ( bus = 0; bus < 256; bus++ )
1594 for ( dev = 0; dev < 32; dev++ )
1596 for ( func = 0; func < 8; func++ )
1598 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1599 /* some broken boards return 0 or ~0 if a slot is empty: */
1600 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1601 (l == 0x0000ffff) || (l == 0xffff0000) )
1602 continue;
1604 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1605 pdev->domain = d;
1606 list_add(&pdev->domain_list, &d->arch.pdev_list);
1607 domain_context_mapping(d, pdev->bus, pdev->devfn);
1608 if ( ats_device(0, pdev->bus, pdev->devfn) )
1609 enable_ats_device(0, pdev->bus, pdev->devfn);
1613 spin_unlock(&pcidevs_lock);
1616 void clear_fault_bits(struct iommu *iommu)
1618 u64 val;
1620 val = dmar_readq(
1621 iommu->reg,
1622 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1623 dmar_writeq(
1624 iommu->reg,
1625 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1626 val);
1627 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1630 static int init_vtd_hw(void)
1632 struct acpi_drhd_unit *drhd;
1633 struct iommu *iommu;
1634 struct iommu_flush *flush = NULL;
1635 int vector;
1636 int ret;
1638 for_each_drhd_unit ( drhd )
1640 iommu = drhd->iommu;
1641 ret = iommu_set_root_entry(iommu);
1642 if ( ret )
1644 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1645 return -EIO;
1648 if ( iommu->vector < 0 )
1650 vector = iommu_set_interrupt(iommu);
1651 if ( vector < 0 )
1653 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1654 return vector;
1656 iommu->vector = vector;
1658 dma_msi_data_init(iommu, iommu->vector);
1659 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1660 clear_fault_bits(iommu);
1661 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1663 /* initialize flush functions */
1664 flush = iommu_get_flush(iommu);
1665 flush->context = flush_context_reg;
1666 flush->iotlb = flush_iotlb_reg;
1669 if ( iommu_qinval )
1671 for_each_drhd_unit ( drhd )
1673 iommu = drhd->iommu;
1674 if ( enable_qinval(iommu) != 0 )
1676 dprintk(XENLOG_INFO VTDPREFIX,
1677 "Failed to enable Queued Invalidation!\n");
1678 break;
1683 if ( iommu_intremap )
1685 for_each_drhd_unit ( drhd )
1687 iommu = drhd->iommu;
1688 if ( enable_intremap(iommu) != 0 )
1690 dprintk(XENLOG_INFO VTDPREFIX,
1691 "Failed to enable Interrupt Remapping!\n");
1692 break;
1697 return 0;
1700 static void setup_dom0_rmrr(struct domain *d)
1702 struct acpi_rmrr_unit *rmrr;
1703 u16 bdf;
1704 int ret, i;
1706 spin_lock(&pcidevs_lock);
1707 for_each_rmrr_device ( rmrr, bdf, i )
1709 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1710 if ( ret )
1711 gdprintk(XENLOG_ERR VTDPREFIX,
1712 "IOMMU: mapping reserved region failed\n");
1714 spin_unlock(&pcidevs_lock);
1717 static void platform_quirks(void)
1719 u32 id;
1721 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1722 id = pci_conf_read32(0, 0, 0, 0);
1723 if ( id == 0x2a408086 )
1725 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1726 rwbf_quirk = 1;
1730 int intel_vtd_setup(void)
1732 struct acpi_drhd_unit *drhd;
1733 struct iommu *iommu;
1735 if ( !iommu_enabled )
1736 return -ENODEV;
1738 platform_quirks();
1740 spin_lock_init(&domid_bitmap_lock);
1741 clflush_size = get_cache_line_size();
1743 /* We enable the following features only if they are supported by all VT-d
1744 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1745 * Interrupt Remapping.
1746 */
1747 for_each_drhd_unit ( drhd )
1749 if ( iommu_alloc(drhd) != 0 )
1750 goto error;
1752 iommu = drhd->iommu;
1754 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1755 iommu_snoop = 0;
1757 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1758 iommu_passthrough = 0;
1760 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1761 iommu_qinval = 0;
1763 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1764 iommu_intremap = 0;
1767 if ( !iommu_qinval && iommu_intremap )
1769 iommu_intremap = 0;
1770 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1771 "since Queued Invalidation isn't supported or enabled.\n");
1774 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1775 P(iommu_snoop, "Snoop Control");
1776 P(iommu_passthrough, "DMA Passthrough");
1777 P(iommu_qinval, "Queued Invalidation");
1778 P(iommu_intremap, "Interrupt Remapping");
1779 #undef P
1781 /* Allocate IO page directory page for the domain. */
1782 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1783 iommu = drhd->iommu;
1785 /* Allocate domain id bitmap, and set bit 0 as reserved */
1786 domid_bitmap_size = cap_ndoms(iommu->cap);
1787 domid_bitmap = xmalloc_array(unsigned long,
1788 BITS_TO_LONGS(domid_bitmap_size));
1789 if ( domid_bitmap == NULL )
1790 goto error;
1791 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1792 set_bit(0, domid_bitmap);
1794 if ( init_vtd_hw() )
1795 goto error;
1797 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1799 return 0;
1801 error:
1802 for_each_drhd_unit ( drhd )
1803 iommu_free(drhd);
1804 iommu_enabled = 0;
1805 iommu_snoop = 0;
1806 iommu_passthrough = 0;
1807 iommu_qinval = 0;
1808 iommu_intremap = 0;
1809 return -ENOMEM;
1812 /*
1813 * If the device isn't owned by dom0, it has already been
1814 * assigned to another domain, or it doesn't exist.
1815 */
1816 int device_assigned(u8 bus, u8 devfn)
1818 struct pci_dev *pdev;
1820 spin_lock(&pcidevs_lock);
1821 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1822 if (!pdev)
1824 spin_unlock(&pcidevs_lock);
1825 return -1;
1828 spin_unlock(&pcidevs_lock);
1829 return 0;
1832 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1834 struct acpi_rmrr_unit *rmrr;
1835 int ret = 0, i;
1836 struct pci_dev *pdev;
1837 u16 bdf;
1839 if ( list_empty(&acpi_drhd_units) )
1840 return -ENODEV;
1842 ASSERT(spin_is_locked(&pcidevs_lock));
1843 pdev = pci_get_pdev(bus, devfn);
1844 if (!pdev)
1845 return -ENODEV;
1847 if (pdev->domain != dom0)
1849 gdprintk(XENLOG_ERR VTDPREFIX,
1850 "IOMMU: assign a assigned device\n");
1851 return -EBUSY;
1854 ret = reassign_device_ownership(dom0, d, bus, devfn);
1855 if ( ret )
1856 goto done;
1858 /* Setup rmrr identity mapping */
1859 for_each_rmrr_device( rmrr, bdf, i )
1861 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1863 /* FIXME: Because USB RMRR conflicts with guest bios region,
1864 * ignore USB RMRR temporarily.
1865 */
1866 if ( is_usb_device(bus, devfn) )
1868 ret = 0;
1869 goto done;
1872 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1873 if ( ret )
1874 gdprintk(XENLOG_ERR VTDPREFIX,
1875 "IOMMU: mapping reserved region failed\n");
1876 goto done;
1880 done:
1881 return ret;
1884 static int intel_iommu_group_id(u8 bus, u8 devfn)
1886 u8 secbus;
1887 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1888 return PCI_BDF2(bus, devfn);
1889 else
1890 return -1;
1893 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1894 void iommu_suspend(void)
1896 struct acpi_drhd_unit *drhd;
1897 struct iommu *iommu;
1898 u32 i;
1900 if ( !iommu_enabled )
1901 return;
1903 iommu_flush_all();
1905 for_each_drhd_unit ( drhd )
1907 iommu = drhd->iommu;
1908 i = iommu->index;
1910 iommu_state[i][DMAR_FECTL_REG] =
1911 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1912 iommu_state[i][DMAR_FEDATA_REG] =
1913 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1914 iommu_state[i][DMAR_FEADDR_REG] =
1915 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1916 iommu_state[i][DMAR_FEUADDR_REG] =
1917 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1919 iommu_disable_translation(iommu);
1921 if ( iommu_intremap )
1922 disable_intremap(iommu);
1924 if ( iommu_qinval )
1925 disable_qinval(iommu);
1929 void iommu_resume(void)
1931 struct acpi_drhd_unit *drhd;
1932 struct iommu *iommu;
1933 struct iommu_flush *flush;
1934 u32 i;
1936 if ( !iommu_enabled )
1937 return;
1939 /* Re-initialize the register-based flush functions.
1940 * In iommu_flush_all(), we invoke iommu_flush_{context,iotlb}_global(),
1941 * but at this point, on hosts that support QI (Queued Invalidation), QI
1942 * hasn't been re-enabled yet, so for now let's use the register-based
1943 * invalidation method before invoking init_vtd_hw().
1944 */
1945 if ( iommu_qinval )
1947 for_each_drhd_unit ( drhd )
1949 iommu = drhd->iommu;
1950 flush = iommu_get_flush(iommu);
1951 flush->context = flush_context_reg;
1952 flush->iotlb = flush_iotlb_reg;
1956 /* Not sure whether the flush operation is required by the IOMMU
1957 * specification. Note that the BIOS also executes during S3 resume and the
1958 * IOMMU may be touched again, so do the flush operation for safety.
1959 */
1960 iommu_flush_all();
1962 if ( init_vtd_hw() != 0 && force_iommu )
1963 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1965 for_each_drhd_unit ( drhd )
1967 iommu = drhd->iommu;
1968 i = iommu->index;
1970 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1971 (u32) iommu_state[i][DMAR_FECTL_REG]);
1972 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1973 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1974 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1975 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1976 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1977 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1979 iommu_enable_translation(iommu);
1983 struct iommu_ops intel_iommu_ops = {
1984 .init = intel_iommu_domain_init,
1985 .add_device = intel_iommu_add_device,
1986 .remove_device = intel_iommu_remove_device,
1987 .assign_device = intel_iommu_assign_device,
1988 .teardown = iommu_domain_teardown,
1989 .map_page = intel_iommu_map_page,
1990 .unmap_page = intel_iommu_unmap_page,
1991 .reassign_device = reassign_device_ownership,
1992 .get_device_group_id = intel_iommu_group_id,
1993 .update_ire_from_apic = io_apic_write_remap_rte,
1994 .update_ire_from_msi = msi_msg_write_remap_rte,
1995 };
1997 /*
1998 * Local variables:
1999 * mode: C
2000 * c-set-style: "BSD"
2001 * c-basic-offset: 4
2002 * tab-width: 4
2003 * indent-tabs-mode: nil
2004 * End:
2005 */