ia64/xen-unstable

view xen/drivers/passthrough/vtd/iommu.c @ 19752:fa51db0871e1

vtd: Fix flush for SRTP and SIRTP set

The SRTP (Set Root Table Pointer) operation must be performed before
enabling or re-enabling DMA remapping. After setting it, software
must globally invalidate the context-cache and then globally
invalidate the IOTLB. This is required to ensure hardware uses only
the remapping structures referenced by the new root-table pointer,
and not stale cached entries. Similarly, the SIRTP (Set Interrupt
Remap Table Pointer) operation must be performed before enabling or
re-enabling interrupt remapping, and after setting it, software must
globally invalidate the interrupt entry cache. This patch adds a
global context-cache and IOTLB flush after setting the root entry,
and globally flushes the interrupt entry cache before enabling
interrupt remapping. It also removes the iommu_flush_all() call in
iommu_resume() because it becomes redundant once init_vtd_hw()
flushes after SRTP.
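
For illustration, the DMA-remapping side of that ordering can be
sketched with the helpers defined in this file (a minimal sketch,
assuming a single IOMMU and omitting error handling; in the patched
code the first two steps run in init_vtd_hw() and translation is
enabled afterwards by its callers). The SIRTP/interrupt-remapping
side follows the same pattern, with a global interrupt entry cache
flush before interrupt remapping is enabled:

    /* 1. Program the root table pointer: issues SRTP, waits for RTPS. */
    iommu_set_root_entry(iommu);

    /* 2. Globally invalidate the context-cache and then the IOTLB, so no
     *    entries cached under the old root pointer are used. */
    iommu_flush_all();

    /* 3. Only then enable (or re-enable) DMA remapping. */
    iommu_enable_translation(iommu);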

Signed-off-by: Weidong Han <weidong.han@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jun 16 11:30:45 2009 +0100 (2009-06-16)
parents 4fb8a6c993e2
children cc07094a02e4
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
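/*
 * Lazily allocate an IOMMU domain id for the domain (0 is reserved to mean
 * "not yet allocated") and write it into the domain-id field in the upper
 * word of the context entry.
 */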
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
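/*
 * Flush the CPU cache lines backing IOMMU data structures. Only needed when
 * at least one IOMMU is not coherent with the CPU caches (ECAP.C clear).
 */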
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * Higher-level table entries always set read/write; the
200 * last-level page table entry controls the actual permissions.
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
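/*
 * Flush the chipset write buffer so posted writes to the remapping
 * structures reach memory. Only required when CAP.RWBF is set, or when the
 * rwbf_quirk workaround is in effect.
 */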
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
234 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
235 return;
237 spin_lock_irqsave(&iommu->register_lock, flag);
238 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
239 dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
241 /* Make sure hardware completes it */
242 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
243 !(val & DMA_GSTS_WBFS), val);
245 spin_unlock_irqrestore(&iommu->register_lock, flag);
246 }
248 /* Return value determines whether we need a write-buffer flush */
249 static int flush_context_reg(
250 void *_iommu,
251 u16 did, u16 source_id, u8 function_mask, u64 type,
252 int flush_non_present_entry)
253 {
254 struct iommu *iommu = (struct iommu *) _iommu;
255 u64 val = 0;
256 unsigned long flag;
258 /*
259 * In the non-present entry flush case: if the hardware does not cache
260 * non-present entries, we do nothing; if it does cache them, we flush
261 * the entries of domain 0 (the domain id under which any non-present
262 * entries are cached).
263 */
264 if ( flush_non_present_entry )
265 {
266 if ( !cap_caching_mode(iommu->cap) )
267 return 1;
268 else
269 did = 0;
270 }
272 /* use register invalidation */
273 switch ( type )
274 {
275 case DMA_CCMD_GLOBAL_INVL:
276 val = DMA_CCMD_GLOBAL_INVL;
277 break;
278 case DMA_CCMD_DOMAIN_INVL:
279 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
280 break;
281 case DMA_CCMD_DEVICE_INVL:
282 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
283 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
284 break;
285 default:
286 BUG();
287 }
288 val |= DMA_CCMD_ICC;
290 spin_lock_irqsave(&iommu->register_lock, flag);
291 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
293 /* Make sure hardware completes it */
294 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
295 !(val & DMA_CCMD_ICC), val);
297 spin_unlock_irqrestore(&iommu->register_lock, flag);
298 /* flush context entry will implicitly flush write buffer */
299 return 0;
300 }
302 static int inline iommu_flush_context_global(
303 struct iommu *iommu, int flush_non_present_entry)
304 {
305 struct iommu_flush *flush = iommu_get_flush(iommu);
306 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
307 flush_non_present_entry);
308 }
310 static int inline iommu_flush_context_domain(
311 struct iommu *iommu, u16 did, int flush_non_present_entry)
312 {
313 struct iommu_flush *flush = iommu_get_flush(iommu);
314 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
315 flush_non_present_entry);
316 }
318 static int inline iommu_flush_context_device(
319 struct iommu *iommu, u16 did, u16 source_id,
320 u8 function_mask, int flush_non_present_entry)
321 {
322 struct iommu_flush *flush = iommu_get_flush(iommu);
323 return flush->context(iommu, did, source_id, function_mask,
324 DMA_CCMD_DEVICE_INVL,
325 flush_non_present_entry);
326 }
328 /* Return value determines whether we need a write-buffer flush */
329 static int flush_iotlb_reg(void *_iommu, u16 did,
330 u64 addr, unsigned int size_order, u64 type,
331 int flush_non_present_entry, int flush_dev_iotlb)
332 {
333 struct iommu *iommu = (struct iommu *) _iommu;
334 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
335 u64 val = 0, val_iva = 0;
336 unsigned long flag;
338 /*
339 * In the non-present entry flush case: if the hardware does not cache
340 * non-present entries, we do nothing; if it does cache them, we flush
341 * the entries of domain 0 (the domain id under which any non-present
342 * entries are cached).
343 */
344 if ( flush_non_present_entry )
345 {
346 if ( !cap_caching_mode(iommu->cap) )
347 return 1;
348 else
349 did = 0;
350 }
352 /* use register invalidation */
353 switch ( type )
354 {
355 case DMA_TLB_GLOBAL_FLUSH:
356 /* global flush doesn't need to set IVA_REG */
357 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
358 break;
359 case DMA_TLB_DSI_FLUSH:
360 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
361 break;
362 case DMA_TLB_PSI_FLUSH:
363 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
364 /* Note: always flush non-leaf currently */
365 val_iva = size_order | addr;
366 break;
367 default:
368 BUG();
369 }
370 /* Note: set drain read/write */
371 if ( cap_read_drain(iommu->cap) )
372 val |= DMA_TLB_READ_DRAIN;
373 if ( cap_write_drain(iommu->cap) )
374 val |= DMA_TLB_WRITE_DRAIN;
376 spin_lock_irqsave(&iommu->register_lock, flag);
377 /* Note: Only uses first TLB reg currently */
378 if ( val_iva )
379 dmar_writeq(iommu->reg, tlb_offset, val_iva);
380 dmar_writeq(iommu->reg, tlb_offset + 8, val);
382 /* Make sure hardware completes it */
383 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
384 !(val & DMA_TLB_IVT), val);
385 spin_unlock_irqrestore(&iommu->register_lock, flag);
387 /* check IOTLB invalidation granularity */
388 if ( DMA_TLB_IAIG(val) == 0 )
389 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
391 /* flush iotlb entry will implicitly flush write buffer */
392 return 0;
393 }
395 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
396 int flush_non_present_entry, int flush_dev_iotlb)
397 {
398 struct iommu_flush *flush = iommu_get_flush(iommu);
399 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
400 flush_non_present_entry, flush_dev_iotlb);
401 }
403 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
404 int flush_non_present_entry, int flush_dev_iotlb)
405 {
406 struct iommu_flush *flush = iommu_get_flush(iommu);
407 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
408 flush_non_present_entry, flush_dev_iotlb);
409 }
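/*
 * Return the smallest order t such that [base, base + size - 1] fits within
 * a single 2^t-aligned block; used as the address-mask order for
 * page-selective IOTLB invalidation.
 */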
411 static int inline get_alignment(u64 base, unsigned int size)
412 {
413 int t = 0;
414 u64 end;
416 end = base + size - 1;
417 while ( base != end )
418 {
419 t++;
420 base >>= 1;
421 end >>= 1;
422 }
423 return t;
424 }
426 static int inline iommu_flush_iotlb_psi(
427 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
428 int flush_non_present_entry, int flush_dev_iotlb)
429 {
430 unsigned int align;
431 struct iommu_flush *flush = iommu_get_flush(iommu);
433 ASSERT(!(addr & (~PAGE_MASK_4K)));
434 ASSERT(pages > 0);
436 /* Fallback to domain selective flush if no PSI support */
437 if ( !cap_pgsel_inv(iommu->cap) )
438 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
440 /*
441 * PSI requires the flush size to be a power of two (2^x pages) and the
442 * base address to be naturally aligned to that size.
443 */
444 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
445 /* Fallback to domain selective flush if size is too big */
446 if ( align > cap_max_amask_val(iommu->cap) )
447 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
449 addr >>= PAGE_SHIFT_4K + align;
450 addr <<= PAGE_SHIFT_4K + align;
452 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
453 flush_non_present_entry, flush_dev_iotlb);
454 }
456 void iommu_flush_all(void)
457 {
458 struct acpi_drhd_unit *drhd;
459 struct iommu *iommu;
460 int flush_dev_iotlb;
462 flush_all_cache();
463 for_each_drhd_unit ( drhd )
464 {
465 iommu = drhd->iommu;
466 iommu_flush_context_global(iommu, 0);
467 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
468 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
469 }
470 }
472 /* clear one page's page table */
473 static void dma_pte_clear_one(struct domain *domain, u64 addr)
474 {
475 struct hvm_iommu *hd = domain_hvm_iommu(domain);
476 struct acpi_drhd_unit *drhd;
477 struct iommu *iommu;
478 struct dma_pte *page = NULL, *pte = NULL;
479 u64 pg_maddr;
480 int flush_dev_iotlb;
482 spin_lock(&hd->mapping_lock);
483 /* get last level pte */
484 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
485 if ( pg_maddr == 0 )
486 {
487 spin_unlock(&hd->mapping_lock);
488 return;
489 }
491 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
492 pte = page + address_level_offset(addr, 1);
494 if ( !dma_pte_present(*pte) )
495 {
496 spin_unlock(&hd->mapping_lock);
497 unmap_vtd_domain_page(page);
498 return;
499 }
501 dma_clear_pte(*pte);
502 spin_unlock(&hd->mapping_lock);
503 iommu_flush_cache_entry(pte);
505 /* No need for pcidevs_lock here, since flushing is done on device assign/deassign. */
506 for_each_drhd_unit ( drhd )
507 {
508 iommu = drhd->iommu;
509 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
510 {
511 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
512 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
513 addr, 1, 0, flush_dev_iotlb) )
514 iommu_flush_write_buffer(iommu);
515 }
516 }
518 unmap_vtd_domain_page(page);
519 }
521 static void iommu_free_pagetable(u64 pt_maddr, int level)
522 {
523 int i;
524 struct dma_pte *pt_vaddr, *pte;
525 int next_level = level - 1;
527 if ( pt_maddr == 0 )
528 return;
530 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
532 for ( i = 0; i < PTE_NUM; i++ )
533 {
534 pte = &pt_vaddr[i];
535 if ( !dma_pte_present(*pte) )
536 continue;
538 if ( next_level >= 1 )
539 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
541 dma_clear_pte(*pte);
542 iommu_flush_cache_entry(pte);
543 }
545 unmap_vtd_domain_page(pt_vaddr);
546 free_pgtable_maddr(pt_maddr);
547 }
549 static int iommu_set_root_entry(struct iommu *iommu)
550 {
551 u32 sts;
552 unsigned long flags;
554 spin_lock(&iommu->lock);
556 if ( iommu->root_maddr == 0 )
557 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
558 if ( iommu->root_maddr == 0 )
559 {
560 spin_unlock(&iommu->lock);
561 return -ENOMEM;
562 }
564 spin_unlock(&iommu->lock);
565 spin_lock_irqsave(&iommu->register_lock, flags);
566 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
568 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
569 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
571 /* Make sure hardware completes it */
572 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
573 (sts & DMA_GSTS_RTPS), sts);
574 spin_unlock_irqrestore(&iommu->register_lock, flags);
576 return 0;
577 }
579 static void iommu_enable_translation(struct iommu *iommu)
580 {
581 u32 sts;
582 unsigned long flags;
584 dprintk(XENLOG_INFO VTDPREFIX,
585 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
586 spin_lock_irqsave(&iommu->register_lock, flags);
587 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
588 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
590 /* Make sure hardware completes it */
591 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
592 (sts & DMA_GSTS_TES), sts);
594 /* Disable PMRs when VT-d engine takes effect per spec definition */
595 disable_pmr(iommu);
596 spin_unlock_irqrestore(&iommu->register_lock, flags);
597 }
599 static void iommu_disable_translation(struct iommu *iommu)
600 {
601 u32 sts;
602 unsigned long flags;
604 spin_lock_irqsave(&iommu->register_lock, flags);
605 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
606 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
608 /* Make sure hardware completes it */
609 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
610 !(sts & DMA_GSTS_TES), sts);
611 spin_unlock_irqrestore(&iommu->register_lock, flags);
612 }
614 static struct iommu *vector_to_iommu[NR_VECTORS];
615 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
616 u8 fault_reason, u16 source_id, u64 addr)
617 {
618 dprintk(XENLOG_WARNING VTDPREFIX,
619 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
620 "iommu->reg = %p\n",
621 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
622 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
623 fault_reason, iommu->reg);
625 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
626 if ( fault_reason < 0x20 )
627 print_vtd_entries(iommu, (source_id >> 8),
628 (source_id & 0xff), (addr >> PAGE_SHIFT));
629 #endif
631 return 0;
632 }
634 static void iommu_fault_status(u32 fault_status)
635 {
636 if ( fault_status & DMA_FSTS_PFO )
637 dprintk(XENLOG_ERR VTDPREFIX,
638 "iommu_fault_status: Fault Overflow\n");
639 if ( fault_status & DMA_FSTS_PPF )
640 dprintk(XENLOG_ERR VTDPREFIX,
641 "iommu_fault_status: Primary Pending Fault\n");
642 if ( fault_status & DMA_FSTS_AFO )
643 dprintk(XENLOG_ERR VTDPREFIX,
644 "iommu_fault_status: Advanced Fault Overflow\n");
645 if ( fault_status & DMA_FSTS_APF )
646 dprintk(XENLOG_ERR VTDPREFIX,
647 "iommu_fault_status: Advanced Pending Fault\n");
648 if ( fault_status & DMA_FSTS_IQE )
649 dprintk(XENLOG_ERR VTDPREFIX,
650 "iommu_fault_status: Invalidation Queue Error\n");
651 if ( fault_status & DMA_FSTS_ICE )
652 dprintk(XENLOG_ERR VTDPREFIX,
653 "iommu_fault_status: Invalidation Completion Error\n");
654 if ( fault_status & DMA_FSTS_ITE )
655 dprintk(XENLOG_ERR VTDPREFIX,
656 "iommu_fault_status: Invalidation Time-out Error\n");
657 }
659 #define PRIMARY_FAULT_REG_LEN (16)
660 static void iommu_page_fault(int vector, void *dev_id,
661 struct cpu_user_regs *regs)
662 {
663 struct iommu *iommu = dev_id;
664 int reg, fault_index;
665 u32 fault_status;
666 unsigned long flags;
668 dprintk(XENLOG_WARNING VTDPREFIX,
669 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
671 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
673 iommu_fault_status(fault_status);
675 /* FIXME: ignore advanced fault log */
676 if ( !(fault_status & DMA_FSTS_PPF) )
677 goto clear_overflow;
679 fault_index = dma_fsts_fault_record_index(fault_status);
680 reg = cap_fault_reg_offset(iommu->cap);
681 while (1)
682 {
683 u8 fault_reason;
684 u16 source_id;
685 u32 data;
686 u64 guest_addr;
687 int type;
689 /* highest 32 bits */
690 spin_lock_irqsave(&iommu->register_lock, flags);
691 data = dmar_readl(iommu->reg, reg +
692 fault_index * PRIMARY_FAULT_REG_LEN + 12);
693 if ( !(data & DMA_FRCD_F) )
694 {
695 spin_unlock_irqrestore(&iommu->register_lock, flags);
696 break;
697 }
699 fault_reason = dma_frcd_fault_reason(data);
700 type = dma_frcd_type(data);
702 data = dmar_readl(iommu->reg, reg +
703 fault_index * PRIMARY_FAULT_REG_LEN + 8);
704 source_id = dma_frcd_source_id(data);
706 guest_addr = dmar_readq(iommu->reg, reg +
707 fault_index * PRIMARY_FAULT_REG_LEN);
708 guest_addr = dma_frcd_page_addr(guest_addr);
709 /* clear the fault */
710 dmar_writel(iommu->reg, reg +
711 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
712 spin_unlock_irqrestore(&iommu->register_lock, flags);
714 iommu_page_fault_do_one(iommu, type, fault_reason,
715 source_id, guest_addr);
717 fault_index++;
718 if ( fault_index > cap_num_fault_regs(iommu->cap) )
719 fault_index = 0;
720 }
721 clear_overflow:
722 /* clear primary fault overflow */
723 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
724 if ( fault_status & DMA_FSTS_PFO )
725 {
726 spin_lock_irqsave(&iommu->register_lock, flags);
727 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
728 spin_unlock_irqrestore(&iommu->register_lock, flags);
729 }
730 }
732 static void dma_msi_unmask(unsigned int vector)
733 {
734 struct iommu *iommu = vector_to_iommu[vector];
735 unsigned long flags;
737 /* unmask it */
738 spin_lock_irqsave(&iommu->register_lock, flags);
739 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
740 spin_unlock_irqrestore(&iommu->register_lock, flags);
741 }
743 static void dma_msi_mask(unsigned int vector)
744 {
745 unsigned long flags;
746 struct iommu *iommu = vector_to_iommu[vector];
748 /* mask it */
749 spin_lock_irqsave(&iommu->register_lock, flags);
750 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
751 spin_unlock_irqrestore(&iommu->register_lock, flags);
752 }
754 static unsigned int dma_msi_startup(unsigned int vector)
755 {
756 dma_msi_unmask(vector);
757 return 0;
758 }
760 static void dma_msi_end(unsigned int vector)
761 {
762 dma_msi_unmask(vector);
763 ack_APIC_irq();
764 }
766 static void dma_msi_data_init(struct iommu *iommu, int vector)
767 {
768 u32 msi_data = 0;
769 unsigned long flags;
771 /* Fixed, edge, assert mode. Follow MSI setting */
772 msi_data |= vector & 0xff;
773 msi_data |= 1 << 14;
775 spin_lock_irqsave(&iommu->register_lock, flags);
776 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
777 spin_unlock_irqrestore(&iommu->register_lock, flags);
778 }
780 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
781 {
782 u64 msi_address;
783 unsigned long flags;
785 /* Physical, dedicated cpu. Follow MSI setting */
786 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
787 msi_address |= MSI_PHYSICAL_MODE << 2;
788 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
789 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
791 spin_lock_irqsave(&iommu->register_lock, flags);
792 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
793 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
794 spin_unlock_irqrestore(&iommu->register_lock, flags);
795 }
797 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
798 {
799 struct iommu *iommu = vector_to_iommu[vector];
800 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
801 }
803 static struct hw_interrupt_type dma_msi_type = {
804 .typename = "DMA_MSI",
805 .startup = dma_msi_startup,
806 .shutdown = dma_msi_mask,
807 .enable = dma_msi_unmask,
808 .disable = dma_msi_mask,
809 .ack = dma_msi_mask,
810 .end = dma_msi_end,
811 .set_affinity = dma_msi_set_affinity,
812 };
814 static int iommu_set_interrupt(struct iommu *iommu)
815 {
816 int vector, ret;
818 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
819 if ( vector <= 0 )
820 {
821 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
822 return -EINVAL;
823 }
825 irq_desc[vector].handler = &dma_msi_type;
826 vector_to_iommu[vector] = iommu;
827 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
828 if ( ret )
829 {
830 irq_desc[vector].handler = &no_irq_type;
831 vector_to_iommu[vector] = NULL;
832 free_irq_vector(vector);
833 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
834 return ret;
835 }
837 /* Make sure that vector is never re-used. */
838 vector_irq[vector] = NEVER_ASSIGN_IRQ;
840 return vector;
841 }
843 static int iommu_alloc(struct acpi_drhd_unit *drhd)
844 {
845 struct iommu *iommu;
846 unsigned long sagaw;
847 int agaw;
849 if ( nr_iommus > MAX_IOMMUS )
850 {
851 gdprintk(XENLOG_ERR VTDPREFIX,
852 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
853 return -ENOMEM;
854 }
856 iommu = xmalloc(struct iommu);
857 if ( iommu == NULL )
858 return -ENOMEM;
859 memset(iommu, 0, sizeof(struct iommu));
861 iommu->vector = -1; /* No vector assigned yet. */
863 iommu->intel = alloc_intel_iommu();
864 if ( iommu->intel == NULL )
865 {
866 xfree(iommu);
867 return -ENOMEM;
868 }
870 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
871 iommu->index = nr_iommus++;
873 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
874 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
876 gdprintk(XENLOG_INFO VTDPREFIX,
877 "drhd->address = %"PRIx64"\n", drhd->address);
878 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
880 /* Calculate number of pagetable levels: between 2 and 4. */
881 sagaw = cap_sagaw(iommu->cap);
882 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
883 if ( test_bit(agaw, &sagaw) )
884 break;
885 if ( agaw < 0 )
886 {
887 gdprintk(XENLOG_ERR VTDPREFIX,
888 "IOMMU: unsupported sagaw %lx\n", sagaw);
889 xfree(iommu);
890 return -ENODEV;
891 }
892 iommu->nr_pt_levels = agaw_to_level(agaw);
894 if ( !ecap_coherent(iommu->ecap) )
895 iommus_incoherent = 1;
897 spin_lock_init(&iommu->lock);
898 spin_lock_init(&iommu->register_lock);
900 drhd->iommu = iommu;
901 return 0;
902 }
904 static void iommu_free(struct acpi_drhd_unit *drhd)
905 {
906 struct iommu *iommu = drhd->iommu;
908 if ( iommu == NULL )
909 return;
911 if ( iommu->root_maddr != 0 )
912 {
913 free_pgtable_maddr(iommu->root_maddr);
914 iommu->root_maddr = 0;
915 }
917 if ( iommu->reg )
918 iounmap(iommu->reg);
920 free_intel_iommu(iommu->intel);
921 release_irq_vector(iommu->vector);
922 xfree(iommu);
924 drhd->iommu = NULL;
925 }
927 #define guestwidth_to_adjustwidth(gaw) ({ \
928 int agaw, r = (gaw - 12) % 9; \
929 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
930 if ( agaw > 64 ) \
931 agaw = 64; \
932 agaw; })
934 static int intel_iommu_domain_init(struct domain *d)
935 {
936 struct hvm_iommu *hd = domain_hvm_iommu(d);
937 struct iommu *iommu = NULL;
938 struct acpi_drhd_unit *drhd;
940 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
941 iommu = drhd->iommu;
943 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
945 if ( d->domain_id == 0 )
946 {
947 /* Set up 1:1 page table for dom0 */
948 iommu_set_dom0_mapping(d);
950 setup_dom0_devices(d);
951 setup_dom0_rmrr(d);
953 iommu_flush_all();
955 for_each_drhd_unit ( drhd )
956 {
957 iommu = drhd->iommu;
958 iommu_enable_translation(iommu);
959 }
960 }
962 return 0;
963 }
965 static int domain_context_mapping_one(
966 struct domain *domain,
967 struct iommu *iommu,
968 u8 bus, u8 devfn)
969 {
970 struct hvm_iommu *hd = domain_hvm_iommu(domain);
971 struct context_entry *context, *context_entries;
972 u64 maddr, pgd_maddr;
973 struct pci_dev *pdev = NULL;
974 int agaw;
976 ASSERT(spin_is_locked(&pcidevs_lock));
977 spin_lock(&iommu->lock);
978 maddr = bus_to_context_maddr(iommu, bus);
979 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
980 context = &context_entries[devfn];
982 if ( context_present(*context) )
983 {
984 int res = 0;
986 pdev = pci_get_pdev(bus, devfn);
987 if (!pdev)
988 res = -ENODEV;
989 else if (pdev->domain != domain)
990 res = -EINVAL;
991 unmap_vtd_domain_page(context_entries);
992 spin_unlock(&iommu->lock);
993 return res;
994 }
996 if ( iommu_passthrough && (domain->domain_id == 0) )
997 {
998 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
999 agaw = level_to_agaw(iommu->nr_pt_levels);
1001 else
1003 spin_lock(&hd->mapping_lock);
1005 /* Ensure we have pagetables allocated down to leaf PTE. */
1006 if ( hd->pgd_maddr == 0 )
1008 addr_to_dma_page_maddr(domain, 0, 1);
1009 if ( hd->pgd_maddr == 0 )
1011 nomem:
1012 spin_unlock(&hd->mapping_lock);
1013 spin_unlock(&iommu->lock);
1014 unmap_vtd_domain_page(context_entries);
1015 return -ENOMEM;
1019 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1020 pgd_maddr = hd->pgd_maddr;
1021 for ( agaw = level_to_agaw(4);
1022 agaw != level_to_agaw(iommu->nr_pt_levels);
1023 agaw-- )
1025 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1026 pgd_maddr = dma_pte_addr(*p);
1027 unmap_vtd_domain_page(p);
1028 if ( pgd_maddr == 0 )
1029 goto nomem;
1032 context_set_address_root(*context, pgd_maddr);
1033 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1034 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1035 else
1036 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1038 spin_unlock(&hd->mapping_lock);
1041 /*
1042 * domain_id 0 is not valid on Intel's IOMMU, force domain_id to
1043 * be 1 based as required by intel's iommu hw.
1044 */
1045 context_set_domain_id(context, domain);
1046 context_set_address_width(*context, agaw);
1047 context_set_fault_enable(*context);
1048 context_set_present(*context);
1049 iommu_flush_cache_entry(context);
1050 spin_unlock(&iommu->lock);
1052 /* Context entry was previously non-present (with domid 0). */
1053 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1054 DMA_CCMD_MASK_NOBIT, 1) )
1055 iommu_flush_write_buffer(iommu);
1056 else
1058 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1059 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1062 set_bit(iommu->index, &hd->iommu_bitmap);
1064 unmap_vtd_domain_page(context_entries);
1066 return 0;
1069 #define PCI_BASE_CLASS_BRIDGE 0x06
1070 #define PCI_CLASS_BRIDGE_PCI 0x0604
1072 enum {
1073 DEV_TYPE_PCIe_ENDPOINT,
1074 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1075 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1076 DEV_TYPE_PCI,
1077 };
1079 int pdev_type(u8 bus, u8 devfn)
1081 u16 class_device;
1082 u16 status, creg;
1083 int pos;
1084 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1086 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1087 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1089 pos = pci_find_next_cap(bus, devfn,
1090 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1091 if ( !pos )
1092 return DEV_TYPE_PCI_BRIDGE;
1093 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1094 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1095 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1098 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1099 if ( !(status & PCI_STATUS_CAP_LIST) )
1100 return DEV_TYPE_PCI;
1102 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1103 return DEV_TYPE_PCIe_ENDPOINT;
1105 return DEV_TYPE_PCI;
1108 #define MAX_BUSES 256
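/*
 * For each bus number behind a PCIe-to-PCI/PCI-X bridge, record the bus/devfn
 * of that bridge, so the requester id seen by the IOMMU for legacy PCI
 * devices can be traced back up to the bridge.
 */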
1109 static DEFINE_SPINLOCK(bus2bridge_lock);
1110 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1112 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1114 int cnt = 0;
1115 *secbus = *bus;
1117 ASSERT(spin_is_locked(&bus2bridge_lock));
1118 if ( !bus2bridge[*bus].map )
1119 return 0;
1121 while ( bus2bridge[*bus].map )
1123 *secbus = *bus;
1124 *devfn = bus2bridge[*bus].devfn;
1125 *bus = bus2bridge[*bus].bus;
1126 if ( cnt++ >= MAX_BUSES )
1127 return 0;
1130 return 1;
1133 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1135 int ret = 0;
1137 if ( *bus == 0 )
1138 /* assume integrated PCI devices in RC have valid requester-id */
1139 return 1;
1141 spin_lock(&bus2bridge_lock);
1142 ret = _find_pcie_endpoint(bus, devfn, secbus);
1143 spin_unlock(&bus2bridge_lock);
1145 return ret;
1148 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1150 struct acpi_drhd_unit *drhd;
1151 int ret = 0;
1152 u16 sec_bus, sub_bus;
1153 u32 type;
1154 u8 secbus, secdevfn;
1155 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1157 if ( pdev == NULL )
1159 /* We can reach here via setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1160 * -> domain_context_mapping().
1161 * If a user enables VT-d but disables USB (which usually needs an
1162 * RMRR) in the BIOS, we can't discover the BDF of the USB controller
1163 * in setup_dom0_devices(), but the ACPI RMRR structures may still
1164 * contain that BDF, so pci_get_pdev() returns NULL here.
1165 */
1166 gdprintk(XENLOG_WARNING VTDPREFIX,
1167 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1168 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1169 return 0;
1172 drhd = acpi_find_matched_drhd_unit(pdev);
1173 if ( !drhd )
1174 return -ENODEV;
1176 ASSERT(spin_is_locked(&pcidevs_lock));
1178 type = pdev_type(bus, devfn);
1179 switch ( type )
1181 case DEV_TYPE_PCIe_BRIDGE:
1182 break;
1184 case DEV_TYPE_PCI_BRIDGE:
1185 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1186 PCI_SECONDARY_BUS);
1187 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1188 PCI_SUBORDINATE_BUS);
1190 spin_lock(&bus2bridge_lock);
1191 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1193 bus2bridge[sec_bus].map = 1;
1194 bus2bridge[sec_bus].bus = bus;
1195 bus2bridge[sec_bus].devfn = devfn;
1197 spin_unlock(&bus2bridge_lock);
1198 break;
1200 case DEV_TYPE_PCIe_ENDPOINT:
1201 gdprintk(XENLOG_INFO VTDPREFIX,
1202 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1203 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1204 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1205 break;
1207 case DEV_TYPE_PCI:
1208 gdprintk(XENLOG_INFO VTDPREFIX,
1209 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1210 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1212 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1213 if ( ret )
1214 break;
1216 secbus = bus;
1217 secdevfn = devfn;
1218 /* dependent devices mapping */
1219 while ( bus2bridge[bus].map )
1221 secbus = bus;
1222 secdevfn = devfn;
1223 devfn = bus2bridge[bus].devfn;
1224 bus = bus2bridge[bus].bus;
1225 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1226 if ( ret )
1227 return ret;
1230 if ( (secbus != bus) && (secdevfn != 0) )
1231 /*
1232 * The source-id for transactions on non-PCIe buses seems
1233 * to originate from devfn=0 on the secondary bus behind
1234 * the bridge. Map that id as well. The id to use in
1235 * these scenarios is not particularly well documented
1236 * anywhere.
1237 */
1238 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1239 break;
1241 default:
1242 gdprintk(XENLOG_ERR VTDPREFIX,
1243 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1244 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1245 ret = -EINVAL;
1246 break;
1249 return ret;
1252 static int domain_context_unmap_one(
1253 struct domain *domain,
1254 struct iommu *iommu,
1255 u8 bus, u8 devfn)
1257 struct context_entry *context, *context_entries;
1258 u64 maddr;
1260 ASSERT(spin_is_locked(&pcidevs_lock));
1261 spin_lock(&iommu->lock);
1263 maddr = bus_to_context_maddr(iommu, bus);
1264 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1265 context = &context_entries[devfn];
1267 if ( !context_present(*context) )
1269 spin_unlock(&iommu->lock);
1270 unmap_vtd_domain_page(context_entries);
1271 return 0;
1274 context_clear_present(*context);
1275 context_clear_entry(*context);
1276 iommu_flush_cache_entry(context);
1278 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1279 (((u16)bus) << 8) | devfn,
1280 DMA_CCMD_MASK_NOBIT, 0) )
1281 iommu_flush_write_buffer(iommu);
1282 else
1284 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1285 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1288 spin_unlock(&iommu->lock);
1289 unmap_vtd_domain_page(context_entries);
1291 return 0;
1294 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1296 struct acpi_drhd_unit *drhd;
1297 int ret = 0;
1298 u32 type;
1299 u8 secbus, secdevfn;
1300 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1302 BUG_ON(!pdev);
1304 drhd = acpi_find_matched_drhd_unit(pdev);
1305 if ( !drhd )
1306 return -ENODEV;
1308 type = pdev_type(bus, devfn);
1309 switch ( type )
1311 case DEV_TYPE_PCIe_BRIDGE:
1312 case DEV_TYPE_PCI_BRIDGE:
1313 break;
1315 case DEV_TYPE_PCIe_ENDPOINT:
1316 gdprintk(XENLOG_INFO VTDPREFIX,
1317 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1318 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1319 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1320 break;
1322 case DEV_TYPE_PCI:
1323 gdprintk(XENLOG_INFO VTDPREFIX,
1324 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1325 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1326 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1327 if ( ret )
1328 break;
1330 secbus = bus;
1331 secdevfn = devfn;
1332 /* dependent devices unmapping */
1333 while ( bus2bridge[bus].map )
1335 secbus = bus;
1336 secdevfn = devfn;
1337 devfn = bus2bridge[bus].devfn;
1338 bus = bus2bridge[bus].bus;
1339 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1340 if ( ret )
1341 return ret;
1344 if ( (secbus != bus) && (secdevfn != 0) )
1345 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1346 break;
1348 default:
1349 gdprintk(XENLOG_ERR VTDPREFIX,
1350 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1351 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1352 ret = -EINVAL;
1353 break;
1356 return ret;
1359 static int reassign_device_ownership(
1360 struct domain *source,
1361 struct domain *target,
1362 u8 bus, u8 devfn)
1364 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1365 struct pci_dev *pdev;
1366 struct acpi_drhd_unit *drhd;
1367 struct iommu *pdev_iommu;
1368 int ret, found = 0;
1370 ASSERT(spin_is_locked(&pcidevs_lock));
1371 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1373 if (!pdev)
1374 return -ENODEV;
1376 drhd = acpi_find_matched_drhd_unit(pdev);
1377 pdev_iommu = drhd->iommu;
1378 domain_context_unmap(source, bus, devfn);
1380 ret = domain_context_mapping(target, bus, devfn);
1381 if ( ret )
1382 return ret;
1384 list_move(&pdev->domain_list, &target->arch.pdev_list);
1385 pdev->domain = target;
1387 for_each_pdev ( source, pdev )
1389 drhd = acpi_find_matched_drhd_unit(pdev);
1390 if ( drhd->iommu == pdev_iommu )
1392 found = 1;
1393 break;
1397 if ( !found )
1398 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1400 return ret;
1403 void iommu_domain_teardown(struct domain *d)
1405 struct hvm_iommu *hd = domain_hvm_iommu(d);
1407 if ( list_empty(&acpi_drhd_units) )
1408 return;
1410 spin_lock(&hd->mapping_lock);
1411 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1412 hd->pgd_maddr = 0;
1413 spin_unlock(&hd->mapping_lock);
1415 iommu_domid_release(d);
1418 int intel_iommu_map_page(
1419 struct domain *d, unsigned long gfn, unsigned long mfn)
1421 struct hvm_iommu *hd = domain_hvm_iommu(d);
1422 struct acpi_drhd_unit *drhd;
1423 struct iommu *iommu;
1424 struct dma_pte *page = NULL, *pte = NULL;
1425 u64 pg_maddr;
1426 int pte_present;
1427 int flush_dev_iotlb;
1429 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1430 iommu = drhd->iommu;
1432 /* do nothing if dom0 and iommu supports pass thru */
1433 if ( iommu_passthrough && (d->domain_id == 0) )
1434 return 0;
1436 spin_lock(&hd->mapping_lock);
1438 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1439 if ( pg_maddr == 0 )
1441 spin_unlock(&hd->mapping_lock);
1442 return -ENOMEM;
1444 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1445 pte = page + (gfn & LEVEL_MASK);
1446 pte_present = dma_pte_present(*pte);
1447 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1448 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1450 /* Set the SNP on leaf page table if Snoop Control available */
1451 if ( iommu_snoop )
1452 dma_set_pte_snp(*pte);
1454 iommu_flush_cache_entry(pte);
1455 spin_unlock(&hd->mapping_lock);
1456 unmap_vtd_domain_page(page);
1458 /*
1459 * No need for pcidevs_lock here, because we flush
1460 * when assigning/deassigning a device.
1461 */
1462 for_each_drhd_unit ( drhd )
1464 iommu = drhd->iommu;
1466 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1467 continue;
1469 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1470 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1471 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1472 !pte_present, flush_dev_iotlb) )
1473 iommu_flush_write_buffer(iommu);
1476 return 0;
1479 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1481 struct acpi_drhd_unit *drhd;
1482 struct iommu *iommu;
1484 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1485 iommu = drhd->iommu;
1487 /* do nothing if dom0 and iommu supports pass thru */
1488 if ( iommu_passthrough && (d->domain_id == 0) )
1489 return 0;
1491 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1493 return 0;
1496 static int iommu_prepare_rmrr_dev(struct domain *d,
1497 struct acpi_rmrr_unit *rmrr,
1498 u8 bus, u8 devfn)
1500 int ret = 0;
1501 u64 base, end;
1502 unsigned long base_pfn, end_pfn;
1504 ASSERT(spin_is_locked(&pcidevs_lock));
1505 ASSERT(rmrr->base_address < rmrr->end_address);
1507 base = rmrr->base_address & PAGE_MASK_4K;
1508 base_pfn = base >> PAGE_SHIFT_4K;
1509 end = PAGE_ALIGN_4K(rmrr->end_address);
1510 end_pfn = end >> PAGE_SHIFT_4K;
1512 while ( base_pfn < end_pfn )
1514 intel_iommu_map_page(d, base_pfn, base_pfn);
1515 base_pfn++;
1518 ret = domain_context_mapping(d, bus, devfn);
1520 return ret;
1523 static int intel_iommu_add_device(struct pci_dev *pdev)
1525 struct acpi_rmrr_unit *rmrr;
1526 u16 bdf;
1527 int ret, i;
1529 ASSERT(spin_is_locked(&pcidevs_lock));
1531 if ( !pdev->domain )
1532 return -EINVAL;
1534 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1535 if ( ret )
1537 gdprintk(XENLOG_ERR VTDPREFIX,
1538 "intel_iommu_add_device: context mapping failed\n");
1539 return ret;
1542 for_each_rmrr_device ( rmrr, bdf, i )
1544 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1546 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1547 pdev->bus, pdev->devfn);
1548 if ( ret )
1549 gdprintk(XENLOG_ERR VTDPREFIX,
1550 "intel_iommu_add_device: RMRR mapping failed\n");
1551 break;
1555 return ret;
1558 static int intel_iommu_remove_device(struct pci_dev *pdev)
1560 struct acpi_rmrr_unit *rmrr;
1561 u16 bdf;
1562 int i;
1564 if ( !pdev->domain )
1565 return -EINVAL;
1567 /* If the device belongs to dom0, and it has RMRR, don't remove it
1568 * from dom0, because BIOS may use RMRR at booting time.
1569 */
1570 if ( pdev->domain->domain_id == 0 )
1572 for_each_rmrr_device ( rmrr, bdf, i )
1574 if ( PCI_BUS(bdf) == pdev->bus &&
1575 PCI_DEVFN2(bdf) == pdev->devfn )
1576 return 0;
1580 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1583 static void setup_dom0_devices(struct domain *d)
1585 struct hvm_iommu *hd;
1586 struct pci_dev *pdev;
1587 int bus, dev, func;
1588 u32 l;
1590 hd = domain_hvm_iommu(d);
1592 spin_lock(&pcidevs_lock);
1593 for ( bus = 0; bus < 256; bus++ )
1595 for ( dev = 0; dev < 32; dev++ )
1597 for ( func = 0; func < 8; func++ )
1599 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1600 /* some broken boards return 0 or ~0 if a slot is empty: */
1601 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1602 (l == 0x0000ffff) || (l == 0xffff0000) )
1603 continue;
1605 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1606 pdev->domain = d;
1607 list_add(&pdev->domain_list, &d->arch.pdev_list);
1608 domain_context_mapping(d, pdev->bus, pdev->devfn);
1609 if ( ats_device(0, pdev->bus, pdev->devfn) )
1610 enable_ats_device(0, pdev->bus, pdev->devfn);
1614 spin_unlock(&pcidevs_lock);
1617 void clear_fault_bits(struct iommu *iommu)
1619 u64 val;
1621 val = dmar_readq(
1622 iommu->reg,
1623 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1624 dmar_writeq(
1625 iommu->reg,
1626 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1627 val);
1628 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1631 static int init_vtd_hw(void)
1633 struct acpi_drhd_unit *drhd;
1634 struct iommu *iommu;
1635 struct iommu_flush *flush = NULL;
1636 int vector;
1637 int ret;
1639 for_each_drhd_unit ( drhd )
1641 iommu = drhd->iommu;
1642 if ( iommu->vector < 0 )
1644 vector = iommu_set_interrupt(iommu);
1645 if ( vector < 0 )
1647 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1648 return vector;
1650 iommu->vector = vector;
1652 dma_msi_data_init(iommu, iommu->vector);
1653 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1654 clear_fault_bits(iommu);
1655 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1657 /* initialize flush functions */
1658 flush = iommu_get_flush(iommu);
1659 flush->context = flush_context_reg;
1660 flush->iotlb = flush_iotlb_reg;
1663 if ( iommu_qinval )
1665 for_each_drhd_unit ( drhd )
1667 iommu = drhd->iommu;
1668 if ( enable_qinval(iommu) != 0 )
1670 dprintk(XENLOG_INFO VTDPREFIX,
1671 "Failed to enable Queued Invalidation!\n");
1672 break;
1677 if ( iommu_intremap )
1679 for_each_drhd_unit ( drhd )
1681 iommu = drhd->iommu;
1682 if ( enable_intremap(iommu) != 0 )
1684 dprintk(XENLOG_INFO VTDPREFIX,
1685 "Failed to enable Interrupt Remapping!\n");
1686 break;
1691 for_each_drhd_unit ( drhd )
1693 iommu = drhd->iommu;
1694 ret = iommu_set_root_entry(iommu);
1695 if ( ret )
1697 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1698 return -EIO;
1702 /*
1703 * After setting the root entry, we must globally invalidate the
1704 * context-cache and then globally invalidate the IOTLB.
1705 */
1706 iommu_flush_all();
1708 return 0;
1711 static void setup_dom0_rmrr(struct domain *d)
1713 struct acpi_rmrr_unit *rmrr;
1714 u16 bdf;
1715 int ret, i;
1717 spin_lock(&pcidevs_lock);
1718 for_each_rmrr_device ( rmrr, bdf, i )
1720 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1721 if ( ret )
1722 gdprintk(XENLOG_ERR VTDPREFIX,
1723 "IOMMU: mapping reserved region failed\n");
1725 spin_unlock(&pcidevs_lock);
1728 static void platform_quirks(void)
1730 u32 id;
1732 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1733 id = pci_conf_read32(0, 0, 0, 0);
1734 if ( id == 0x2a408086 )
1736 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1737 rwbf_quirk = 1;
1741 int intel_vtd_setup(void)
1743 struct acpi_drhd_unit *drhd;
1744 struct iommu *iommu;
1746 if ( !iommu_enabled )
1747 return -ENODEV;
1749 platform_quirks();
1751 spin_lock_init(&domid_bitmap_lock);
1752 clflush_size = get_cache_line_size();
1754 /* We enable the following features only if they are supported by all VT-d
1755 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1756 * Interrupt Remapping.
1757 */
1758 for_each_drhd_unit ( drhd )
1760 if ( iommu_alloc(drhd) != 0 )
1761 goto error;
1763 iommu = drhd->iommu;
1765 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1766 iommu_snoop = 0;
1768 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1769 iommu_passthrough = 0;
1771 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1772 iommu_qinval = 0;
1774 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1775 iommu_intremap = 0;
1778 if ( !iommu_qinval && iommu_intremap )
1780 iommu_intremap = 0;
1781 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1782 "since Queued Invalidation isn't supported or enabled.\n");
1785 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1786 P(iommu_snoop, "Snoop Control");
1787 P(iommu_passthrough, "DMA Passthrough");
1788 P(iommu_qinval, "Queued Invalidation");
1789 P(iommu_intremap, "Interrupt Remapping");
1790 #undef P
1792 /* Allocate IO page directory page for the domain. */
1793 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1794 iommu = drhd->iommu;
1796 /* Allocate domain id bitmap, and set bit 0 as reserved */
1797 domid_bitmap_size = cap_ndoms(iommu->cap);
1798 domid_bitmap = xmalloc_array(unsigned long,
1799 BITS_TO_LONGS(domid_bitmap_size));
1800 if ( domid_bitmap == NULL )
1801 goto error;
1802 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1803 set_bit(0, domid_bitmap);
1805 if ( init_vtd_hw() )
1806 goto error;
1808 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1810 return 0;
1812 error:
1813 for_each_drhd_unit ( drhd )
1814 iommu_free(drhd);
1815 iommu_enabled = 0;
1816 iommu_snoop = 0;
1817 iommu_passthrough = 0;
1818 iommu_qinval = 0;
1819 iommu_intremap = 0;
1820 return -ENOMEM;
1823 /*
1824 * If the device isn't owned by dom0, it has already been
1825 * assigned to another domain, or it does not exist.
1826 */
1827 int device_assigned(u8 bus, u8 devfn)
1829 struct pci_dev *pdev;
1831 spin_lock(&pcidevs_lock);
1832 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1833 if (!pdev)
1835 spin_unlock(&pcidevs_lock);
1836 return -1;
1839 spin_unlock(&pcidevs_lock);
1840 return 0;
1843 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1845 struct acpi_rmrr_unit *rmrr;
1846 int ret = 0, i;
1847 struct pci_dev *pdev;
1848 u16 bdf;
1850 if ( list_empty(&acpi_drhd_units) )
1851 return -ENODEV;
1853 ASSERT(spin_is_locked(&pcidevs_lock));
1854 pdev = pci_get_pdev(bus, devfn);
1855 if (!pdev)
1856 return -ENODEV;
1858 if (pdev->domain != dom0)
1860 gdprintk(XENLOG_ERR VTDPREFIX,
1861 "IOMMU: assign a assigned device\n");
1862 return -EBUSY;
1865 ret = reassign_device_ownership(dom0, d, bus, devfn);
1866 if ( ret )
1867 goto done;
1869 /* Setup rmrr identity mapping */
1870 for_each_rmrr_device( rmrr, bdf, i )
1872 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1874 /* FIXME: Because USB RMRR conflicts with guest bios region,
1875 * ignore USB RMRR temporarily.
1876 */
1877 if ( is_usb_device(bus, devfn) )
1879 ret = 0;
1880 goto done;
1883 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1884 if ( ret )
1885 gdprintk(XENLOG_ERR VTDPREFIX,
1886 "IOMMU: mapping reserved region failed\n");
1887 goto done;
1891 done:
1892 return ret;
1895 static int intel_iommu_group_id(u8 bus, u8 devfn)
1897 u8 secbus;
1898 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1899 return PCI_BDF2(bus, devfn);
1900 else
1901 return -1;
1904 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1905 void iommu_suspend(void)
1907 struct acpi_drhd_unit *drhd;
1908 struct iommu *iommu;
1909 u32 i;
1911 if ( !iommu_enabled )
1912 return;
1914 iommu_flush_all();
1916 for_each_drhd_unit ( drhd )
1918 iommu = drhd->iommu;
1919 i = iommu->index;
1921 iommu_state[i][DMAR_FECTL_REG] =
1922 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1923 iommu_state[i][DMAR_FEDATA_REG] =
1924 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1925 iommu_state[i][DMAR_FEADDR_REG] =
1926 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1927 iommu_state[i][DMAR_FEUADDR_REG] =
1928 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1930 iommu_disable_translation(iommu);
1932 if ( iommu_intremap )
1933 disable_intremap(iommu);
1935 if ( iommu_qinval )
1936 disable_qinval(iommu);
1940 void iommu_resume(void)
1942 struct acpi_drhd_unit *drhd;
1943 struct iommu *iommu;
1944 u32 i;
1946 if ( !iommu_enabled )
1947 return;
1949 if ( init_vtd_hw() != 0 && force_iommu )
1950 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1952 for_each_drhd_unit ( drhd )
1954 iommu = drhd->iommu;
1955 i = iommu->index;
1957 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1958 (u32) iommu_state[i][DMAR_FECTL_REG]);
1959 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1960 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1961 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1962 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1963 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1964 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1966 iommu_enable_translation(iommu);
1970 struct iommu_ops intel_iommu_ops = {
1971 .init = intel_iommu_domain_init,
1972 .add_device = intel_iommu_add_device,
1973 .remove_device = intel_iommu_remove_device,
1974 .assign_device = intel_iommu_assign_device,
1975 .teardown = iommu_domain_teardown,
1976 .map_page = intel_iommu_map_page,
1977 .unmap_page = intel_iommu_unmap_page,
1978 .reassign_device = reassign_device_ownership,
1979 .get_device_group_id = intel_iommu_group_id,
1980 .update_ire_from_apic = io_apic_write_remap_rte,
1981 .update_ire_from_msi = msi_msg_write_remap_rte,
1982 };
1984 /*
1985 * Local variables:
1986 * mode: C
1987 * c-set-style: "BSD"
1988 * c-basic-offset: 4
1989 * tab-width: 4
1990 * indent-tabs-mode: nil
1991 * End:
1992 */