ia64/xen-unstable

xen/drivers/passthrough/vtd/iommu.c @ 19800:78962f85c562

IOMMU: Add two generic functions to vendor neutral interface

Add two generic functions to the vendor-neutral iommu interface. The
reason is that, as of changeset 19732, a single global flag
"iommu_enabled" controls iommu enablement for both VT-d and AMD
systems, so we need separate code paths for VT-d and AMD iommu
systems once this flag is turned on. Also, the early check of
"iommu_enabled" in iommu_setup() is removed to prevent iommu
functionality from being disabled on AMD systems.

Signed-off-by: Wei Wang <wei.wang2@amd.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 19 08:41:50 2009 +0100 (2009-06-19)
parents cc07094a02e4
children 2f1fa2215e60
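
The description above amounts to: with a single global "iommu_enabled"
flag, vendor-neutral callers can no longer assume VT-d and must pick the
right backend at run time. A minimal sketch of that idea, assuming an AMD
ops table named amd_iommu_ops (only intel_iommu_ops appears in this file,
near the bottom); the two generic functions the changeset actually adds
are not visible in this view:

    /* Illustrative only -- not the changeset's code. */
    static const struct iommu_ops *vendor_iommu_ops(void)
    {
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
            return &intel_iommu_ops;
        return &amd_iommu_ops;      /* assumed name */
    }
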
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
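/*
 * context_set_domain_id() above allocates a hardware domain id from
 * domid_bitmap the first time a domain is attached to any IOMMU and
 * stores it in bits DID_HIGH_OFFSET..DID_HIGH_OFFSET+DID_FIELD_WIDTH-1
 * of context->hi (bits 8..23 of the upper 64-bit word, where the VT-d
 * context entry keeps its DID field).  For example, a domain that is
 * handed iommu_domid 5 ends up with context->hi >> 8 == 5.  Bit 0 of
 * the bitmap is reserved in intel_vtd_setup(), so an iommu_domid of 0
 * always means "no id assigned yet".
 */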
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
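/*
 * The cache-flush helpers above matter on IOMMUs whose page-walk is not
 * coherent with the CPU caches (ecap coherency bit clear, see
 * iommu_alloc() below, which sets iommus_incoherent): such hardware
 * reads root/context/page-table entries straight from memory, so every
 * entry update must be pushed out with cacheline_flush() before the
 * IOMMU can be expected to see it.  iommu_flush_cache_entry() covers an
 * 8-byte span; since cacheline_flush() works on whole cache lines, that
 * is in practice also enough for the 16-byte root and context entries
 * it is called on.  iommu_flush_cache_page() covers whole 4K tables,
 * e.g. a freshly allocated page-table page.
 */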
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * higher-level tables always set r/w; the last-level
200 * page table controls read/write permission
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
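/*
 * Worked example for addr_to_dma_page_maddr(), assuming the usual
 * DEFAULT_DOMAIN_ADDRESS_WIDTH of 48 bits (4 levels, 9 index bits per
 * level on top of the 4K page offset):
 *
 *     addr[47:39] -> level-4 index  (table at hd->pgd_maddr)
 *     addr[38:30] -> level-3 index
 *     addr[29:21] -> level-2 index
 *     addr[20:12] -> level-1 index  (leaf)
 *
 * The walk above stops at level 2 and returns the machine address of
 * the leaf table stored in that level-2 entry; callers such as
 * intel_iommu_map_page() then index the final 512-entry table
 * themselves with the low 9 bits of the gfn.
 */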
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flags;
234 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
235 return;
237 spin_lock_irqsave(&iommu->register_lock, flags);
238 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
239 dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
241 /* Make sure hardware completes it */
242 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
243 !(val & DMA_GSTS_WBFS), val);
245 spin_unlock_irqrestore(&iommu->register_lock, flags);
246 }
248 /* return value determines whether we need a write buffer flush */
249 static int flush_context_reg(
250 void *_iommu,
251 u16 did, u16 source_id, u8 function_mask, u64 type,
252 int flush_non_present_entry)
253 {
254 struct iommu *iommu = (struct iommu *) _iommu;
255 u64 val = 0;
256 unsigned long flags;
258 /*
259 * In the non-present entry flush case: if the hardware doesn't cache
260 * non-present entries we do nothing; if it does cache them, we flush
261 * the entries of domain 0 (the domain id under which any non-present
262 * entries are cached)
263 */
264 if ( flush_non_present_entry )
265 {
266 if ( !cap_caching_mode(iommu->cap) )
267 return 1;
268 else
269 did = 0;
270 }
272 /* use register invalidation */
273 switch ( type )
274 {
275 case DMA_CCMD_GLOBAL_INVL:
276 val = DMA_CCMD_GLOBAL_INVL;
277 break;
278 case DMA_CCMD_DOMAIN_INVL:
279 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
280 break;
281 case DMA_CCMD_DEVICE_INVL:
282 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
283 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
284 break;
285 default:
286 BUG();
287 }
288 val |= DMA_CCMD_ICC;
290 spin_lock_irqsave(&iommu->register_lock, flags);
291 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
293 /* Make sure hardware completes it */
294 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
295 !(val & DMA_CCMD_ICC), val);
297 spin_unlock_irqrestore(&iommu->register_lock, flags);
298 /* flush context entry will implicitly flush write buffer */
299 return 0;
300 }
302 static int inline iommu_flush_context_global(
303 struct iommu *iommu, int flush_non_present_entry)
304 {
305 struct iommu_flush *flush = iommu_get_flush(iommu);
306 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
307 flush_non_present_entry);
308 }
310 static int inline iommu_flush_context_domain(
311 struct iommu *iommu, u16 did, int flush_non_present_entry)
312 {
313 struct iommu_flush *flush = iommu_get_flush(iommu);
314 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
315 flush_non_present_entry);
316 }
318 static int inline iommu_flush_context_device(
319 struct iommu *iommu, u16 did, u16 source_id,
320 u8 function_mask, int flush_non_present_entry)
321 {
322 struct iommu_flush *flush = iommu_get_flush(iommu);
323 return flush->context(iommu, did, source_id, function_mask,
324 DMA_CCMD_DEVICE_INVL,
325 flush_non_present_entry);
326 }
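/*
 * Convention used by flush_context_reg() above and flush_iotlb_reg()
 * below: a return value of 1 means the invalidation was skipped (a
 * "non-present entry" flush on hardware without caching mode), and the
 * caller should fall back to iommu_flush_write_buffer(); a return value
 * of 0 means a real invalidation completed, which already implies a
 * write-buffer flush.  domain_context_mapping_one() and
 * dma_pte_clear_one() below rely on exactly this pattern.
 */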
328 /* return value determines whether we need a write buffer flush */
329 static int flush_iotlb_reg(void *_iommu, u16 did,
330 u64 addr, unsigned int size_order, u64 type,
331 int flush_non_present_entry, int flush_dev_iotlb)
332 {
333 struct iommu *iommu = (struct iommu *) _iommu;
334 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
335 u64 val = 0, val_iva = 0;
336 unsigned long flags;
338 /*
339 * In the non-present entry flush case: if the hardware doesn't cache
340 * non-present entries we do nothing; if it does cache them, we flush
341 * the entries of domain 0 (the domain id under which any non-present
342 * entries are cached)
343 */
344 if ( flush_non_present_entry )
345 {
346 if ( !cap_caching_mode(iommu->cap) )
347 return 1;
348 else
349 did = 0;
350 }
352 /* use register invalidation */
353 switch ( type )
354 {
355 case DMA_TLB_GLOBAL_FLUSH:
356 /* global flush doesn't need set IVA_REG */
357 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
358 break;
359 case DMA_TLB_DSI_FLUSH:
360 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
361 break;
362 case DMA_TLB_PSI_FLUSH:
363 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
364 /* Note: always flush non-leaf currently */
365 val_iva = size_order | addr;
366 break;
367 default:
368 BUG();
369 }
370 /* Note: set drain read/write */
371 if ( cap_read_drain(iommu->cap) )
372 val |= DMA_TLB_READ_DRAIN;
373 if ( cap_write_drain(iommu->cap) )
374 val |= DMA_TLB_WRITE_DRAIN;
376 spin_lock_irqsave(&iommu->register_lock, flags);
377 /* Note: Only uses first TLB reg currently */
378 if ( val_iva )
379 dmar_writeq(iommu->reg, tlb_offset, val_iva);
380 dmar_writeq(iommu->reg, tlb_offset + 8, val);
382 /* Make sure hardware completes it */
383 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
384 !(val & DMA_TLB_IVT), val);
385 spin_unlock_irqrestore(&iommu->register_lock, flags);
387 /* check IOTLB invalidation granularity */
388 if ( DMA_TLB_IAIG(val) == 0 )
389 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
391 /* flush iotlb entry will implicitly flush write buffer */
392 return 0;
393 }
395 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
396 int flush_non_present_entry, int flush_dev_iotlb)
397 {
398 struct iommu_flush *flush = iommu_get_flush(iommu);
399 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
400 flush_non_present_entry, flush_dev_iotlb);
401 }
403 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
404 int flush_non_present_entry, int flush_dev_iotlb)
405 {
406 struct iommu_flush *flush = iommu_get_flush(iommu);
407 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
408 flush_non_present_entry, flush_dev_iotlb);
409 }
411 static int inline get_alignment(u64 base, unsigned int size)
412 {
413 int t = 0;
414 u64 end;
416 end = base + size - 1;
417 while ( base != end )
418 {
419 t++;
420 base >>= 1;
421 end >>= 1;
422 }
423 return t;
424 }
426 static int inline iommu_flush_iotlb_psi(
427 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
428 int flush_non_present_entry, int flush_dev_iotlb)
429 {
430 unsigned int align;
431 struct iommu_flush *flush = iommu_get_flush(iommu);
433 ASSERT(!(addr & (~PAGE_MASK_4K)));
434 ASSERT(pages > 0);
436 /* Fallback to domain selective flush if no PSI support */
437 if ( !cap_pgsel_inv(iommu->cap) )
438 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
440 /*
441 * PSI requires page size is 2 ^ x, and the base address is naturally
442 * aligned to the size
443 */
444 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
445 /* Fallback to domain selective flush if size is too big */
446 if ( align > cap_max_amask_val(iommu->cap) )
447 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
449 addr >>= PAGE_SHIFT_4K + align;
450 addr <<= PAGE_SHIFT_4K + align;
452 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
453 flush_non_present_entry, flush_dev_iotlb);
454 }
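/*
 * get_alignment() returns the order of the smallest naturally aligned
 * power-of-two block of pages containing both the first and the last
 * page of the request, which is what the PSI address-mask field
 * expects.  For example, base pfn 0x100 with 4 pages gives order 2, and
 * base pfn 0x101 with 2 pages also gives order 2, because no aligned
 * 2-page block holds both pfn 0x101 and 0x102; the flush then covers
 * pfns 0x100-0x103.  iommu_flush_iotlb_psi() rounds addr down to that
 * boundary, and falls back to a domain-selective flush when the order
 * exceeds cap_max_amask_val() or PSI is not supported at all.
 */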
456 void iommu_flush_all(void)
457 {
458 struct acpi_drhd_unit *drhd;
459 struct iommu *iommu;
460 int flush_dev_iotlb;
462 flush_all_cache();
463 for_each_drhd_unit ( drhd )
464 {
465 iommu = drhd->iommu;
466 iommu_flush_context_global(iommu, 0);
467 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
468 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
469 }
470 }
472 /* clear one page's page table */
473 static void dma_pte_clear_one(struct domain *domain, u64 addr)
474 {
475 struct hvm_iommu *hd = domain_hvm_iommu(domain);
476 struct acpi_drhd_unit *drhd;
477 struct iommu *iommu;
478 struct dma_pte *page = NULL, *pte = NULL;
479 u64 pg_maddr;
480 int flush_dev_iotlb;
482 spin_lock(&hd->mapping_lock);
483 /* get last level pte */
484 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
485 if ( pg_maddr == 0 )
486 {
487 spin_unlock(&hd->mapping_lock);
488 return;
489 }
491 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
492 pte = page + address_level_offset(addr, 1);
494 if ( !dma_pte_present(*pte) )
495 {
496 spin_unlock(&hd->mapping_lock);
497 unmap_vtd_domain_page(page);
498 return;
499 }
501 dma_clear_pte(*pte);
502 spin_unlock(&hd->mapping_lock);
503 iommu_flush_cache_entry(pte);
505 /* No need for pcidevs_lock here; it is taken on device assign/deassign. */
506 for_each_drhd_unit ( drhd )
507 {
508 iommu = drhd->iommu;
509 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
510 {
511 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
512 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
513 addr, 1, 0, flush_dev_iotlb) )
514 iommu_flush_write_buffer(iommu);
515 }
516 }
518 unmap_vtd_domain_page(page);
519 }
521 static void iommu_free_pagetable(u64 pt_maddr, int level)
522 {
523 int i;
524 struct dma_pte *pt_vaddr, *pte;
525 int next_level = level - 1;
527 if ( pt_maddr == 0 )
528 return;
530 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
532 for ( i = 0; i < PTE_NUM; i++ )
533 {
534 pte = &pt_vaddr[i];
535 if ( !dma_pte_present(*pte) )
536 continue;
538 if ( next_level >= 1 )
539 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
541 dma_clear_pte(*pte);
542 iommu_flush_cache_entry(pte);
543 }
545 unmap_vtd_domain_page(pt_vaddr);
546 free_pgtable_maddr(pt_maddr);
547 }
549 static int iommu_set_root_entry(struct iommu *iommu)
550 {
551 u32 sts;
552 unsigned long flags;
554 spin_lock(&iommu->lock);
556 if ( iommu->root_maddr == 0 )
557 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
558 if ( iommu->root_maddr == 0 )
559 {
560 spin_unlock(&iommu->lock);
561 return -ENOMEM;
562 }
564 spin_unlock(&iommu->lock);
565 spin_lock_irqsave(&iommu->register_lock, flags);
566 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
568 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
569 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
571 /* Make sure hardware completes it */
572 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
573 (sts & DMA_GSTS_RTPS), sts);
574 spin_unlock_irqrestore(&iommu->register_lock, flags);
576 return 0;
577 }
579 static void iommu_enable_translation(struct iommu *iommu)
580 {
581 u32 sts;
582 unsigned long flags;
584 dprintk(XENLOG_INFO VTDPREFIX,
585 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
586 spin_lock_irqsave(&iommu->register_lock, flags);
587 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
588 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
590 /* Make sure hardware completes it */
591 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
592 (sts & DMA_GSTS_TES), sts);
593 spin_unlock_irqrestore(&iommu->register_lock, flags);
595 /* Disable PMRs when VT-d engine takes effect per spec definition */
596 disable_pmr(iommu);
597 }
599 static void iommu_disable_translation(struct iommu *iommu)
600 {
601 u32 sts;
602 unsigned long flags;
604 spin_lock_irqsave(&iommu->register_lock, flags);
605 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
606 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
608 /* Make sure hardware completes it */
609 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
610 !(sts & DMA_GSTS_TES), sts);
611 spin_unlock_irqrestore(&iommu->register_lock, flags);
612 }
614 static struct iommu *vector_to_iommu[NR_VECTORS];
615 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
616 u8 fault_reason, u16 source_id, u64 addr)
617 {
618 dprintk(XENLOG_WARNING VTDPREFIX,
619 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
620 "iommu->reg = %p\n",
621 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
622 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
623 fault_reason, iommu->reg);
625 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
626 if ( fault_reason < 0x20 )
627 print_vtd_entries(iommu, (source_id >> 8),
628 (source_id & 0xff), (addr >> PAGE_SHIFT));
629 #endif
631 return 0;
632 }
634 static void iommu_fault_status(u32 fault_status)
635 {
636 if ( fault_status & DMA_FSTS_PFO )
637 dprintk(XENLOG_ERR VTDPREFIX,
638 "iommu_fault_status: Fault Overflow\n");
639 if ( fault_status & DMA_FSTS_PPF )
640 dprintk(XENLOG_ERR VTDPREFIX,
641 "iommu_fault_status: Primary Pending Fault\n");
642 if ( fault_status & DMA_FSTS_AFO )
643 dprintk(XENLOG_ERR VTDPREFIX,
644 "iommu_fault_status: Advanced Fault Overflow\n");
645 if ( fault_status & DMA_FSTS_APF )
646 dprintk(XENLOG_ERR VTDPREFIX,
647 "iommu_fault_status: Advanced Pending Fault\n");
648 if ( fault_status & DMA_FSTS_IQE )
649 dprintk(XENLOG_ERR VTDPREFIX,
650 "iommu_fault_status: Invalidation Queue Error\n");
651 if ( fault_status & DMA_FSTS_ICE )
652 dprintk(XENLOG_ERR VTDPREFIX,
653 "iommu_fault_status: Invalidation Completion Error\n");
654 if ( fault_status & DMA_FSTS_ITE )
655 dprintk(XENLOG_ERR VTDPREFIX,
656 "iommu_fault_status: Invalidation Time-out Error\n");
657 }
659 #define PRIMARY_FAULT_REG_LEN (16)
660 static void iommu_page_fault(int vector, void *dev_id,
661 struct cpu_user_regs *regs)
662 {
663 struct iommu *iommu = dev_id;
664 int reg, fault_index;
665 u32 fault_status;
666 unsigned long flags;
668 dprintk(XENLOG_WARNING VTDPREFIX,
669 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
671 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
673 iommu_fault_status(fault_status);
675 /* FIXME: ignore advanced fault log */
676 if ( !(fault_status & DMA_FSTS_PPF) )
677 goto clear_overflow;
679 fault_index = dma_fsts_fault_record_index(fault_status);
680 reg = cap_fault_reg_offset(iommu->cap);
681 while (1)
682 {
683 u8 fault_reason;
684 u16 source_id;
685 u32 data;
686 u64 guest_addr;
687 int type;
689 /* highest 32 bits */
690 spin_lock_irqsave(&iommu->register_lock, flags);
691 data = dmar_readl(iommu->reg, reg +
692 fault_index * PRIMARY_FAULT_REG_LEN + 12);
693 if ( !(data & DMA_FRCD_F) )
694 {
695 spin_unlock_irqrestore(&iommu->register_lock, flags);
696 break;
697 }
699 fault_reason = dma_frcd_fault_reason(data);
700 type = dma_frcd_type(data);
702 data = dmar_readl(iommu->reg, reg +
703 fault_index * PRIMARY_FAULT_REG_LEN + 8);
704 source_id = dma_frcd_source_id(data);
706 guest_addr = dmar_readq(iommu->reg, reg +
707 fault_index * PRIMARY_FAULT_REG_LEN);
708 guest_addr = dma_frcd_page_addr(guest_addr);
709 /* clear the fault */
710 dmar_writel(iommu->reg, reg +
711 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
712 spin_unlock_irqrestore(&iommu->register_lock, flags);
714 iommu_page_fault_do_one(iommu, type, fault_reason,
715 source_id, guest_addr);
717 fault_index++;
718 if ( fault_index > cap_num_fault_regs(iommu->cap) )
719 fault_index = 0;
720 }
721 clear_overflow:
722 /* clear primary fault overflow */
723 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
724 if ( fault_status & DMA_FSTS_PFO )
725 {
726 spin_lock_irqsave(&iommu->register_lock, flags);
727 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
728 spin_unlock_irqrestore(&iommu->register_lock, flags);
729 }
730 }
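/*
 * Fault-record layout assumed by the loop above (one 16-byte record per
 * PRIMARY_FAULT_REG_LEN, starting at cap_fault_reg_offset()): bytes 0-7
 * hold the faulting address (page-aligned via dma_frcd_page_addr()),
 * bytes 8-11 the source-id of the offending request, and bytes 12-15
 * the fault reason, the request type and the F bit; writing DMA_FRCD_F
 * back clears the record.  The walk stops at the first record whose F
 * bit is clear and wraps around after the last fault register.
 */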
732 static void dma_msi_unmask(unsigned int vector)
733 {
734 struct iommu *iommu = vector_to_iommu[vector];
735 unsigned long flags;
737 /* unmask it */
738 spin_lock_irqsave(&iommu->register_lock, flags);
739 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
740 spin_unlock_irqrestore(&iommu->register_lock, flags);
741 }
743 static void dma_msi_mask(unsigned int vector)
744 {
745 unsigned long flags;
746 struct iommu *iommu = vector_to_iommu[vector];
748 /* mask it */
749 spin_lock_irqsave(&iommu->register_lock, flags);
750 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
751 spin_unlock_irqrestore(&iommu->register_lock, flags);
752 }
754 static unsigned int dma_msi_startup(unsigned int vector)
755 {
756 dma_msi_unmask(vector);
757 return 0;
758 }
760 static void dma_msi_end(unsigned int vector)
761 {
762 dma_msi_unmask(vector);
763 ack_APIC_irq();
764 }
766 static void dma_msi_data_init(struct iommu *iommu, int vector)
767 {
768 u32 msi_data = 0;
769 unsigned long flags;
771 /* Fixed, edge, assert mode. Follow MSI setting */
772 msi_data |= vector & 0xff;
773 msi_data |= 1 << 14;
775 spin_lock_irqsave(&iommu->register_lock, flags);
776 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
777 spin_unlock_irqrestore(&iommu->register_lock, flags);
778 }
780 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
781 {
782 u64 msi_address;
783 unsigned long flags;
785 /* Physical, dedicated cpu. Follow MSI setting */
786 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
787 msi_address |= MSI_PHYSICAL_MODE << 2;
788 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
789 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
791 spin_lock_irqsave(&iommu->register_lock, flags);
792 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
793 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
794 spin_unlock_irqrestore(&iommu->register_lock, flags);
795 }
797 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
798 {
799 struct iommu *iommu = vector_to_iommu[vector];
800 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
801 }
803 static struct hw_interrupt_type dma_msi_type = {
804 .typename = "DMA_MSI",
805 .startup = dma_msi_startup,
806 .shutdown = dma_msi_mask,
807 .enable = dma_msi_unmask,
808 .disable = dma_msi_mask,
809 .ack = dma_msi_mask,
810 .end = dma_msi_end,
811 .set_affinity = dma_msi_set_affinity,
812 };
814 static int iommu_set_interrupt(struct iommu *iommu)
815 {
816 int vector, ret;
818 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
819 if ( vector <= 0 )
820 {
821 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
822 return -EINVAL;
823 }
825 irq_desc[vector].handler = &dma_msi_type;
826 vector_to_iommu[vector] = iommu;
827 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
828 if ( ret )
829 {
830 irq_desc[vector].handler = &no_irq_type;
831 vector_to_iommu[vector] = NULL;
832 free_irq_vector(vector);
833 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
834 return ret;
835 }
837 /* Make sure that vector is never re-used. */
838 vector_irq[vector] = NEVER_ASSIGN_IRQ;
840 return vector;
841 }
843 static int iommu_alloc(struct acpi_drhd_unit *drhd)
844 {
845 struct iommu *iommu;
846 unsigned long sagaw;
847 int agaw;
849 if ( nr_iommus > MAX_IOMMUS )
850 {
851 gdprintk(XENLOG_ERR VTDPREFIX,
852 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
853 return -ENOMEM;
854 }
856 iommu = xmalloc(struct iommu);
857 if ( iommu == NULL )
858 return -ENOMEM;
859 memset(iommu, 0, sizeof(struct iommu));
861 iommu->vector = -1; /* No vector assigned yet. */
863 iommu->intel = alloc_intel_iommu();
864 if ( iommu->intel == NULL )
865 {
866 xfree(iommu);
867 return -ENOMEM;
868 }
870 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
871 iommu->index = nr_iommus++;
873 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
874 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
876 gdprintk(XENLOG_INFO VTDPREFIX,
877 "drhd->address = %"PRIx64"\n", drhd->address);
878 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
880 /* Calculate number of pagetable levels: between 2 and 4. */
881 sagaw = cap_sagaw(iommu->cap);
882 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
883 if ( test_bit(agaw, &sagaw) )
884 break;
885 if ( agaw < 0 )
886 {
887 gdprintk(XENLOG_ERR VTDPREFIX,
888 "IOMMU: unsupported sagaw %lx\n", sagaw);
889 xfree(iommu);
890 return -ENODEV;
891 }
892 iommu->nr_pt_levels = agaw_to_level(agaw);
894 if ( !ecap_coherent(iommu->ecap) )
895 iommus_incoherent = 1;
897 spin_lock_init(&iommu->lock);
898 spin_lock_init(&iommu->register_lock);
900 drhd->iommu = iommu;
901 return 0;
902 }
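/*
 * cap_sagaw() in iommu_alloc() above is a bitmap of the page-table
 * depths this unit supports; the loop picks the deepest one usable by
 * Xen, at most 4 levels (a 48-bit address width), and records it as
 * iommu->nr_pt_levels.  Units that only support 2 or 3 levels remain
 * usable: domain_context_mapping_one() below skips the top levels of
 * the domain's page tables when attaching them to such a unit.
 */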
904 static void iommu_free(struct acpi_drhd_unit *drhd)
905 {
906 struct iommu *iommu = drhd->iommu;
908 if ( iommu == NULL )
909 return;
911 if ( iommu->root_maddr != 0 )
912 {
913 free_pgtable_maddr(iommu->root_maddr);
914 iommu->root_maddr = 0;
915 }
917 if ( iommu->reg )
918 iounmap(iommu->reg);
920 free_intel_iommu(iommu->intel);
921 release_irq_vector(iommu->vector);
922 xfree(iommu);
924 drhd->iommu = NULL;
925 }
927 #define guestwidth_to_adjustwidth(gaw) ({ \
928 int agaw, r = (gaw - 12) % 9; \
929 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
930 if ( agaw > 64 ) \
931 agaw = 64; \
932 agaw; })
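/*
 * guestwidth_to_adjustwidth() rounds a guest address width up to the
 * nearest width the page tables can express: 12 bits of page offset
 * plus a whole number of 9-bit table indexes.  For example, gaw = 48
 * gives (48 - 12) % 9 == 0 and stays 48, while gaw = 32 leaves a
 * remainder of 2 and is rounded up to 39 (a 3-level table); anything
 * above 64 is clamped to 64.
 */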
934 static int intel_iommu_domain_init(struct domain *d)
935 {
936 struct hvm_iommu *hd = domain_hvm_iommu(d);
937 struct iommu *iommu = NULL;
938 struct acpi_drhd_unit *drhd;
940 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
941 iommu = drhd->iommu;
943 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
945 if ( d->domain_id == 0 )
946 {
947 /* Set up 1:1 page table for dom0 */
948 iommu_set_dom0_mapping(d);
950 setup_dom0_devices(d);
951 setup_dom0_rmrr(d);
953 iommu_flush_all();
955 for_each_drhd_unit ( drhd )
956 {
957 iommu = drhd->iommu;
958 iommu_enable_translation(iommu);
959 }
960 }
962 return 0;
963 }
965 static int domain_context_mapping_one(
966 struct domain *domain,
967 struct iommu *iommu,
968 u8 bus, u8 devfn)
969 {
970 struct hvm_iommu *hd = domain_hvm_iommu(domain);
971 struct context_entry *context, *context_entries;
972 u64 maddr, pgd_maddr;
973 struct pci_dev *pdev = NULL;
974 int agaw;
976 ASSERT(spin_is_locked(&pcidevs_lock));
977 spin_lock(&iommu->lock);
978 maddr = bus_to_context_maddr(iommu, bus);
979 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
980 context = &context_entries[devfn];
982 if ( context_present(*context) )
983 {
984 int res = 0;
986 pdev = pci_get_pdev(bus, devfn);
987 if (!pdev)
988 res = -ENODEV;
989 else if (pdev->domain != domain)
990 res = -EINVAL;
991 unmap_vtd_domain_page(context_entries);
992 spin_unlock(&iommu->lock);
993 return res;
994 }
996 if ( iommu_passthrough && (domain->domain_id == 0) )
997 {
998 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
999 agaw = level_to_agaw(iommu->nr_pt_levels);
1001 else
1003 spin_lock(&hd->mapping_lock);
1005 /* Ensure we have pagetables allocated down to leaf PTE. */
1006 if ( hd->pgd_maddr == 0 )
1008 addr_to_dma_page_maddr(domain, 0, 1);
1009 if ( hd->pgd_maddr == 0 )
1011 nomem:
1012 spin_unlock(&hd->mapping_lock);
1013 spin_unlock(&iommu->lock);
1014 unmap_vtd_domain_page(context_entries);
1015 return -ENOMEM;
1019 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1020 pgd_maddr = hd->pgd_maddr;
1021 for ( agaw = level_to_agaw(4);
1022 agaw != level_to_agaw(iommu->nr_pt_levels);
1023 agaw-- )
1025 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1026 pgd_maddr = dma_pte_addr(*p);
1027 unmap_vtd_domain_page(p);
1028 if ( pgd_maddr == 0 )
1029 goto nomem;
1032 context_set_address_root(*context, pgd_maddr);
1033 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1034 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1035 else
1036 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1038 spin_unlock(&hd->mapping_lock);
1041 /*
1042 * domain_id 0 is not valid on Intel's IOMMU; force domain ids to
1043 * be 1-based, as required by Intel's iommu hardware.
1044 */
1045 context_set_domain_id(context, domain);
1046 context_set_address_width(*context, agaw);
1047 context_set_fault_enable(*context);
1048 context_set_present(*context);
1049 iommu_flush_cache_entry(context);
1050 spin_unlock(&iommu->lock);
1052 /* Context entry was previously non-present (with domid 0). */
1053 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1054 DMA_CCMD_MASK_NOBIT, 1) )
1055 iommu_flush_write_buffer(iommu);
1056 else
1058 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1059 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1062 set_bit(iommu->index, &hd->iommu_bitmap);
1064 unmap_vtd_domain_page(context_entries);
1066 return 0;
1069 #define PCI_BASE_CLASS_BRIDGE 0x06
1070 #define PCI_CLASS_BRIDGE_PCI 0x0604
1072 enum {
1073 DEV_TYPE_PCIe_ENDPOINT,
1074 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1075 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1076 DEV_TYPE_PCI,
1077 };
1079 int pdev_type(u8 bus, u8 devfn)
1081 u16 class_device;
1082 u16 status, creg;
1083 int pos;
1084 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1086 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1087 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1089 pos = pci_find_next_cap(bus, devfn,
1090 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1091 if ( !pos )
1092 return DEV_TYPE_PCI_BRIDGE;
1093 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1094 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1095 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1098 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1099 if ( !(status & PCI_STATUS_CAP_LIST) )
1100 return DEV_TYPE_PCI;
1102 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1103 return DEV_TYPE_PCIe_ENDPOINT;
1105 return DEV_TYPE_PCI;
1108 #define MAX_BUSES 256
1109 static DEFINE_SPINLOCK(bus2bridge_lock);
1110 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1112 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1114 int cnt = 0;
1115 *secbus = *bus;
1117 ASSERT(spin_is_locked(&bus2bridge_lock));
1118 if ( !bus2bridge[*bus].map )
1119 return 0;
1121 while ( bus2bridge[*bus].map )
1123 *secbus = *bus;
1124 *devfn = bus2bridge[*bus].devfn;
1125 *bus = bus2bridge[*bus].bus;
1126 if ( cnt++ >= MAX_BUSES )
1127 return 0;
1130 return 1;
1133 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1135 int ret = 0;
1137 if ( *bus == 0 )
1138 /* assume integrated PCI devices in RC have valid requester-id */
1139 return 1;
1141 spin_lock(&bus2bridge_lock);
1142 ret = _find_pcie_endpoint(bus, devfn, secbus);
1143 spin_unlock(&bus2bridge_lock);
1145 return ret;
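/*
 * bus2bridge[] records, for every secondary bus, the PCIe-to-PCI(x)
 * bridge above it (filled in by domain_context_mapping() for
 * DEV_TYPE_PCI_BRIDGE devices).  Conventional PCI devices behind such a
 * bridge issue DMA tagged with the bridge's requester-id (or, on some
 * chipsets, with devfn 0 on the secondary bus, see the comment in
 * domain_context_mapping() below), so find_pcie_endpoint() walks up the
 * chain to the topmost recorded bridge and returns its bus/devfn;
 * intel_iommu_group_id() uses this so a legacy device and its bridge
 * land in the same device group, and domain_context_mapping()/unmap()
 * do the same walk inline when programming context entries.  The cnt
 * counter merely bails out if a malformed table would loop forever.
 */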
1148 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1150 struct acpi_drhd_unit *drhd;
1151 int ret = 0;
1152 u16 sec_bus, sub_bus;
1153 u32 type;
1154 u8 secbus, secdevfn;
1155 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1157 if ( pdev == NULL )
1159 /* We can reach here by setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1160 * -> domain_context_mapping().
1161 * If a user enables VT-d but disables USB (which usually needs an
1162 * RMRR) in the BIOS, we can't discover the BDF of the USB controller in
1163 * setup_dom0_devices(), but the ACPI RMRR structures may still contain
1164 * the BDF, so pci_get_pdev() returns NULL here.
1165 */
1166 gdprintk(XENLOG_WARNING VTDPREFIX,
1167 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1168 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1169 return 0;
1172 drhd = acpi_find_matched_drhd_unit(pdev);
1173 if ( !drhd )
1174 return -ENODEV;
1176 ASSERT(spin_is_locked(&pcidevs_lock));
1178 type = pdev_type(bus, devfn);
1179 switch ( type )
1181 case DEV_TYPE_PCIe_BRIDGE:
1182 break;
1184 case DEV_TYPE_PCI_BRIDGE:
1185 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1186 PCI_SECONDARY_BUS);
1187 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1188 PCI_SUBORDINATE_BUS);
1190 spin_lock(&bus2bridge_lock);
1191 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1193 bus2bridge[sec_bus].map = 1;
1194 bus2bridge[sec_bus].bus = bus;
1195 bus2bridge[sec_bus].devfn = devfn;
1197 spin_unlock(&bus2bridge_lock);
1198 break;
1200 case DEV_TYPE_PCIe_ENDPOINT:
1201 gdprintk(XENLOG_INFO VTDPREFIX,
1202 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1203 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1204 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1205 break;
1207 case DEV_TYPE_PCI:
1208 gdprintk(XENLOG_INFO VTDPREFIX,
1209 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1210 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1212 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1213 if ( ret )
1214 break;
1216 secbus = bus;
1217 secdevfn = devfn;
1218 /* dependent devices mapping */
1219 while ( bus2bridge[bus].map )
1221 secbus = bus;
1222 secdevfn = devfn;
1223 devfn = bus2bridge[bus].devfn;
1224 bus = bus2bridge[bus].bus;
1225 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1226 if ( ret )
1227 return ret;
1230 if ( (secbus != bus) && (secdevfn != 0) )
1231 /*
1232 * The source-id for transactions on non-PCIe buses seems
1233 * to originate from devfn=0 on the secondary bus behind
1234 * the bridge. Map that id as well. The id to use in
1235 * these scenarios is not particularly well documented
1236 * anywhere.
1237 */
1238 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1239 break;
1241 default:
1242 gdprintk(XENLOG_ERR VTDPREFIX,
1243 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1244 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1245 ret = -EINVAL;
1246 break;
1249 return ret;
1252 static int domain_context_unmap_one(
1253 struct domain *domain,
1254 struct iommu *iommu,
1255 u8 bus, u8 devfn)
1257 struct context_entry *context, *context_entries;
1258 u64 maddr;
1260 ASSERT(spin_is_locked(&pcidevs_lock));
1261 spin_lock(&iommu->lock);
1263 maddr = bus_to_context_maddr(iommu, bus);
1264 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1265 context = &context_entries[devfn];
1267 if ( !context_present(*context) )
1269 spin_unlock(&iommu->lock);
1270 unmap_vtd_domain_page(context_entries);
1271 return 0;
1274 context_clear_present(*context);
1275 context_clear_entry(*context);
1276 iommu_flush_cache_entry(context);
1278 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1279 (((u16)bus) << 8) | devfn,
1280 DMA_CCMD_MASK_NOBIT, 0) )
1281 iommu_flush_write_buffer(iommu);
1282 else
1284 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1285 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1288 spin_unlock(&iommu->lock);
1289 unmap_vtd_domain_page(context_entries);
1291 return 0;
1294 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1296 struct acpi_drhd_unit *drhd;
1297 int ret = 0;
1298 u32 type;
1299 u8 secbus, secdevfn;
1300 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1302 BUG_ON(!pdev);
1304 drhd = acpi_find_matched_drhd_unit(pdev);
1305 if ( !drhd )
1306 return -ENODEV;
1308 type = pdev_type(bus, devfn);
1309 switch ( type )
1311 case DEV_TYPE_PCIe_BRIDGE:
1312 case DEV_TYPE_PCI_BRIDGE:
1313 break;
1315 case DEV_TYPE_PCIe_ENDPOINT:
1316 gdprintk(XENLOG_INFO VTDPREFIX,
1317 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1318 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1319 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1320 break;
1322 case DEV_TYPE_PCI:
1323 gdprintk(XENLOG_INFO VTDPREFIX,
1324 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1325 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1326 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1327 if ( ret )
1328 break;
1330 secbus = bus;
1331 secdevfn = devfn;
1332 /* dependent devices unmapping */
1333 while ( bus2bridge[bus].map )
1335 secbus = bus;
1336 secdevfn = devfn;
1337 devfn = bus2bridge[bus].devfn;
1338 bus = bus2bridge[bus].bus;
1339 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1340 if ( ret )
1341 return ret;
1344 if ( (secbus != bus) && (secdevfn != 0) )
1345 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1346 break;
1348 default:
1349 gdprintk(XENLOG_ERR VTDPREFIX,
1350 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1351 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1352 ret = -EINVAL;
1353 break;
1356 return ret;
1359 static int reassign_device_ownership(
1360 struct domain *source,
1361 struct domain *target,
1362 u8 bus, u8 devfn)
1364 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1365 struct pci_dev *pdev;
1366 struct acpi_drhd_unit *drhd;
1367 struct iommu *pdev_iommu;
1368 int ret, found = 0;
1370 ASSERT(spin_is_locked(&pcidevs_lock));
1371 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1373 if (!pdev)
1374 return -ENODEV;
1376 drhd = acpi_find_matched_drhd_unit(pdev);
1377 pdev_iommu = drhd->iommu;
1378 domain_context_unmap(source, bus, devfn);
1380 ret = domain_context_mapping(target, bus, devfn);
1381 if ( ret )
1382 return ret;
1384 list_move(&pdev->domain_list, &target->arch.pdev_list);
1385 pdev->domain = target;
1387 for_each_pdev ( source, pdev )
1389 drhd = acpi_find_matched_drhd_unit(pdev);
1390 if ( drhd->iommu == pdev_iommu )
1392 found = 1;
1393 break;
1397 if ( !found )
1398 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1400 return ret;
1403 void iommu_domain_teardown(struct domain *d)
1405 struct hvm_iommu *hd = domain_hvm_iommu(d);
1407 if ( list_empty(&acpi_drhd_units) )
1408 return;
1410 spin_lock(&hd->mapping_lock);
1411 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1412 hd->pgd_maddr = 0;
1413 spin_unlock(&hd->mapping_lock);
1415 iommu_domid_release(d);
1418 int intel_iommu_map_page(
1419 struct domain *d, unsigned long gfn, unsigned long mfn)
1421 struct hvm_iommu *hd = domain_hvm_iommu(d);
1422 struct acpi_drhd_unit *drhd;
1423 struct iommu *iommu;
1424 struct dma_pte *page = NULL, *pte = NULL;
1425 u64 pg_maddr;
1426 int pte_present;
1427 int flush_dev_iotlb;
1429 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1430 iommu = drhd->iommu;
1432 /* do nothing if dom0 and iommu supports pass thru */
1433 if ( iommu_passthrough && (d->domain_id == 0) )
1434 return 0;
1436 spin_lock(&hd->mapping_lock);
1438 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1439 if ( pg_maddr == 0 )
1441 spin_unlock(&hd->mapping_lock);
1442 return -ENOMEM;
1444 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1445 pte = page + (gfn & LEVEL_MASK);
1446 pte_present = dma_pte_present(*pte);
1447 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1448 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1450 /* Set the SNP on leaf page table if Snoop Control available */
1451 if ( iommu_snoop )
1452 dma_set_pte_snp(*pte);
1454 iommu_flush_cache_entry(pte);
1455 spin_unlock(&hd->mapping_lock);
1456 unmap_vtd_domain_page(page);
1458 /*
1459 * No need for pcidevs_lock here because the flush is done
1460 * when assigning/deassigning a device
1461 */
1462 for_each_drhd_unit ( drhd )
1464 iommu = drhd->iommu;
1466 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1467 continue;
1469 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1470 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1471 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1472 !pte_present, flush_dev_iotlb) )
1473 iommu_flush_write_buffer(iommu);
1476 return 0;
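/*
 * Example: mapping gfn 0x12345 to mfn 0x6789a walks (or allocates) the
 * tables down to the leaf with addr_to_dma_page_maddr(d, 0x12345000, 1),
 * picks entry 0x12345 & LEVEL_MASK (the low nine index bits) == 0x145 in
 * that 512-entry leaf table, and stores 0x6789a000 with the read/write
 * (and, with Snoop Control, the SNP) bits set.  The IOTLB is then
 * flushed on every unit the domain is attached to; flush_non_present_entry
 * is passed as !pte_present because an entry that was previously
 * non-present only needs flushing on caching-mode hardware.
 */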
1479 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1481 struct acpi_drhd_unit *drhd;
1482 struct iommu *iommu;
1484 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1485 iommu = drhd->iommu;
1487 /* do nothing if dom0 and iommu supports pass thru */
1488 if ( iommu_passthrough && (d->domain_id == 0) )
1489 return 0;
1491 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1493 return 0;
1496 static int iommu_prepare_rmrr_dev(struct domain *d,
1497 struct acpi_rmrr_unit *rmrr,
1498 u8 bus, u8 devfn)
1500 int ret = 0;
1501 u64 base, end;
1502 unsigned long base_pfn, end_pfn;
1504 ASSERT(spin_is_locked(&pcidevs_lock));
1505 ASSERT(rmrr->base_address < rmrr->end_address);
1507 base = rmrr->base_address & PAGE_MASK_4K;
1508 base_pfn = base >> PAGE_SHIFT_4K;
1509 end = PAGE_ALIGN_4K(rmrr->end_address);
1510 end_pfn = end >> PAGE_SHIFT_4K;
1512 while ( base_pfn < end_pfn )
1514 intel_iommu_map_page(d, base_pfn, base_pfn);
1515 base_pfn++;
1518 ret = domain_context_mapping(d, bus, devfn);
1520 return ret;
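/*
 * RMRRs (Reserved Memory Region Reporting structures) describe ranges
 * that devices such as USB controllers keep DMAing into (legacy
 * keyboard emulation, for instance), so the owning domain gets a 1:1
 * mapping for them.  The rounding above is inclusive of both ends: a
 * hypothetical report of 0xbf7ec000 - 0xbf7ee3ff gives base_pfn 0xbf7ec
 * and end_pfn 0xbf7ef, so pfns 0xbf7ec through 0xbf7ee are
 * identity-mapped before the device's context entry is set up.
 */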
1523 static int intel_iommu_add_device(struct pci_dev *pdev)
1525 struct acpi_rmrr_unit *rmrr;
1526 u16 bdf;
1527 int ret, i;
1529 ASSERT(spin_is_locked(&pcidevs_lock));
1531 if ( !pdev->domain )
1532 return -EINVAL;
1534 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1535 if ( ret )
1537 gdprintk(XENLOG_ERR VTDPREFIX,
1538 "intel_iommu_add_device: context mapping failed\n");
1539 return ret;
1542 for_each_rmrr_device ( rmrr, bdf, i )
1544 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1546 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1547 pdev->bus, pdev->devfn);
1548 if ( ret )
1549 gdprintk(XENLOG_ERR VTDPREFIX,
1550 "intel_iommu_add_device: RMRR mapping failed\n");
1551 break;
1555 return ret;
1558 static int intel_iommu_remove_device(struct pci_dev *pdev)
1560 struct acpi_rmrr_unit *rmrr;
1561 u16 bdf;
1562 int i;
1564 if ( !pdev->domain )
1565 return -EINVAL;
1567 /* If the device belongs to dom0, and it has RMRR, don't remove it
1568 * from dom0, because BIOS may use RMRR at booting time.
1569 */
1570 if ( pdev->domain->domain_id == 0 )
1572 for_each_rmrr_device ( rmrr, bdf, i )
1574 if ( PCI_BUS(bdf) == pdev->bus &&
1575 PCI_DEVFN2(bdf) == pdev->devfn )
1576 return 0;
1580 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1583 static void setup_dom0_devices(struct domain *d)
1585 struct hvm_iommu *hd;
1586 struct pci_dev *pdev;
1587 int bus, dev, func;
1588 u32 l;
1590 hd = domain_hvm_iommu(d);
1592 spin_lock(&pcidevs_lock);
1593 for ( bus = 0; bus < 256; bus++ )
1595 for ( dev = 0; dev < 32; dev++ )
1597 for ( func = 0; func < 8; func++ )
1599 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1600 /* some broken boards return 0 or ~0 if a slot is empty: */
1601 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1602 (l == 0x0000ffff) || (l == 0xffff0000) )
1603 continue;
1605 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1606 pdev->domain = d;
1607 list_add(&pdev->domain_list, &d->arch.pdev_list);
1608 domain_context_mapping(d, pdev->bus, pdev->devfn);
1609 if ( ats_device(0, pdev->bus, pdev->devfn) )
1610 enable_ats_device(0, pdev->bus, pdev->devfn);
1614 spin_unlock(&pcidevs_lock);
1617 void clear_fault_bits(struct iommu *iommu)
1619 u64 val;
1620 unsigned long flags;
1622 spin_lock_irqsave(&iommu->register_lock, flags);
1623 val = dmar_readq(
1624 iommu->reg,
1625 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1626 dmar_writeq(
1627 iommu->reg,
1628 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1629 val);
1630 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1631 spin_unlock_irqrestore(&iommu->register_lock, flags);
1634 static int init_vtd_hw(void)
1636 struct acpi_drhd_unit *drhd;
1637 struct iommu *iommu;
1638 struct iommu_flush *flush = NULL;
1639 int vector;
1640 int ret;
1641 unsigned long flags;
1643 for_each_drhd_unit ( drhd )
1645 iommu = drhd->iommu;
1646 if ( iommu->vector < 0 )
1648 vector = iommu_set_interrupt(iommu);
1649 if ( vector < 0 )
1651 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1652 return vector;
1654 iommu->vector = vector;
1656 dma_msi_data_init(iommu, iommu->vector);
1657 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1658 clear_fault_bits(iommu);
1660 spin_lock_irqsave(&iommu->register_lock, flags);
1661 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1662 spin_unlock_irqrestore(&iommu->register_lock, flags);
1664 /* initialize flush functions */
1665 flush = iommu_get_flush(iommu);
1666 flush->context = flush_context_reg;
1667 flush->iotlb = flush_iotlb_reg;
1670 if ( iommu_qinval )
1672 for_each_drhd_unit ( drhd )
1674 iommu = drhd->iommu;
1675 if ( enable_qinval(iommu) != 0 )
1677 dprintk(XENLOG_INFO VTDPREFIX,
1678 "Failed to enable Queued Invalidation!\n");
1679 break;
1684 if ( iommu_intremap )
1686 for_each_drhd_unit ( drhd )
1688 iommu = drhd->iommu;
1689 if ( enable_intremap(iommu) != 0 )
1691 dprintk(XENLOG_INFO VTDPREFIX,
1692 "Failed to enable Interrupt Remapping!\n");
1693 break;
1698 for_each_drhd_unit ( drhd )
1700 iommu = drhd->iommu;
1701 ret = iommu_set_root_entry(iommu);
1702 if ( ret )
1704 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1705 return -EIO;
1709 /*
1710 * After setting the root entry, we must globally invalidate the
1711 * context cache and then globally invalidate the IOTLB
1712 */
1713 iommu_flush_all();
1715 return 0;
1718 static void setup_dom0_rmrr(struct domain *d)
1720 struct acpi_rmrr_unit *rmrr;
1721 u16 bdf;
1722 int ret, i;
1724 spin_lock(&pcidevs_lock);
1725 for_each_rmrr_device ( rmrr, bdf, i )
1727 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1728 if ( ret )
1729 gdprintk(XENLOG_ERR VTDPREFIX,
1730 "IOMMU: mapping reserved region failed\n");
1732 spin_unlock(&pcidevs_lock);
1735 static void platform_quirks(void)
1737 u32 id;
1739 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1740 id = pci_conf_read32(0, 0, 0, 0);
1741 if ( id == 0x2a408086 )
1743 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1744 rwbf_quirk = 1;
1748 int intel_vtd_setup(void)
1750 struct acpi_drhd_unit *drhd;
1751 struct iommu *iommu;
1753 if ( !iommu_enabled )
1754 return -ENODEV;
1756 platform_quirks();
1758 spin_lock_init(&domid_bitmap_lock);
1759 clflush_size = get_cache_line_size();
1761 /* We enable the following features only if they are supported by all VT-d
1762 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1763 * Interrupt Remapping.
1764 */
1765 for_each_drhd_unit ( drhd )
1767 if ( iommu_alloc(drhd) != 0 )
1768 goto error;
1770 iommu = drhd->iommu;
1772 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1773 iommu_snoop = 0;
1775 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1776 iommu_passthrough = 0;
1778 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1779 iommu_qinval = 0;
1781 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1782 iommu_intremap = 0;
1785 if ( !iommu_qinval && iommu_intremap )
1787 iommu_intremap = 0;
1788 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1789 "since Queued Invalidation isn't supported or enabled.\n");
1792 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1793 P(iommu_snoop, "Snoop Control");
1794 P(iommu_passthrough, "DMA Passthrough");
1795 P(iommu_qinval, "Queued Invalidation");
1796 P(iommu_intremap, "Interrupt Remapping");
1797 #undef P
1799 /* Allocate IO page directory page for the domain. */
1800 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1801 iommu = drhd->iommu;
1803 /* Allocate domain id bitmap, and set bit 0 as reserved */
1804 domid_bitmap_size = cap_ndoms(iommu->cap);
1805 domid_bitmap = xmalloc_array(unsigned long,
1806 BITS_TO_LONGS(domid_bitmap_size));
1807 if ( domid_bitmap == NULL )
1808 goto error;
1809 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1810 set_bit(0, domid_bitmap);
1812 if ( init_vtd_hw() )
1813 goto error;
1815 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1817 return 0;
1819 error:
1820 for_each_drhd_unit ( drhd )
1821 iommu_free(drhd);
1822 iommu_enabled = 0;
1823 iommu_snoop = 0;
1824 iommu_passthrough = 0;
1825 iommu_qinval = 0;
1826 iommu_intremap = 0;
1827 return -ENOMEM;
1830 /*
1831 * If the device isn't owned by dom0, it has already been
1832 * assigned to another domain, or it does not exist.
1833 */
1834 int device_assigned(u8 bus, u8 devfn)
1836 struct pci_dev *pdev;
1838 spin_lock(&pcidevs_lock);
1839 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1840 if (!pdev)
1842 spin_unlock(&pcidevs_lock);
1843 return -1;
1846 spin_unlock(&pcidevs_lock);
1847 return 0;
1850 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1852 struct acpi_rmrr_unit *rmrr;
1853 int ret = 0, i;
1854 struct pci_dev *pdev;
1855 u16 bdf;
1857 if ( list_empty(&acpi_drhd_units) )
1858 return -ENODEV;
1860 ASSERT(spin_is_locked(&pcidevs_lock));
1861 pdev = pci_get_pdev(bus, devfn);
1862 if (!pdev)
1863 return -ENODEV;
1865 if (pdev->domain != dom0)
1867 gdprintk(XENLOG_ERR VTDPREFIX,
1868 "IOMMU: assign a assigned device\n");
1869 return -EBUSY;
1872 ret = reassign_device_ownership(dom0, d, bus, devfn);
1873 if ( ret )
1874 goto done;
1876 /* Setup rmrr identity mapping */
1877 for_each_rmrr_device( rmrr, bdf, i )
1879 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1881 /* FIXME: Because USB RMRR conflicts with guest bios region,
1882 * ignore USB RMRR temporarily.
1883 */
1884 if ( is_usb_device(bus, devfn) )
1886 ret = 0;
1887 goto done;
1890 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1891 if ( ret )
1892 gdprintk(XENLOG_ERR VTDPREFIX,
1893 "IOMMU: mapping reserved region failed\n");
1894 goto done;
1898 done:
1899 return ret;
1902 static int intel_iommu_group_id(u8 bus, u8 devfn)
1904 u8 secbus;
1905 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1906 return PCI_BDF2(bus, devfn);
1907 else
1908 return -1;
1911 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1912 void iommu_suspend(void)
1914 struct acpi_drhd_unit *drhd;
1915 struct iommu *iommu;
1916 u32 i;
1918 if ( !iommu_enabled )
1919 return;
1921 iommu_flush_all();
1923 for_each_drhd_unit ( drhd )
1925 iommu = drhd->iommu;
1926 i = iommu->index;
1928 iommu_state[i][DMAR_FECTL_REG] =
1929 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1930 iommu_state[i][DMAR_FEDATA_REG] =
1931 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1932 iommu_state[i][DMAR_FEADDR_REG] =
1933 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1934 iommu_state[i][DMAR_FEUADDR_REG] =
1935 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1937 iommu_disable_translation(iommu);
1939 if ( iommu_intremap )
1940 disable_intremap(iommu);
1942 if ( iommu_qinval )
1943 disable_qinval(iommu);
1947 void iommu_resume(void)
1949 struct acpi_drhd_unit *drhd;
1950 struct iommu *iommu;
1951 u32 i;
1952 unsigned long flags;
1954 if ( !iommu_enabled )
1955 return;
1957 if ( init_vtd_hw() != 0 && force_iommu )
1958 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1960 for_each_drhd_unit ( drhd )
1962 iommu = drhd->iommu;
1963 i = iommu->index;
1965 spin_lock_irqsave(&iommu->register_lock, flags);
1966 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1967 (u32) iommu_state[i][DMAR_FECTL_REG]);
1968 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1969 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1970 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1971 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1972 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1973 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1974 spin_unlock_irqrestore(&iommu->register_lock, flags);
1976 iommu_enable_translation(iommu);
1980 struct iommu_ops intel_iommu_ops = {
1981 .init = intel_iommu_domain_init,
1982 .add_device = intel_iommu_add_device,
1983 .remove_device = intel_iommu_remove_device,
1984 .assign_device = intel_iommu_assign_device,
1985 .teardown = iommu_domain_teardown,
1986 .map_page = intel_iommu_map_page,
1987 .unmap_page = intel_iommu_unmap_page,
1988 .reassign_device = reassign_device_ownership,
1989 .get_device_group_id = intel_iommu_group_id,
1990 .update_ire_from_apic = io_apic_write_remap_rte,
1991 .update_ire_from_msi = msi_msg_write_remap_rte,
1992 .read_apic_from_ire = io_apic_read_remap_rte,
1993 .read_msi_from_ire = msi_msg_read_remap_rte,
1994 };
1996 /*
1997 * Local variables:
1998 * mode: C
1999 * c-set-style: "BSD"
2000 * c-basic-offset: 4
2001 * tab-width: 4
2002 * indent-tabs-mode: nil
2003 * End:
2004 */