ia64/xen-unstable

view xen/drivers/passthrough/vtd/iommu.c @ 19805:2f1fa2215e60

VT-d: pci code cleanup

This patch moves the pci code from iommu.c to pci.c. Instead of setting
up the pci hierarchy in the bus2bridge array in iommu_context_mapping,
call scan_pci_devices once to add all existing PCI devices in the system
to alldevs_list and to set up the pci hierarchy in bus2bridge. In
addition, implement find_upstream_bridge to find the upstream
PCIe-to-PCI/PCIx bridge or legacy PCI bridge of a PCI device, so that
context map/unmap for a PCI device, including source-id setting, can be
handled cleanly.

Signed-off-by: Weidong Han <weidong.han@intel.com>
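
The calling pattern this enables is roughly the sketch below (an
illustration only, not part of the changeset; map_one is a placeholder
standing in for a per-requester-id mapping step, which in the real code
is domain_context_mapping_one() with its domain and IOMMU arguments, and
only the find_upstream_bridge() prototype used later in this file is
assumed):

    static int map_with_upstream_bridge(u8 bus, u8 devfn,
                                        int (*map_one)(u8 bus, u8 devfn))
    {
        u8 secbus;
        int ret = map_one(bus, devfn);     /* the device's own requester id */

        if ( ret )
            return ret;

        /*
         * A device behind a PCIe-to-PCI/PCIx or legacy PCI bridge may issue
         * requests carrying the bridge's requester id, or devfn 0 on the
         * bridge's secondary bus, so map those ids as well.
         */
        if ( find_upstream_bridge(&bus, &devfn, &secbus) >= 1 )
        {
            ret = map_one(bus, devfn);     /* the bridge */
            if ( !ret )
                ret = map_one(secbus, 0);  /* devfn 0 on the secondary bus */
        }

        return ret;
    }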
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 19 08:45:20 2009 +0100 (2009-06-19)
parents 78962f85c562
children aa472909b39c
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
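/*
 * Allocate an IOMMU domain id for the domain on first use and program it
 * into the DID field of the context entry.
 */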
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
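/*
 * Flush the given range from the CPU caches one cache line at a time;
 * a no-op unless at least one IOMMU is not DMA-coherent.
 */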
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
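/*
 * Return the machine address of the context-entry table for 'bus',
 * allocating and installing a new table in the root entry if none is
 * present yet.
 */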
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
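/*
 * Walk (and, if 'alloc' is set, populate) the VT-d page tables for 'addr'
 * and return the machine address of the leaf (level 1) page-table page,
 * or 0 on failure.
 */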
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * higher level tables always set r/w; the last level
200 * page table controls read/write
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flags;
234 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
235 return;
237 spin_lock_irqsave(&iommu->register_lock, flags);
238 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
239 dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
241 /* Make sure the hardware has completed it */
242 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
243 !(val & DMA_GSTS_WBFS), val);
245 spin_unlock_irqrestore(&iommu->register_lock, flags);
246 }
248 /* the return value determines whether we need a write buffer flush */
249 static int flush_context_reg(
250 void *_iommu,
251 u16 did, u16 source_id, u8 function_mask, u64 type,
252 int flush_non_present_entry)
253 {
254 struct iommu *iommu = (struct iommu *) _iommu;
255 u64 val = 0;
256 unsigned long flags;
258 /*
259 * In the non-present entry flush case: if the hardware doesn't cache
260 * non-present entries we do nothing; if it does cache them, we flush
261 * the entries of domain 0 (that domain id is used to cache any
262 * non-present entries)
263 */
264 if ( flush_non_present_entry )
265 {
266 if ( !cap_caching_mode(iommu->cap) )
267 return 1;
268 else
269 did = 0;
270 }
272 /* use register invalidation */
273 switch ( type )
274 {
275 case DMA_CCMD_GLOBAL_INVL:
276 val = DMA_CCMD_GLOBAL_INVL;
277 break;
278 case DMA_CCMD_DOMAIN_INVL:
279 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
280 break;
281 case DMA_CCMD_DEVICE_INVL:
282 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
283 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
284 break;
285 default:
286 BUG();
287 }
288 val |= DMA_CCMD_ICC;
290 spin_lock_irqsave(&iommu->register_lock, flags);
291 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
293 /* Make sure the hardware has completed it */
294 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
295 !(val & DMA_CCMD_ICC), val);
297 spin_unlock_irqrestore(&iommu->register_lock, flags);
298 /* flushing a context entry implicitly flushes the write buffer */
299 return 0;
300 }
302 static int inline iommu_flush_context_global(
303 struct iommu *iommu, int flush_non_present_entry)
304 {
305 struct iommu_flush *flush = iommu_get_flush(iommu);
306 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
307 flush_non_present_entry);
308 }
310 static int inline iommu_flush_context_domain(
311 struct iommu *iommu, u16 did, int flush_non_present_entry)
312 {
313 struct iommu_flush *flush = iommu_get_flush(iommu);
314 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
315 flush_non_present_entry);
316 }
318 static int inline iommu_flush_context_device(
319 struct iommu *iommu, u16 did, u16 source_id,
320 u8 function_mask, int flush_non_present_entry)
321 {
322 struct iommu_flush *flush = iommu_get_flush(iommu);
323 return flush->context(iommu, did, source_id, function_mask,
324 DMA_CCMD_DEVICE_INVL,
325 flush_non_present_entry);
326 }
328 /* the return value determines whether we need a write buffer flush */
329 static int flush_iotlb_reg(void *_iommu, u16 did,
330 u64 addr, unsigned int size_order, u64 type,
331 int flush_non_present_entry, int flush_dev_iotlb)
332 {
333 struct iommu *iommu = (struct iommu *) _iommu;
334 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
335 u64 val = 0, val_iva = 0;
336 unsigned long flags;
338 /*
339 * In the non-present entry flush case: if the hardware doesn't cache
340 * non-present entries we do nothing; if it does cache them, we flush
341 * the entries of domain 0 (that domain id is used to cache any
342 * non-present entries)
343 */
344 if ( flush_non_present_entry )
345 {
346 if ( !cap_caching_mode(iommu->cap) )
347 return 1;
348 else
349 did = 0;
350 }
352 /* use register invalidation */
353 switch ( type )
354 {
355 case DMA_TLB_GLOBAL_FLUSH:
356 /* a global flush doesn't need to set IVA_REG */
357 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
358 break;
359 case DMA_TLB_DSI_FLUSH:
360 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
361 break;
362 case DMA_TLB_PSI_FLUSH:
363 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
364 /* Note: always flush non-leaf currently */
365 val_iva = size_order | addr;
366 break;
367 default:
368 BUG();
369 }
370 /* Note: set drain read/write */
371 if ( cap_read_drain(iommu->cap) )
372 val |= DMA_TLB_READ_DRAIN;
373 if ( cap_write_drain(iommu->cap) )
374 val |= DMA_TLB_WRITE_DRAIN;
376 spin_lock_irqsave(&iommu->register_lock, flags);
377 /* Note: Only uses first TLB reg currently */
378 if ( val_iva )
379 dmar_writeq(iommu->reg, tlb_offset, val_iva);
380 dmar_writeq(iommu->reg, tlb_offset + 8, val);
382 /* Make sure the hardware has completed it */
383 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
384 !(val & DMA_TLB_IVT), val);
385 spin_unlock_irqrestore(&iommu->register_lock, flags);
387 /* check IOTLB invalidation granularity */
388 if ( DMA_TLB_IAIG(val) == 0 )
389 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
391 /* flushing an iotlb entry implicitly flushes the write buffer */
392 return 0;
393 }
395 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
396 int flush_non_present_entry, int flush_dev_iotlb)
397 {
398 struct iommu_flush *flush = iommu_get_flush(iommu);
399 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
400 flush_non_present_entry, flush_dev_iotlb);
401 }
403 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
404 int flush_non_present_entry, int flush_dev_iotlb)
405 {
406 struct iommu_flush *flush = iommu_get_flush(iommu);
407 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
408 flush_non_present_entry, flush_dev_iotlb);
409 }
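/*
 * Return the smallest t such that (base >> t) == ((base + size - 1) >> t),
 * i.e. the order of the naturally aligned power-of-two region covering
 * [base, base + size); used to compute the PSI address mask.
 */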
411 static int inline get_alignment(u64 base, unsigned int size)
412 {
413 int t = 0;
414 u64 end;
416 end = base + size - 1;
417 while ( base != end )
418 {
419 t++;
420 base >>= 1;
421 end >>= 1;
422 }
423 return t;
424 }
426 static int inline iommu_flush_iotlb_psi(
427 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
428 int flush_non_present_entry, int flush_dev_iotlb)
429 {
430 unsigned int align;
431 struct iommu_flush *flush = iommu_get_flush(iommu);
433 ASSERT(!(addr & (~PAGE_MASK_4K)));
434 ASSERT(pages > 0);
436 /* Fallback to domain selective flush if no PSI support */
437 if ( !cap_pgsel_inv(iommu->cap) )
438 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
440 /*
441 * PSI requires that the number of pages is a power of two and that
442 * the base address is naturally aligned to the size
443 */
444 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
445 /* Fallback to domain selective flush if size is too big */
446 if ( align > cap_max_amask_val(iommu->cap) )
447 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
449 addr >>= PAGE_SHIFT_4K + align;
450 addr <<= PAGE_SHIFT_4K + align;
452 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
453 flush_non_present_entry, flush_dev_iotlb);
454 }
456 void iommu_flush_all(void)
457 {
458 struct acpi_drhd_unit *drhd;
459 struct iommu *iommu;
460 int flush_dev_iotlb;
462 flush_all_cache();
463 for_each_drhd_unit ( drhd )
464 {
465 iommu = drhd->iommu;
466 iommu_flush_context_global(iommu, 0);
467 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
468 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
469 }
470 }
472 /* clear one page's page table */
473 static void dma_pte_clear_one(struct domain *domain, u64 addr)
474 {
475 struct hvm_iommu *hd = domain_hvm_iommu(domain);
476 struct acpi_drhd_unit *drhd;
477 struct iommu *iommu;
478 struct dma_pte *page = NULL, *pte = NULL;
479 u64 pg_maddr;
480 int flush_dev_iotlb;
482 spin_lock(&hd->mapping_lock);
483 /* get last level pte */
484 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
485 if ( pg_maddr == 0 )
486 {
487 spin_unlock(&hd->mapping_lock);
488 return;
489 }
491 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
492 pte = page + address_level_offset(addr, 1);
494 if ( !dma_pte_present(*pte) )
495 {
496 spin_unlock(&hd->mapping_lock);
497 unmap_vtd_domain_page(page);
498 return;
499 }
501 dma_clear_pte(*pte);
502 spin_unlock(&hd->mapping_lock);
503 iommu_flush_cache_entry(pte);
505 /* No need for pcidevs_lock here since flushing is done on device assign/deassign */
506 for_each_drhd_unit ( drhd )
507 {
508 iommu = drhd->iommu;
509 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
510 {
511 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
512 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
513 addr, 1, 0, flush_dev_iotlb) )
514 iommu_flush_write_buffer(iommu);
515 }
516 }
518 unmap_vtd_domain_page(page);
519 }
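/*
 * Recursively clear and free the VT-d page table rooted at pt_maddr;
 * 'level' is the level of that table.
 */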
521 static void iommu_free_pagetable(u64 pt_maddr, int level)
522 {
523 int i;
524 struct dma_pte *pt_vaddr, *pte;
525 int next_level = level - 1;
527 if ( pt_maddr == 0 )
528 return;
530 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
532 for ( i = 0; i < PTE_NUM; i++ )
533 {
534 pte = &pt_vaddr[i];
535 if ( !dma_pte_present(*pte) )
536 continue;
538 if ( next_level >= 1 )
539 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
541 dma_clear_pte(*pte);
542 iommu_flush_cache_entry(pte);
543 }
545 unmap_vtd_domain_page(pt_vaddr);
546 free_pgtable_maddr(pt_maddr);
547 }
549 static int iommu_set_root_entry(struct iommu *iommu)
550 {
551 u32 sts;
552 unsigned long flags;
554 spin_lock(&iommu->lock);
556 if ( iommu->root_maddr == 0 )
557 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
558 if ( iommu->root_maddr == 0 )
559 {
560 spin_unlock(&iommu->lock);
561 return -ENOMEM;
562 }
564 spin_unlock(&iommu->lock);
565 spin_lock_irqsave(&iommu->register_lock, flags);
566 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
568 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
569 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
571 /* Make sure the hardware has completed it */
572 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
573 (sts & DMA_GSTS_RTPS), sts);
574 spin_unlock_irqrestore(&iommu->register_lock, flags);
576 return 0;
577 }
579 static void iommu_enable_translation(struct iommu *iommu)
580 {
581 u32 sts;
582 unsigned long flags;
584 dprintk(XENLOG_INFO VTDPREFIX,
585 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
586 spin_lock_irqsave(&iommu->register_lock, flags);
587 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
588 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
590 /* Make sure the hardware has completed it */
591 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
592 (sts & DMA_GSTS_TES), sts);
593 spin_unlock_irqrestore(&iommu->register_lock, flags);
595 /* Disable PMRs when the VT-d engine takes effect, per the spec definition */
596 disable_pmr(iommu);
597 }
599 static void iommu_disable_translation(struct iommu *iommu)
600 {
601 u32 sts;
602 unsigned long flags;
604 spin_lock_irqsave(&iommu->register_lock, flags);
605 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
606 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
608 /* Make sure the hardware has completed it */
609 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
610 !(sts & DMA_GSTS_TES), sts);
611 spin_unlock_irqrestore(&iommu->register_lock, flags);
612 }
614 static struct iommu *vector_to_iommu[NR_VECTORS];
615 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
616 u8 fault_reason, u16 source_id, u64 addr)
617 {
618 dprintk(XENLOG_WARNING VTDPREFIX,
619 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
620 "iommu->reg = %p\n",
621 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
622 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
623 fault_reason, iommu->reg);
625 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
626 if ( fault_reason < 0x20 )
627 print_vtd_entries(iommu, (source_id >> 8),
628 (source_id & 0xff), (addr >> PAGE_SHIFT));
629 #endif
631 return 0;
632 }
634 static void iommu_fault_status(u32 fault_status)
635 {
636 if ( fault_status & DMA_FSTS_PFO )
637 dprintk(XENLOG_ERR VTDPREFIX,
638 "iommu_fault_status: Fault Overflow\n");
639 if ( fault_status & DMA_FSTS_PPF )
640 dprintk(XENLOG_ERR VTDPREFIX,
641 "iommu_fault_status: Primary Pending Fault\n");
642 if ( fault_status & DMA_FSTS_AFO )
643 dprintk(XENLOG_ERR VTDPREFIX,
644 "iommu_fault_status: Advanced Fault Overflow\n");
645 if ( fault_status & DMA_FSTS_APF )
646 dprintk(XENLOG_ERR VTDPREFIX,
647 "iommu_fault_status: Advanced Pending Fault\n");
648 if ( fault_status & DMA_FSTS_IQE )
649 dprintk(XENLOG_ERR VTDPREFIX,
650 "iommu_fault_status: Invalidation Queue Error\n");
651 if ( fault_status & DMA_FSTS_ICE )
652 dprintk(XENLOG_ERR VTDPREFIX,
653 "iommu_fault_status: Invalidation Completion Error\n");
654 if ( fault_status & DMA_FSTS_ITE )
655 dprintk(XENLOG_ERR VTDPREFIX,
656 "iommu_fault_status: Invalidation Time-out Error\n");
657 }
659 #define PRIMARY_FAULT_REG_LEN (16)
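/*
 * Fault interrupt handler: report the fault status, walk the primary
 * fault recording registers logging and clearing each pending fault,
 * then clear any primary fault overflow.
 */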
660 static void iommu_page_fault(int vector, void *dev_id,
661 struct cpu_user_regs *regs)
662 {
663 struct iommu *iommu = dev_id;
664 int reg, fault_index;
665 u32 fault_status;
666 unsigned long flags;
668 dprintk(XENLOG_WARNING VTDPREFIX,
669 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
671 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
673 iommu_fault_status(fault_status);
675 /* FIXME: ignore advanced fault log */
676 if ( !(fault_status & DMA_FSTS_PPF) )
677 goto clear_overflow;
679 fault_index = dma_fsts_fault_record_index(fault_status);
680 reg = cap_fault_reg_offset(iommu->cap);
681 while (1)
682 {
683 u8 fault_reason;
684 u16 source_id;
685 u32 data;
686 u64 guest_addr;
687 int type;
689 /* highest 32 bits */
690 spin_lock_irqsave(&iommu->register_lock, flags);
691 data = dmar_readl(iommu->reg, reg +
692 fault_index * PRIMARY_FAULT_REG_LEN + 12);
693 if ( !(data & DMA_FRCD_F) )
694 {
695 spin_unlock_irqrestore(&iommu->register_lock, flags);
696 break;
697 }
699 fault_reason = dma_frcd_fault_reason(data);
700 type = dma_frcd_type(data);
702 data = dmar_readl(iommu->reg, reg +
703 fault_index * PRIMARY_FAULT_REG_LEN + 8);
704 source_id = dma_frcd_source_id(data);
706 guest_addr = dmar_readq(iommu->reg, reg +
707 fault_index * PRIMARY_FAULT_REG_LEN);
708 guest_addr = dma_frcd_page_addr(guest_addr);
709 /* clear the fault */
710 dmar_writel(iommu->reg, reg +
711 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
712 spin_unlock_irqrestore(&iommu->register_lock, flags);
714 iommu_page_fault_do_one(iommu, type, fault_reason,
715 source_id, guest_addr);
717 fault_index++;
718 if ( fault_index > cap_num_fault_regs(iommu->cap) )
719 fault_index = 0;
720 }
721 clear_overflow:
722 /* clear primary fault overflow */
723 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
724 if ( fault_status & DMA_FSTS_PFO )
725 {
726 spin_lock_irqsave(&iommu->register_lock, flags);
727 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
728 spin_unlock_irqrestore(&iommu->register_lock, flags);
729 }
730 }
732 static void dma_msi_unmask(unsigned int vector)
733 {
734 struct iommu *iommu = vector_to_iommu[vector];
735 unsigned long flags;
737 /* unmask it */
738 spin_lock_irqsave(&iommu->register_lock, flags);
739 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
740 spin_unlock_irqrestore(&iommu->register_lock, flags);
741 }
743 static void dma_msi_mask(unsigned int vector)
744 {
745 unsigned long flags;
746 struct iommu *iommu = vector_to_iommu[vector];
748 /* mask it */
749 spin_lock_irqsave(&iommu->register_lock, flags);
750 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
751 spin_unlock_irqrestore(&iommu->register_lock, flags);
752 }
754 static unsigned int dma_msi_startup(unsigned int vector)
755 {
756 dma_msi_unmask(vector);
757 return 0;
758 }
760 static void dma_msi_end(unsigned int vector)
761 {
762 dma_msi_unmask(vector);
763 ack_APIC_irq();
764 }
766 static void dma_msi_data_init(struct iommu *iommu, int vector)
767 {
768 u32 msi_data = 0;
769 unsigned long flags;
771 /* Fixed, edge, assert mode. Follow MSI setting */
772 msi_data |= vector & 0xff;
773 msi_data |= 1 << 14;
775 spin_lock_irqsave(&iommu->register_lock, flags);
776 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
777 spin_unlock_irqrestore(&iommu->register_lock, flags);
778 }
780 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
781 {
782 u64 msi_address;
783 unsigned long flags;
785 /* Physical, dedicated cpu. Follow MSI setting */
786 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
787 msi_address |= MSI_PHYSICAL_MODE << 2;
788 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
789 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
791 spin_lock_irqsave(&iommu->register_lock, flags);
792 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
793 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
794 spin_unlock_irqrestore(&iommu->register_lock, flags);
795 }
797 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
798 {
799 struct iommu *iommu = vector_to_iommu[vector];
800 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
801 }
803 static struct hw_interrupt_type dma_msi_type = {
804 .typename = "DMA_MSI",
805 .startup = dma_msi_startup,
806 .shutdown = dma_msi_mask,
807 .enable = dma_msi_unmask,
808 .disable = dma_msi_mask,
809 .ack = dma_msi_mask,
810 .end = dma_msi_end,
811 .set_affinity = dma_msi_set_affinity,
812 };
814 static int iommu_set_interrupt(struct iommu *iommu)
815 {
816 int vector, ret;
818 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
819 if ( vector <= 0 )
820 {
821 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
822 return -EINVAL;
823 }
825 irq_desc[vector].handler = &dma_msi_type;
826 vector_to_iommu[vector] = iommu;
827 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
828 if ( ret )
829 {
830 irq_desc[vector].handler = &no_irq_type;
831 vector_to_iommu[vector] = NULL;
832 free_irq_vector(vector);
833 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
834 return ret;
835 }
837 /* Make sure that vector is never re-used. */
838 vector_irq[vector] = NEVER_ASSIGN_IRQ;
840 return vector;
841 }
843 static int iommu_alloc(struct acpi_drhd_unit *drhd)
844 {
845 struct iommu *iommu;
846 unsigned long sagaw;
847 int agaw;
849 if ( nr_iommus > MAX_IOMMUS )
850 {
851 gdprintk(XENLOG_ERR VTDPREFIX,
852 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
853 return -ENOMEM;
854 }
856 iommu = xmalloc(struct iommu);
857 if ( iommu == NULL )
858 return -ENOMEM;
859 memset(iommu, 0, sizeof(struct iommu));
861 iommu->vector = -1; /* No vector assigned yet. */
863 iommu->intel = alloc_intel_iommu();
864 if ( iommu->intel == NULL )
865 {
866 xfree(iommu);
867 return -ENOMEM;
868 }
870 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
871 iommu->index = nr_iommus++;
873 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
874 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
876 gdprintk(XENLOG_INFO VTDPREFIX,
877 "drhd->address = %"PRIx64"\n", drhd->address);
878 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
880 /* Calculate number of pagetable levels: between 2 and 4. */
881 sagaw = cap_sagaw(iommu->cap);
882 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
883 if ( test_bit(agaw, &sagaw) )
884 break;
885 if ( agaw < 0 )
886 {
887 gdprintk(XENLOG_ERR VTDPREFIX,
888 "IOMMU: unsupported sagaw %lx\n", sagaw);
889 xfree(iommu);
890 return -ENODEV;
891 }
892 iommu->nr_pt_levels = agaw_to_level(agaw);
894 if ( !ecap_coherent(iommu->ecap) )
895 iommus_incoherent = 1;
897 spin_lock_init(&iommu->lock);
898 spin_lock_init(&iommu->register_lock);
900 drhd->iommu = iommu;
901 return 0;
902 }
904 static void iommu_free(struct acpi_drhd_unit *drhd)
905 {
906 struct iommu *iommu = drhd->iommu;
908 if ( iommu == NULL )
909 return;
911 if ( iommu->root_maddr != 0 )
912 {
913 free_pgtable_maddr(iommu->root_maddr);
914 iommu->root_maddr = 0;
915 }
917 if ( iommu->reg )
918 iounmap(iommu->reg);
920 free_intel_iommu(iommu->intel);
921 release_irq_vector(iommu->vector);
922 xfree(iommu);
924 drhd->iommu = NULL;
925 }
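/*
 * Round a guest address width up to an adjusted width of the form
 * 12 + 9 * n, capped at 64 bits.
 */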
927 #define guestwidth_to_adjustwidth(gaw) ({ \
928 int agaw, r = (gaw - 12) % 9; \
929 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
930 if ( agaw > 64 ) \
931 agaw = 64; \
932 agaw; })
934 static int intel_iommu_domain_init(struct domain *d)
935 {
936 struct hvm_iommu *hd = domain_hvm_iommu(d);
937 struct iommu *iommu = NULL;
938 struct acpi_drhd_unit *drhd;
940 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
941 iommu = drhd->iommu;
943 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
945 if ( d->domain_id == 0 )
946 {
947 /* Set up 1:1 page table for dom0 */
948 iommu_set_dom0_mapping(d);
950 setup_dom0_devices(d);
951 setup_dom0_rmrr(d);
953 iommu_flush_all();
955 for_each_drhd_unit ( drhd )
956 {
957 iommu = drhd->iommu;
958 iommu_enable_translation(iommu);
959 }
960 }
962 return 0;
963 }
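/*
 * Install the context entry for (bus, devfn) on this IOMMU, pointing it
 * at the domain's page tables (or using pass-through for dom0), then
 * flush the context cache and IOTLB.
 */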
965 static int domain_context_mapping_one(
966 struct domain *domain,
967 struct iommu *iommu,
968 u8 bus, u8 devfn)
969 {
970 struct hvm_iommu *hd = domain_hvm_iommu(domain);
971 struct context_entry *context, *context_entries;
972 u64 maddr, pgd_maddr;
973 struct pci_dev *pdev = NULL;
974 int agaw;
976 ASSERT(spin_is_locked(&pcidevs_lock));
977 spin_lock(&iommu->lock);
978 maddr = bus_to_context_maddr(iommu, bus);
979 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
980 context = &context_entries[devfn];
982 if ( context_present(*context) )
983 {
984 int res = 0;
986 pdev = pci_get_pdev(bus, devfn);
987 if (!pdev)
988 res = -ENODEV;
989 else if (pdev->domain != domain)
990 res = -EINVAL;
991 unmap_vtd_domain_page(context_entries);
992 spin_unlock(&iommu->lock);
993 return res;
994 }
996 if ( iommu_passthrough && (domain->domain_id == 0) )
997 {
998 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
999 agaw = level_to_agaw(iommu->nr_pt_levels);
1001 else
1003 spin_lock(&hd->mapping_lock);
1005 /* Ensure we have pagetables allocated down to leaf PTE. */
1006 if ( hd->pgd_maddr == 0 )
1008 addr_to_dma_page_maddr(domain, 0, 1);
1009 if ( hd->pgd_maddr == 0 )
1011 nomem:
1012 spin_unlock(&hd->mapping_lock);
1013 spin_unlock(&iommu->lock);
1014 unmap_vtd_domain_page(context_entries);
1015 return -ENOMEM;
1019 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1020 pgd_maddr = hd->pgd_maddr;
1021 for ( agaw = level_to_agaw(4);
1022 agaw != level_to_agaw(iommu->nr_pt_levels);
1023 agaw-- )
1025 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1026 pgd_maddr = dma_pte_addr(*p);
1027 unmap_vtd_domain_page(p);
1028 if ( pgd_maddr == 0 )
1029 goto nomem;
1032 context_set_address_root(*context, pgd_maddr);
1033 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1034 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1035 else
1036 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1038 spin_unlock(&hd->mapping_lock);
1041 /*
1042 * Domain id 0 is not valid on Intel's IOMMU; force domain ids to
1043 * be 1-based, as required by Intel's IOMMU hardware.
1044 */
1045 context_set_domain_id(context, domain);
1046 context_set_address_width(*context, agaw);
1047 context_set_fault_enable(*context);
1048 context_set_present(*context);
1049 iommu_flush_cache_entry(context);
1050 spin_unlock(&iommu->lock);
1052 /* Context entry was previously non-present (with domid 0). */
1053 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1054 DMA_CCMD_MASK_NOBIT, 1) )
1055 iommu_flush_write_buffer(iommu);
1056 else
1058 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1059 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1062 set_bit(iommu->index, &hd->iommu_bitmap);
1064 unmap_vtd_domain_page(context_entries);
1066 return 0;
1069 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1071 struct acpi_drhd_unit *drhd;
1072 int ret = 0;
1073 u32 type;
1074 u8 secbus;
1075 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1077 if ( pdev == NULL )
1079 /* We can reach here by setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1080 * -> domain_context_mapping().
1081 * If a user enables VT-d but disables USB (which usually needs an
1082 * RMRR) in the BIOS, we can't discover the BDF of the USB controller
1083 * in setup_dom0_devices(), but the ACPI RMRR structures may still
1084 * contain that BDF, so pci_get_pdev() returns NULL here.
1085 */
1086 gdprintk(XENLOG_WARNING VTDPREFIX,
1087 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1088 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1089 return 0;
1092 drhd = acpi_find_matched_drhd_unit(pdev);
1093 if ( !drhd )
1094 return -ENODEV;
1096 ASSERT(spin_is_locked(&pcidevs_lock));
1098 type = pdev_type(bus, devfn);
1099 switch ( type )
1101 case DEV_TYPE_PCIe_BRIDGE:
1102 case DEV_TYPE_PCIe2PCI_BRIDGE:
1103 case DEV_TYPE_LEGACY_PCI_BRIDGE:
1104 break;
1106 case DEV_TYPE_PCIe_ENDPOINT:
1107 gdprintk(XENLOG_INFO VTDPREFIX,
1108 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1109 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1110 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1111 break;
1113 case DEV_TYPE_PCI:
1114 gdprintk(XENLOG_INFO VTDPREFIX,
1115 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1116 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1118 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1119 if ( ret )
1120 break;
1122 if ( find_upstream_bridge(&bus, &devfn, &secbus) < 1 )
1123 break;
1125 /* PCIe to PCI/PCIx bridge */
1126 if ( pdev_type(bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1128 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1129 if ( ret )
1130 return ret;
1132 /*
1133 * Devices behind a PCIe-to-PCI/PCIx bridge may generate a
1134 * different requester id, which may originate from devfn=0
1135 * on the secondary bus behind the bridge. Map that id
1136 * as well.
1137 */
1138 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1140 else /* Legacy PCI bridge */
1141 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1143 break;
1145 default:
1146 gdprintk(XENLOG_ERR VTDPREFIX,
1147 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1148 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1149 ret = -EINVAL;
1150 break;
1153 return ret;
1156 static int domain_context_unmap_one(
1157 struct domain *domain,
1158 struct iommu *iommu,
1159 u8 bus, u8 devfn)
1161 struct context_entry *context, *context_entries;
1162 u64 maddr;
1164 ASSERT(spin_is_locked(&pcidevs_lock));
1165 spin_lock(&iommu->lock);
1167 maddr = bus_to_context_maddr(iommu, bus);
1168 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1169 context = &context_entries[devfn];
1171 if ( !context_present(*context) )
1173 spin_unlock(&iommu->lock);
1174 unmap_vtd_domain_page(context_entries);
1175 return 0;
1178 context_clear_present(*context);
1179 context_clear_entry(*context);
1180 iommu_flush_cache_entry(context);
1182 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1183 (((u16)bus) << 8) | devfn,
1184 DMA_CCMD_MASK_NOBIT, 0) )
1185 iommu_flush_write_buffer(iommu);
1186 else
1188 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1189 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1192 spin_unlock(&iommu->lock);
1193 unmap_vtd_domain_page(context_entries);
1195 return 0;
1198 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1200 struct acpi_drhd_unit *drhd;
1201 int ret = 0;
1202 u32 type;
1203 u8 secbus;
1204 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1206 BUG_ON(!pdev);
1208 drhd = acpi_find_matched_drhd_unit(pdev);
1209 if ( !drhd )
1210 return -ENODEV;
1212 type = pdev_type(bus, devfn);
1213 switch ( type )
1215 case DEV_TYPE_PCIe_BRIDGE:
1216 case DEV_TYPE_PCIe2PCI_BRIDGE:
1217 case DEV_TYPE_LEGACY_PCI_BRIDGE:
1218 break;
1220 case DEV_TYPE_PCIe_ENDPOINT:
1221 gdprintk(XENLOG_INFO VTDPREFIX,
1222 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1223 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1224 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1225 break;
1227 case DEV_TYPE_PCI:
1228 gdprintk(XENLOG_INFO VTDPREFIX,
1229 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1230 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1231 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1232 if ( ret )
1233 break;
1235 if ( find_upstream_bridge(&bus, &devfn, &secbus) < 1 )
1236 break;
1238 /* PCIe to PCI/PCIx bridge */
1239 if ( pdev_type(bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1241 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1242 if ( ret )
1243 return ret;
1245 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1247 else /* Legacy PCI bridge */
1248 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1250 break;
1252 default:
1253 gdprintk(XENLOG_ERR VTDPREFIX,
1254 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1255 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1256 ret = -EINVAL;
1257 break;
1260 return ret;
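/*
 * Move (bus, devfn) from 'source' to 'target': unmap its context entries
 * for 'source', map them for 'target', move the pci_dev onto the target's
 * list, and clear the source's bit for this IOMMU if no other device of
 * 'source' is behind it.
 */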
1263 static int reassign_device_ownership(
1264 struct domain *source,
1265 struct domain *target,
1266 u8 bus, u8 devfn)
1268 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1269 struct pci_dev *pdev;
1270 struct acpi_drhd_unit *drhd;
1271 struct iommu *pdev_iommu;
1272 int ret, found = 0;
1274 ASSERT(spin_is_locked(&pcidevs_lock));
1275 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1277 if (!pdev)
1278 return -ENODEV;
1280 drhd = acpi_find_matched_drhd_unit(pdev);
1281 pdev_iommu = drhd->iommu;
1282 domain_context_unmap(source, bus, devfn);
1284 ret = domain_context_mapping(target, bus, devfn);
1285 if ( ret )
1286 return ret;
1288 list_move(&pdev->domain_list, &target->arch.pdev_list);
1289 pdev->domain = target;
1291 for_each_pdev ( source, pdev )
1293 drhd = acpi_find_matched_drhd_unit(pdev);
1294 if ( drhd->iommu == pdev_iommu )
1296 found = 1;
1297 break;
1301 if ( !found )
1302 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1304 return ret;
1307 void iommu_domain_teardown(struct domain *d)
1309 struct hvm_iommu *hd = domain_hvm_iommu(d);
1311 if ( list_empty(&acpi_drhd_units) )
1312 return;
1314 spin_lock(&hd->mapping_lock);
1315 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1316 hd->pgd_maddr = 0;
1317 spin_unlock(&hd->mapping_lock);
1319 iommu_domid_release(d);
1322 int intel_iommu_map_page(
1323 struct domain *d, unsigned long gfn, unsigned long mfn)
1325 struct hvm_iommu *hd = domain_hvm_iommu(d);
1326 struct acpi_drhd_unit *drhd;
1327 struct iommu *iommu;
1328 struct dma_pte *page = NULL, *pte = NULL;
1329 u64 pg_maddr;
1330 int pte_present;
1331 int flush_dev_iotlb;
1333 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1334 iommu = drhd->iommu;
1336 /* do nothing if this is dom0 and the iommu supports pass-through */
1337 if ( iommu_passthrough && (d->domain_id == 0) )
1338 return 0;
1340 spin_lock(&hd->mapping_lock);
1342 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1343 if ( pg_maddr == 0 )
1345 spin_unlock(&hd->mapping_lock);
1346 return -ENOMEM;
1348 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1349 pte = page + (gfn & LEVEL_MASK);
1350 pte_present = dma_pte_present(*pte);
1351 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1352 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1354 /* Set the SNP bit in the leaf page table entry if Snoop Control is available */
1355 if ( iommu_snoop )
1356 dma_set_pte_snp(*pte);
1358 iommu_flush_cache_entry(pte);
1359 spin_unlock(&hd->mapping_lock);
1360 unmap_vtd_domain_page(page);
1362 /*
1363 * No need for pcidevs_lock here because we flush
1364 * when a device is assigned or deassigned
1365 */
1366 for_each_drhd_unit ( drhd )
1368 iommu = drhd->iommu;
1370 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1371 continue;
1373 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1374 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1375 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1376 !pte_present, flush_dev_iotlb) )
1377 iommu_flush_write_buffer(iommu);
1380 return 0;
1383 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1385 struct acpi_drhd_unit *drhd;
1386 struct iommu *iommu;
1388 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1389 iommu = drhd->iommu;
1391 /* do nothing if this is dom0 and the iommu supports pass-through */
1392 if ( iommu_passthrough && (d->domain_id == 0) )
1393 return 0;
1395 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1397 return 0;
1400 static int iommu_prepare_rmrr_dev(struct domain *d,
1401 struct acpi_rmrr_unit *rmrr,
1402 u8 bus, u8 devfn)
1404 int ret = 0;
1405 u64 base, end;
1406 unsigned long base_pfn, end_pfn;
1408 ASSERT(spin_is_locked(&pcidevs_lock));
1409 ASSERT(rmrr->base_address < rmrr->end_address);
1411 base = rmrr->base_address & PAGE_MASK_4K;
1412 base_pfn = base >> PAGE_SHIFT_4K;
1413 end = PAGE_ALIGN_4K(rmrr->end_address);
1414 end_pfn = end >> PAGE_SHIFT_4K;
1416 while ( base_pfn < end_pfn )
1418 intel_iommu_map_page(d, base_pfn, base_pfn);
1419 base_pfn++;
1422 ret = domain_context_mapping(d, bus, devfn);
1424 return ret;
1427 static int intel_iommu_add_device(struct pci_dev *pdev)
1429 struct acpi_rmrr_unit *rmrr;
1430 u16 bdf;
1431 int ret, i;
1433 ASSERT(spin_is_locked(&pcidevs_lock));
1435 if ( !pdev->domain )
1436 return -EINVAL;
1438 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1439 if ( ret )
1441 gdprintk(XENLOG_ERR VTDPREFIX,
1442 "intel_iommu_add_device: context mapping failed\n");
1443 return ret;
1446 for_each_rmrr_device ( rmrr, bdf, i )
1448 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1450 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1451 pdev->bus, pdev->devfn);
1452 if ( ret )
1453 gdprintk(XENLOG_ERR VTDPREFIX,
1454 "intel_iommu_add_device: RMRR mapping failed\n");
1455 break;
1459 return ret;
1462 static int intel_iommu_remove_device(struct pci_dev *pdev)
1464 struct acpi_rmrr_unit *rmrr;
1465 u16 bdf;
1466 int i;
1468 if ( !pdev->domain )
1469 return -EINVAL;
1471 /* If the device belongs to dom0 and has an RMRR, don't remove it
1472 * from dom0, because the BIOS may use the RMRR at boot time.
1473 */
1474 if ( pdev->domain->domain_id == 0 )
1476 for_each_rmrr_device ( rmrr, bdf, i )
1478 if ( PCI_BUS(bdf) == pdev->bus &&
1479 PCI_DEVFN2(bdf) == pdev->devfn )
1480 return 0;
1484 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1487 static void setup_dom0_devices(struct domain *d)
1489 struct hvm_iommu *hd;
1490 struct pci_dev *pdev;
1491 int bus, devfn;
1493 hd = domain_hvm_iommu(d);
1495 spin_lock(&pcidevs_lock);
1496 for ( bus = 0; bus < 256; bus++ )
1498 for ( devfn = 0; devfn < 256; devfn++ )
1500 pdev = pci_get_pdev(bus, devfn);
1501 if ( !pdev )
1502 continue;
1504 pdev->domain = d;
1505 list_add(&pdev->domain_list, &d->arch.pdev_list);
1506 domain_context_mapping(d, pdev->bus, pdev->devfn);
1507 if ( ats_device(0, pdev->bus, pdev->devfn) )
1508 enable_ats_device(0, pdev->bus, pdev->devfn);
1511 spin_unlock(&pcidevs_lock);
1514 void clear_fault_bits(struct iommu *iommu)
1516 u64 val;
1517 unsigned long flags;
1519 spin_lock_irqsave(&iommu->register_lock, flags);
1520 val = dmar_readq(
1521 iommu->reg,
1522 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1523 dmar_writeq(
1524 iommu->reg,
1525 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1526 val);
1527 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1528 spin_unlock_irqrestore(&iommu->register_lock, flags);
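/*
 * Per-IOMMU hardware setup: fault-reporting MSI, register-based flush
 * callbacks, queued invalidation and interrupt remapping where enabled,
 * root entries, and a final global flush.
 */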
1531 static int init_vtd_hw(void)
1533 struct acpi_drhd_unit *drhd;
1534 struct iommu *iommu;
1535 struct iommu_flush *flush = NULL;
1536 int vector;
1537 int ret;
1538 unsigned long flags;
1540 for_each_drhd_unit ( drhd )
1542 iommu = drhd->iommu;
1543 if ( iommu->vector < 0 )
1545 vector = iommu_set_interrupt(iommu);
1546 if ( vector < 0 )
1548 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1549 return vector;
1551 iommu->vector = vector;
1553 dma_msi_data_init(iommu, iommu->vector);
1554 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1555 clear_fault_bits(iommu);
1557 spin_lock_irqsave(&iommu->register_lock, flags);
1558 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1559 spin_unlock_irqrestore(&iommu->register_lock, flags);
1561 /* initialize flush functions */
1562 flush = iommu_get_flush(iommu);
1563 flush->context = flush_context_reg;
1564 flush->iotlb = flush_iotlb_reg;
1567 if ( iommu_qinval )
1569 for_each_drhd_unit ( drhd )
1571 iommu = drhd->iommu;
1572 if ( enable_qinval(iommu) != 0 )
1574 dprintk(XENLOG_INFO VTDPREFIX,
1575 "Failed to enable Queued Invalidation!\n");
1576 break;
1581 if ( iommu_intremap )
1583 for_each_drhd_unit ( drhd )
1585 iommu = drhd->iommu;
1586 if ( enable_intremap(iommu) != 0 )
1588 dprintk(XENLOG_INFO VTDPREFIX,
1589 "Failed to enable Interrupt Remapping!\n");
1590 break;
1595 for_each_drhd_unit ( drhd )
1597 iommu = drhd->iommu;
1598 ret = iommu_set_root_entry(iommu);
1599 if ( ret )
1601 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1602 return -EIO;
1606 /*
1607 * After setting the root entry, we must globally invalidate the
1608 * context cache and then globally invalidate the IOTLB
1609 */
1610 iommu_flush_all();
1612 return 0;
1615 static void setup_dom0_rmrr(struct domain *d)
1617 struct acpi_rmrr_unit *rmrr;
1618 u16 bdf;
1619 int ret, i;
1621 spin_lock(&pcidevs_lock);
1622 for_each_rmrr_device ( rmrr, bdf, i )
1624 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1625 if ( ret )
1626 gdprintk(XENLOG_ERR VTDPREFIX,
1627 "IOMMU: mapping reserved region failed\n");
1629 spin_unlock(&pcidevs_lock);
1632 static void platform_quirks(void)
1634 u32 id;
1636 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1637 id = pci_conf_read32(0, 0, 0, 0);
1638 if ( id == 0x2a408086 )
1640 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1641 rwbf_quirk = 1;
1645 int intel_vtd_setup(void)
1647 struct acpi_drhd_unit *drhd;
1648 struct iommu *iommu;
1650 if ( !iommu_enabled )
1651 return -ENODEV;
1653 platform_quirks();
1655 spin_lock_init(&domid_bitmap_lock);
1656 clflush_size = get_cache_line_size();
1658 /* We enable the following features only if they are supported by all VT-d
1659 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1660 * Interrupt Remapping.
1661 */
1662 for_each_drhd_unit ( drhd )
1664 if ( iommu_alloc(drhd) != 0 )
1665 goto error;
1667 iommu = drhd->iommu;
1669 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1670 iommu_snoop = 0;
1672 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1673 iommu_passthrough = 0;
1675 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1676 iommu_qinval = 0;
1678 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1679 iommu_intremap = 0;
1682 if ( !iommu_qinval && iommu_intremap )
1684 iommu_intremap = 0;
1685 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1686 "since Queued Invalidation isn't supported or enabled.\n");
1689 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1690 P(iommu_snoop, "Snoop Control");
1691 P(iommu_passthrough, "DMA Passthrough");
1692 P(iommu_qinval, "Queued Invalidation");
1693 P(iommu_intremap, "Interrupt Remapping");
1694 #undef P
1696 /* Allocate IO page directory page for the domain. */
1697 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1698 iommu = drhd->iommu;
1700 /* Allocate domain id bitmap, and set bit 0 as reserved */
1701 domid_bitmap_size = cap_ndoms(iommu->cap);
1702 domid_bitmap = xmalloc_array(unsigned long,
1703 BITS_TO_LONGS(domid_bitmap_size));
1704 if ( domid_bitmap == NULL )
1705 goto error;
1706 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1707 set_bit(0, domid_bitmap);
1709 scan_pci_devices();
1711 if ( init_vtd_hw() )
1712 goto error;
1714 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1716 return 0;
1718 error:
1719 for_each_drhd_unit ( drhd )
1720 iommu_free(drhd);
1721 iommu_enabled = 0;
1722 iommu_snoop = 0;
1723 iommu_passthrough = 0;
1724 iommu_qinval = 0;
1725 iommu_intremap = 0;
1726 return -ENOMEM;
1729 /*
1730 * If the device isn't owned by dom0, it has already been
1731 * assigned to another domain, or it doesn't exist.
1732 */
1733 int device_assigned(u8 bus, u8 devfn)
1735 struct pci_dev *pdev;
1737 spin_lock(&pcidevs_lock);
1738 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1739 if (!pdev)
1741 spin_unlock(&pcidevs_lock);
1742 return -1;
1745 spin_unlock(&pcidevs_lock);
1746 return 0;
1749 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1751 struct acpi_rmrr_unit *rmrr;
1752 int ret = 0, i;
1753 struct pci_dev *pdev;
1754 u16 bdf;
1756 if ( list_empty(&acpi_drhd_units) )
1757 return -ENODEV;
1759 ASSERT(spin_is_locked(&pcidevs_lock));
1760 pdev = pci_get_pdev(bus, devfn);
1761 if (!pdev)
1762 return -ENODEV;
1764 if (pdev->domain != dom0)
1766 gdprintk(XENLOG_ERR VTDPREFIX,
1767 "IOMMU: assign a assigned device\n");
1768 return -EBUSY;
1771 ret = reassign_device_ownership(dom0, d, bus, devfn);
1772 if ( ret )
1773 goto done;
1775 /* Setup rmrr identity mapping */
1776 for_each_rmrr_device( rmrr, bdf, i )
1778 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1780 /* FIXME: Because the USB RMRR conflicts with the guest BIOS
1781 * region, ignore the USB RMRR temporarily.
1782 */
1783 if ( is_usb_device(bus, devfn) )
1785 ret = 0;
1786 goto done;
1789 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1790 if ( ret )
1791 gdprintk(XENLOG_ERR VTDPREFIX,
1792 "IOMMU: mapping reserved region failed\n");
1793 goto done;
1797 done:
1798 return ret;
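/*
 * Device group id: devices that may share a requester id (e.g. those
 * behind a PCIe-to-PCI/PCIx bridge) report the upstream id returned by
 * find_upstream_bridge(), so they are grouped together; -1 on failure.
 */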
1801 static int intel_iommu_group_id(u8 bus, u8 devfn)
1803 u8 secbus;
1804 if ( find_upstream_bridge(&bus, &devfn, &secbus) < 0 )
1805 return -1;
1806 else
1807 return PCI_BDF2(bus, devfn);
1810 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
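/*
 * Save each IOMMU's fault-event MSI registers and disable translation,
 * interrupt remapping and queued invalidation before suspend.
 */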
1811 void iommu_suspend(void)
1813 struct acpi_drhd_unit *drhd;
1814 struct iommu *iommu;
1815 u32 i;
1817 if ( !iommu_enabled )
1818 return;
1820 iommu_flush_all();
1822 for_each_drhd_unit ( drhd )
1824 iommu = drhd->iommu;
1825 i = iommu->index;
1827 iommu_state[i][DMAR_FECTL_REG] =
1828 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1829 iommu_state[i][DMAR_FEDATA_REG] =
1830 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1831 iommu_state[i][DMAR_FEADDR_REG] =
1832 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1833 iommu_state[i][DMAR_FEUADDR_REG] =
1834 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1836 iommu_disable_translation(iommu);
1838 if ( iommu_intremap )
1839 disable_intremap(iommu);
1841 if ( iommu_qinval )
1842 disable_qinval(iommu);
1846 void iommu_resume(void)
1848 struct acpi_drhd_unit *drhd;
1849 struct iommu *iommu;
1850 u32 i;
1851 unsigned long flags;
1853 if ( !iommu_enabled )
1854 return;
1856 if ( init_vtd_hw() != 0 && force_iommu )
1857 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1859 for_each_drhd_unit ( drhd )
1861 iommu = drhd->iommu;
1862 i = iommu->index;
1864 spin_lock_irqsave(&iommu->register_lock, flags);
1865 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1866 (u32) iommu_state[i][DMAR_FECTL_REG]);
1867 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1868 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1869 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1870 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1871 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1872 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1873 spin_unlock_irqrestore(&iommu->register_lock, flags);
1875 iommu_enable_translation(iommu);
1879 struct iommu_ops intel_iommu_ops = {
1880 .init = intel_iommu_domain_init,
1881 .add_device = intel_iommu_add_device,
1882 .remove_device = intel_iommu_remove_device,
1883 .assign_device = intel_iommu_assign_device,
1884 .teardown = iommu_domain_teardown,
1885 .map_page = intel_iommu_map_page,
1886 .unmap_page = intel_iommu_unmap_page,
1887 .reassign_device = reassign_device_ownership,
1888 .get_device_group_id = intel_iommu_group_id,
1889 .update_ire_from_apic = io_apic_write_remap_rte,
1890 .update_ire_from_msi = msi_msg_write_remap_rte,
1891 .read_apic_from_ire = io_apic_read_remap_rte,
1892 .read_msi_from_ire = msi_msg_read_remap_rte,
1893 };
1895 /*
1896 * Local variables:
1897 * mode: C
1898 * c-set-style: "BSD"
1899 * c-basic-offset: 4
1900 * tab-width: 4
1901 * indent-tabs-mode: nil
1902 * End:
1903 */