ia64/xen-unstable

view xen/drivers/passthrough/vtd/iommu.c @ 19734:4fb8a6c993e2

VT-d: correct way to submit command to GCMD register

Per the VT-d spec, software should submit only one "incremental" command
at a time to the Global Command register. The current implementation uses a
variable (gcmd) to record the state of the Global Status register, which is
error prone.

Signed-off-by: Weidong Han <weidong.han@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 09:29:42 2009 +0100 (2009-06-05)
parents a69daf23602a
children fa51db0871e1
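
For context on the fix described above: rather than tracking a software "gcmd" shadow variable, each submission reads the Global Status register, ORs in a single command bit, writes the result to the Global Command register, and then polls until the matching status bit is set. The following is a minimal sketch of that pattern, assuming the dmar_readl/dmar_writel helpers, register_lock and the IOMMU_WAIT_OP macro used elsewhere in this file; the helper name gcmd_submit_one() is illustrative only and is not part of the patch.

    /*
     * Minimal sketch of the "one incremental command at a time" pattern used
     * throughout this file (see iommu_set_root_entry() and
     * iommu_enable_translation() below). gcmd_submit_one() is a hypothetical
     * name; dmar_readl/dmar_writel, register_lock and IOMMU_WAIT_OP are
     * assumed from this file and its headers.
     */
    static void gcmd_submit_one(struct iommu *iommu, u32 cmd_bit, u32 status_bit)
    {
        u32 sts;
        unsigned long flags;

        spin_lock_irqsave(&iommu->register_lock, flags);

        /* Read the current hardware state instead of a cached gcmd value. */
        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);

        /* Submit exactly one incremental command. */
        dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | cmd_bit);

        /* Wait until hardware reports completion of this command. */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
                      (sts & status_bit), sts);

        spin_unlock_irqrestore(&iommu->register_lock, flags);
    }

In the file below, iommu_enable_translation() follows this shape with DMA_GCMD_TE / DMA_GSTS_TES, and iommu_set_root_entry() with DMA_GCMD_SRTP / DMA_GSTS_RTPS.
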
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * higher-level tables always set r/w; the last-level
200 * page table controls read/write
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
234 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
235 return;
237 spin_lock_irqsave(&iommu->register_lock, flag);
238 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
239 dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
241 /* Make sure hardware completes it */
242 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
243 !(val & DMA_GSTS_WBFS), val);
245 spin_unlock_irqrestore(&iommu->register_lock, flag);
246 }
248 /* return value determines whether we need a write buffer flush */
249 static int flush_context_reg(
250 void *_iommu,
251 u16 did, u16 source_id, u8 function_mask, u64 type,
252 int flush_non_present_entry)
253 {
254 struct iommu *iommu = (struct iommu *) _iommu;
255 u64 val = 0;
256 unsigned long flag;
258 /*
259 * In the non-present entry flush case, if hardware doesn't cache
260 * non-present entries we do nothing; if hardware does cache non-present
261 * entries, we flush entries of domain 0 (the domain id is used to cache
262 * any non-present entries)
263 */
264 if ( flush_non_present_entry )
265 {
266 if ( !cap_caching_mode(iommu->cap) )
267 return 1;
268 else
269 did = 0;
270 }
272 /* use register invalidation */
273 switch ( type )
274 {
275 case DMA_CCMD_GLOBAL_INVL:
276 val = DMA_CCMD_GLOBAL_INVL;
277 break;
278 case DMA_CCMD_DOMAIN_INVL:
279 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
280 break;
281 case DMA_CCMD_DEVICE_INVL:
282 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
283 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
284 break;
285 default:
286 BUG();
287 }
288 val |= DMA_CCMD_ICC;
290 spin_lock_irqsave(&iommu->register_lock, flag);
291 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
293 /* Make sure hardware completes it */
294 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
295 !(val & DMA_CCMD_ICC), val);
297 spin_unlock_irqrestore(&iommu->register_lock, flag);
298 /* flush context entry will implicitly flush write buffer */
299 return 0;
300 }
302 static int inline iommu_flush_context_global(
303 struct iommu *iommu, int flush_non_present_entry)
304 {
305 struct iommu_flush *flush = iommu_get_flush(iommu);
306 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
307 flush_non_present_entry);
308 }
310 static int inline iommu_flush_context_domain(
311 struct iommu *iommu, u16 did, int flush_non_present_entry)
312 {
313 struct iommu_flush *flush = iommu_get_flush(iommu);
314 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
315 flush_non_present_entry);
316 }
318 static int inline iommu_flush_context_device(
319 struct iommu *iommu, u16 did, u16 source_id,
320 u8 function_mask, int flush_non_present_entry)
321 {
322 struct iommu_flush *flush = iommu_get_flush(iommu);
323 return flush->context(iommu, did, source_id, function_mask,
324 DMA_CCMD_DEVICE_INVL,
325 flush_non_present_entry);
326 }
328 /* return value determines whether we need a write buffer flush */
329 static int flush_iotlb_reg(void *_iommu, u16 did,
330 u64 addr, unsigned int size_order, u64 type,
331 int flush_non_present_entry, int flush_dev_iotlb)
332 {
333 struct iommu *iommu = (struct iommu *) _iommu;
334 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
335 u64 val = 0, val_iva = 0;
336 unsigned long flag;
338 /*
339 * In the non-present entry flush case, if hardware doesn't cache
340 * non-present entries we do nothing; if hardware does cache non-present
341 * entries, we flush entries of domain 0 (the domain id is used to cache
342 * any non-present entries)
343 */
344 if ( flush_non_present_entry )
345 {
346 if ( !cap_caching_mode(iommu->cap) )
347 return 1;
348 else
349 did = 0;
350 }
352 /* use register invalidation */
353 switch ( type )
354 {
355 case DMA_TLB_GLOBAL_FLUSH:
356 /* global flush doesn't need set IVA_REG */
357 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
358 break;
359 case DMA_TLB_DSI_FLUSH:
360 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
361 break;
362 case DMA_TLB_PSI_FLUSH:
363 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
364 /* Note: always flush non-leaf currently */
365 val_iva = size_order | addr;
366 break;
367 default:
368 BUG();
369 }
370 /* Note: set drain read/write */
371 if ( cap_read_drain(iommu->cap) )
372 val |= DMA_TLB_READ_DRAIN;
373 if ( cap_write_drain(iommu->cap) )
374 val |= DMA_TLB_WRITE_DRAIN;
376 spin_lock_irqsave(&iommu->register_lock, flag);
377 /* Note: Only uses first TLB reg currently */
378 if ( val_iva )
379 dmar_writeq(iommu->reg, tlb_offset, val_iva);
380 dmar_writeq(iommu->reg, tlb_offset + 8, val);
382 /* Make sure hardware completes it */
383 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
384 !(val & DMA_TLB_IVT), val);
385 spin_unlock_irqrestore(&iommu->register_lock, flag);
387 /* check IOTLB invalidation granularity */
388 if ( DMA_TLB_IAIG(val) == 0 )
389 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
391 /* flush iotlb entry will implicitly flush write buffer */
392 return 0;
393 }
395 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
396 int flush_non_present_entry, int flush_dev_iotlb)
397 {
398 struct iommu_flush *flush = iommu_get_flush(iommu);
399 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
400 flush_non_present_entry, flush_dev_iotlb);
401 }
403 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
404 int flush_non_present_entry, int flush_dev_iotlb)
405 {
406 struct iommu_flush *flush = iommu_get_flush(iommu);
407 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
408 flush_non_present_entry, flush_dev_iotlb);
409 }
411 static int inline get_alignment(u64 base, unsigned int size)
412 {
413 int t = 0;
414 u64 end;
416 end = base + size - 1;
417 while ( base != end )
418 {
419 t++;
420 base >>= 1;
421 end >>= 1;
422 }
423 return t;
424 }
426 static int inline iommu_flush_iotlb_psi(
427 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
428 int flush_non_present_entry, int flush_dev_iotlb)
429 {
430 unsigned int align;
431 struct iommu_flush *flush = iommu_get_flush(iommu);
433 ASSERT(!(addr & (~PAGE_MASK_4K)));
434 ASSERT(pages > 0);
436 /* Fallback to domain selective flush if no PSI support */
437 if ( !cap_pgsel_inv(iommu->cap) )
438 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
440 /*
441 * PSI requires page size is 2 ^ x, and the base address is naturally
442 * aligned to the size
443 */
444 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
445 /* Fallback to domain selective flush if size is too big */
446 if ( align > cap_max_amask_val(iommu->cap) )
447 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
449 addr >>= PAGE_SHIFT_4K + align;
450 addr <<= PAGE_SHIFT_4K + align;
452 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
453 flush_non_present_entry, flush_dev_iotlb);
454 }
456 void iommu_flush_all(void)
457 {
458 struct acpi_drhd_unit *drhd;
459 struct iommu *iommu;
460 int flush_dev_iotlb;
462 flush_all_cache();
463 for_each_drhd_unit ( drhd )
464 {
465 iommu = drhd->iommu;
466 iommu_flush_context_global(iommu, 0);
467 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
468 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
469 }
470 }
472 /* clear one page's page table */
473 static void dma_pte_clear_one(struct domain *domain, u64 addr)
474 {
475 struct hvm_iommu *hd = domain_hvm_iommu(domain);
476 struct acpi_drhd_unit *drhd;
477 struct iommu *iommu;
478 struct dma_pte *page = NULL, *pte = NULL;
479 u64 pg_maddr;
480 int flush_dev_iotlb;
482 spin_lock(&hd->mapping_lock);
483 /* get last level pte */
484 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
485 if ( pg_maddr == 0 )
486 {
487 spin_unlock(&hd->mapping_lock);
488 return;
489 }
491 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
492 pte = page + address_level_offset(addr, 1);
494 if ( !dma_pte_present(*pte) )
495 {
496 spin_unlock(&hd->mapping_lock);
497 unmap_vtd_domain_page(page);
498 return;
499 }
501 dma_clear_pte(*pte);
502 spin_unlock(&hd->mapping_lock);
503 iommu_flush_cache_entry(pte);
505 /* No need for pcidevs_lock here, since that is done on device assign/deassign */
506 for_each_drhd_unit ( drhd )
507 {
508 iommu = drhd->iommu;
509 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
510 {
511 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
512 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
513 addr, 1, 0, flush_dev_iotlb) )
514 iommu_flush_write_buffer(iommu);
515 }
516 }
518 unmap_vtd_domain_page(page);
519 }
521 static void iommu_free_pagetable(u64 pt_maddr, int level)
522 {
523 int i;
524 struct dma_pte *pt_vaddr, *pte;
525 int next_level = level - 1;
527 if ( pt_maddr == 0 )
528 return;
530 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
532 for ( i = 0; i < PTE_NUM; i++ )
533 {
534 pte = &pt_vaddr[i];
535 if ( !dma_pte_present(*pte) )
536 continue;
538 if ( next_level >= 1 )
539 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
541 dma_clear_pte(*pte);
542 iommu_flush_cache_entry(pte);
543 }
545 unmap_vtd_domain_page(pt_vaddr);
546 free_pgtable_maddr(pt_maddr);
547 }
549 static int iommu_set_root_entry(struct iommu *iommu)
550 {
551 u32 sts;
552 unsigned long flags;
554 spin_lock(&iommu->lock);
556 if ( iommu->root_maddr == 0 )
557 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
558 if ( iommu->root_maddr == 0 )
559 {
560 spin_unlock(&iommu->lock);
561 return -ENOMEM;
562 }
564 spin_unlock(&iommu->lock);
565 spin_lock_irqsave(&iommu->register_lock, flags);
566 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
568 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
569 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
571 /* Make sure hardware completes it */
572 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
573 (sts & DMA_GSTS_RTPS), sts);
574 spin_unlock_irqrestore(&iommu->register_lock, flags);
576 return 0;
577 }
579 static void iommu_enable_translation(struct iommu *iommu)
580 {
581 u32 sts;
582 unsigned long flags;
584 dprintk(XENLOG_INFO VTDPREFIX,
585 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
586 spin_lock_irqsave(&iommu->register_lock, flags);
587 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
588 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
590 /* Make sure hardware completes it */
591 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
592 (sts & DMA_GSTS_TES), sts);
594 /* Disable PMRs when VT-d engine takes effect per spec definition */
595 disable_pmr(iommu);
596 spin_unlock_irqrestore(&iommu->register_lock, flags);
597 }
599 static void iommu_disable_translation(struct iommu *iommu)
600 {
601 u32 sts;
602 unsigned long flags;
604 spin_lock_irqsave(&iommu->register_lock, flags);
605 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
606 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
608 /* Make sure hardware completes it */
609 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
610 !(sts & DMA_GSTS_TES), sts);
611 spin_unlock_irqrestore(&iommu->register_lock, flags);
612 }
614 static struct iommu *vector_to_iommu[NR_VECTORS];
615 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
616 u8 fault_reason, u16 source_id, u64 addr)
617 {
618 dprintk(XENLOG_WARNING VTDPREFIX,
619 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
620 "iommu->reg = %p\n",
621 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
622 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
623 fault_reason, iommu->reg);
625 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
626 if ( fault_reason < 0x20 )
627 print_vtd_entries(iommu, (source_id >> 8),
628 (source_id & 0xff), (addr >> PAGE_SHIFT));
629 #endif
631 return 0;
632 }
634 static void iommu_fault_status(u32 fault_status)
635 {
636 if ( fault_status & DMA_FSTS_PFO )
637 dprintk(XENLOG_ERR VTDPREFIX,
638 "iommu_fault_status: Fault Overflow\n");
639 if ( fault_status & DMA_FSTS_PPF )
640 dprintk(XENLOG_ERR VTDPREFIX,
641 "iommu_fault_status: Primary Pending Fault\n");
642 if ( fault_status & DMA_FSTS_AFO )
643 dprintk(XENLOG_ERR VTDPREFIX,
644 "iommu_fault_status: Advanced Fault Overflow\n");
645 if ( fault_status & DMA_FSTS_APF )
646 dprintk(XENLOG_ERR VTDPREFIX,
647 "iommu_fault_status: Advanced Pending Fault\n");
648 if ( fault_status & DMA_FSTS_IQE )
649 dprintk(XENLOG_ERR VTDPREFIX,
650 "iommu_fault_status: Invalidation Queue Error\n");
651 if ( fault_status & DMA_FSTS_ICE )
652 dprintk(XENLOG_ERR VTDPREFIX,
653 "iommu_fault_status: Invalidation Completion Error\n");
654 if ( fault_status & DMA_FSTS_ITE )
655 dprintk(XENLOG_ERR VTDPREFIX,
656 "iommu_fault_status: Invalidation Time-out Error\n");
657 }
659 #define PRIMARY_FAULT_REG_LEN (16)
660 static void iommu_page_fault(int vector, void *dev_id,
661 struct cpu_user_regs *regs)
662 {
663 struct iommu *iommu = dev_id;
664 int reg, fault_index;
665 u32 fault_status;
666 unsigned long flags;
668 dprintk(XENLOG_WARNING VTDPREFIX,
669 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
671 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
673 iommu_fault_status(fault_status);
675 /* FIXME: ignore advanced fault log */
676 if ( !(fault_status & DMA_FSTS_PPF) )
677 goto clear_overflow;
679 fault_index = dma_fsts_fault_record_index(fault_status);
680 reg = cap_fault_reg_offset(iommu->cap);
681 while (1)
682 {
683 u8 fault_reason;
684 u16 source_id;
685 u32 data;
686 u64 guest_addr;
687 int type;
689 /* highest 32 bits */
690 spin_lock_irqsave(&iommu->register_lock, flags);
691 data = dmar_readl(iommu->reg, reg +
692 fault_index * PRIMARY_FAULT_REG_LEN + 12);
693 if ( !(data & DMA_FRCD_F) )
694 {
695 spin_unlock_irqrestore(&iommu->register_lock, flags);
696 break;
697 }
699 fault_reason = dma_frcd_fault_reason(data);
700 type = dma_frcd_type(data);
702 data = dmar_readl(iommu->reg, reg +
703 fault_index * PRIMARY_FAULT_REG_LEN + 8);
704 source_id = dma_frcd_source_id(data);
706 guest_addr = dmar_readq(iommu->reg, reg +
707 fault_index * PRIMARY_FAULT_REG_LEN);
708 guest_addr = dma_frcd_page_addr(guest_addr);
709 /* clear the fault */
710 dmar_writel(iommu->reg, reg +
711 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
712 spin_unlock_irqrestore(&iommu->register_lock, flags);
714 iommu_page_fault_do_one(iommu, type, fault_reason,
715 source_id, guest_addr);
717 fault_index++;
718 if ( fault_index > cap_num_fault_regs(iommu->cap) )
719 fault_index = 0;
720 }
721 clear_overflow:
722 /* clear primary fault overflow */
723 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
724 if ( fault_status & DMA_FSTS_PFO )
725 {
726 spin_lock_irqsave(&iommu->register_lock, flags);
727 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
728 spin_unlock_irqrestore(&iommu->register_lock, flags);
729 }
730 }
732 static void dma_msi_unmask(unsigned int vector)
733 {
734 struct iommu *iommu = vector_to_iommu[vector];
735 unsigned long flags;
737 /* unmask it */
738 spin_lock_irqsave(&iommu->register_lock, flags);
739 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
740 spin_unlock_irqrestore(&iommu->register_lock, flags);
741 }
743 static void dma_msi_mask(unsigned int vector)
744 {
745 unsigned long flags;
746 struct iommu *iommu = vector_to_iommu[vector];
748 /* mask it */
749 spin_lock_irqsave(&iommu->register_lock, flags);
750 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
751 spin_unlock_irqrestore(&iommu->register_lock, flags);
752 }
754 static unsigned int dma_msi_startup(unsigned int vector)
755 {
756 dma_msi_unmask(vector);
757 return 0;
758 }
760 static void dma_msi_end(unsigned int vector)
761 {
762 dma_msi_unmask(vector);
763 ack_APIC_irq();
764 }
766 static void dma_msi_data_init(struct iommu *iommu, int vector)
767 {
768 u32 msi_data = 0;
769 unsigned long flags;
771 /* Fixed, edge, assert mode. Follow MSI setting */
772 msi_data |= vector & 0xff;
773 msi_data |= 1 << 14;
775 spin_lock_irqsave(&iommu->register_lock, flags);
776 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
777 spin_unlock_irqrestore(&iommu->register_lock, flags);
778 }
780 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
781 {
782 u64 msi_address;
783 unsigned long flags;
785 /* Physical, dedicated cpu. Follow MSI setting */
786 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
787 msi_address |= MSI_PHYSICAL_MODE << 2;
788 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
789 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
791 spin_lock_irqsave(&iommu->register_lock, flags);
792 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
793 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
794 spin_unlock_irqrestore(&iommu->register_lock, flags);
795 }
797 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
798 {
799 struct iommu *iommu = vector_to_iommu[vector];
800 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
801 }
803 static struct hw_interrupt_type dma_msi_type = {
804 .typename = "DMA_MSI",
805 .startup = dma_msi_startup,
806 .shutdown = dma_msi_mask,
807 .enable = dma_msi_unmask,
808 .disable = dma_msi_mask,
809 .ack = dma_msi_mask,
810 .end = dma_msi_end,
811 .set_affinity = dma_msi_set_affinity,
812 };
814 static int iommu_set_interrupt(struct iommu *iommu)
815 {
816 int vector, ret;
818 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
819 if ( vector <= 0 )
820 {
821 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
822 return -EINVAL;
823 }
825 irq_desc[vector].handler = &dma_msi_type;
826 vector_to_iommu[vector] = iommu;
827 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
828 if ( ret )
829 {
830 irq_desc[vector].handler = &no_irq_type;
831 vector_to_iommu[vector] = NULL;
832 free_irq_vector(vector);
833 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
834 return ret;
835 }
837 /* Make sure that vector is never re-used. */
838 vector_irq[vector] = NEVER_ASSIGN_IRQ;
840 return vector;
841 }
843 static int iommu_alloc(struct acpi_drhd_unit *drhd)
844 {
845 struct iommu *iommu;
846 unsigned long sagaw;
847 int agaw;
849 if ( nr_iommus > MAX_IOMMUS )
850 {
851 gdprintk(XENLOG_ERR VTDPREFIX,
852 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
853 return -ENOMEM;
854 }
856 iommu = xmalloc(struct iommu);
857 if ( iommu == NULL )
858 return -ENOMEM;
859 memset(iommu, 0, sizeof(struct iommu));
861 iommu->vector = -1; /* No vector assigned yet. */
863 iommu->intel = alloc_intel_iommu();
864 if ( iommu->intel == NULL )
865 {
866 xfree(iommu);
867 return -ENOMEM;
868 }
870 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
871 iommu->index = nr_iommus++;
873 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
874 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
876 gdprintk(XENLOG_INFO VTDPREFIX,
877 "drhd->address = %"PRIx64"\n", drhd->address);
878 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
880 /* Calculate number of pagetable levels: between 2 and 4. */
881 sagaw = cap_sagaw(iommu->cap);
882 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
883 if ( test_bit(agaw, &sagaw) )
884 break;
885 if ( agaw < 0 )
886 {
887 gdprintk(XENLOG_ERR VTDPREFIX,
888 "IOMMU: unsupported sagaw %lx\n", sagaw);
889 xfree(iommu);
890 return -ENODEV;
891 }
892 iommu->nr_pt_levels = agaw_to_level(agaw);
894 if ( !ecap_coherent(iommu->ecap) )
895 iommus_incoherent = 1;
897 spin_lock_init(&iommu->lock);
898 spin_lock_init(&iommu->register_lock);
900 drhd->iommu = iommu;
901 return 0;
902 }
904 static void iommu_free(struct acpi_drhd_unit *drhd)
905 {
906 struct iommu *iommu = drhd->iommu;
908 if ( iommu == NULL )
909 return;
911 if ( iommu->root_maddr != 0 )
912 {
913 free_pgtable_maddr(iommu->root_maddr);
914 iommu->root_maddr = 0;
915 }
917 if ( iommu->reg )
918 iounmap(iommu->reg);
920 free_intel_iommu(iommu->intel);
921 release_irq_vector(iommu->vector);
922 xfree(iommu);
924 drhd->iommu = NULL;
925 }
927 #define guestwidth_to_adjustwidth(gaw) ({ \
928 int agaw, r = (gaw - 12) % 9; \
929 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
930 if ( agaw > 64 ) \
931 agaw = 64; \
932 agaw; })
934 static int intel_iommu_domain_init(struct domain *d)
935 {
936 struct hvm_iommu *hd = domain_hvm_iommu(d);
937 struct iommu *iommu = NULL;
938 struct acpi_drhd_unit *drhd;
940 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
941 iommu = drhd->iommu;
943 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
945 if ( d->domain_id == 0 )
946 {
947 /* Set up 1:1 page table for dom0 */
948 iommu_set_dom0_mapping(d);
950 setup_dom0_devices(d);
951 setup_dom0_rmrr(d);
953 iommu_flush_all();
955 for_each_drhd_unit ( drhd )
956 {
957 iommu = drhd->iommu;
958 iommu_enable_translation(iommu);
959 }
960 }
962 return 0;
963 }
965 static int domain_context_mapping_one(
966 struct domain *domain,
967 struct iommu *iommu,
968 u8 bus, u8 devfn)
969 {
970 struct hvm_iommu *hd = domain_hvm_iommu(domain);
971 struct context_entry *context, *context_entries;
972 u64 maddr, pgd_maddr;
973 struct pci_dev *pdev = NULL;
974 int agaw;
976 ASSERT(spin_is_locked(&pcidevs_lock));
977 spin_lock(&iommu->lock);
978 maddr = bus_to_context_maddr(iommu, bus);
979 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
980 context = &context_entries[devfn];
982 if ( context_present(*context) )
983 {
984 int res = 0;
986 pdev = pci_get_pdev(bus, devfn);
987 if (!pdev)
988 res = -ENODEV;
989 else if (pdev->domain != domain)
990 res = -EINVAL;
991 unmap_vtd_domain_page(context_entries);
992 spin_unlock(&iommu->lock);
993 return res;
994 }
996 if ( iommu_passthrough && (domain->domain_id == 0) )
997 {
998 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
999 agaw = level_to_agaw(iommu->nr_pt_levels);
1000 }
1001 else
1002 {
1003 spin_lock(&hd->mapping_lock);
1005 /* Ensure we have pagetables allocated down to leaf PTE. */
1006 if ( hd->pgd_maddr == 0 )
1007 {
1008 addr_to_dma_page_maddr(domain, 0, 1);
1009 if ( hd->pgd_maddr == 0 )
1010 {
1011 nomem:
1012 spin_unlock(&hd->mapping_lock);
1013 spin_unlock(&iommu->lock);
1014 unmap_vtd_domain_page(context_entries);
1015 return -ENOMEM;
1016 }
1017 }
1019 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1020 pgd_maddr = hd->pgd_maddr;
1021 for ( agaw = level_to_agaw(4);
1022 agaw != level_to_agaw(iommu->nr_pt_levels);
1023 agaw-- )
1024 {
1025 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1026 pgd_maddr = dma_pte_addr(*p);
1027 unmap_vtd_domain_page(p);
1028 if ( pgd_maddr == 0 )
1029 goto nomem;
1030 }
1032 context_set_address_root(*context, pgd_maddr);
1033 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1034 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1035 else
1036 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1038 spin_unlock(&hd->mapping_lock);
1039 }
1041 /*
1042 * domain_id 0 is not valid on Intel's IOMMU, force domain_id to
1043 * be 1 based as required by intel's iommu hw.
1044 */
1045 context_set_domain_id(context, domain);
1046 context_set_address_width(*context, agaw);
1047 context_set_fault_enable(*context);
1048 context_set_present(*context);
1049 iommu_flush_cache_entry(context);
1050 spin_unlock(&iommu->lock);
1052 /* Context entry was previously non-present (with domid 0). */
1053 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1054 DMA_CCMD_MASK_NOBIT, 1) )
1055 iommu_flush_write_buffer(iommu);
1056 else
1057 {
1058 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1059 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1060 }
1062 set_bit(iommu->index, &hd->iommu_bitmap);
1064 unmap_vtd_domain_page(context_entries);
1066 return 0;
1067 }
1069 #define PCI_BASE_CLASS_BRIDGE 0x06
1070 #define PCI_CLASS_BRIDGE_PCI 0x0604
1072 enum {
1073 DEV_TYPE_PCIe_ENDPOINT,
1074 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1075 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1076 DEV_TYPE_PCI,
1077 };
1079 int pdev_type(u8 bus, u8 devfn)
1081 u16 class_device;
1082 u16 status, creg;
1083 int pos;
1084 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1086 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1087 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1089 pos = pci_find_next_cap(bus, devfn,
1090 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1091 if ( !pos )
1092 return DEV_TYPE_PCI_BRIDGE;
1093 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1094 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1095 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1098 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1099 if ( !(status & PCI_STATUS_CAP_LIST) )
1100 return DEV_TYPE_PCI;
1102 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1103 return DEV_TYPE_PCIe_ENDPOINT;
1105 return DEV_TYPE_PCI;
1108 #define MAX_BUSES 256
1109 static DEFINE_SPINLOCK(bus2bridge_lock);
1110 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1112 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1114 int cnt = 0;
1115 *secbus = *bus;
1117 ASSERT(spin_is_locked(&bus2bridge_lock));
1118 if ( !bus2bridge[*bus].map )
1119 return 0;
1121 while ( bus2bridge[*bus].map )
1123 *secbus = *bus;
1124 *devfn = bus2bridge[*bus].devfn;
1125 *bus = bus2bridge[*bus].bus;
1126 if ( cnt++ >= MAX_BUSES )
1127 return 0;
1130 return 1;
1133 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1135 int ret = 0;
1137 if ( *bus == 0 )
1138 /* assume integrated PCI devices in RC have valid requester-id */
1139 return 1;
1141 spin_lock(&bus2bridge_lock);
1142 ret = _find_pcie_endpoint(bus, devfn, secbus);
1143 spin_unlock(&bus2bridge_lock);
1145 return ret;
1148 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1150 struct acpi_drhd_unit *drhd;
1151 int ret = 0;
1152 u16 sec_bus, sub_bus;
1153 u32 type;
1154 u8 secbus, secdevfn;
1155 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1157 if ( pdev == NULL )
1158 {
1159 /* We can reach here via setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1160 * -> domain_context_mapping().
1161 * If a user enables VT-d but disables USB (which usually needs an
1162 * RMRR) in the BIOS, we can't discover the BDF of the USB controller in
1163 * setup_dom0_devices(), but the ACPI RMRR structures may still contain
1164 * the BDF, so pci_get_pdev() returns NULL here.
1165 */
1166 gdprintk(XENLOG_WARNING VTDPREFIX,
1167 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1168 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1169 return 0;
1170 }
1172 drhd = acpi_find_matched_drhd_unit(pdev);
1173 if ( !drhd )
1174 return -ENODEV;
1176 ASSERT(spin_is_locked(&pcidevs_lock));
1178 type = pdev_type(bus, devfn);
1179 switch ( type )
1181 case DEV_TYPE_PCIe_BRIDGE:
1182 break;
1184 case DEV_TYPE_PCI_BRIDGE:
1185 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1186 PCI_SECONDARY_BUS);
1187 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1188 PCI_SUBORDINATE_BUS);
1190 spin_lock(&bus2bridge_lock);
1191 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1193 bus2bridge[sec_bus].map = 1;
1194 bus2bridge[sec_bus].bus = bus;
1195 bus2bridge[sec_bus].devfn = devfn;
1197 spin_unlock(&bus2bridge_lock);
1198 break;
1200 case DEV_TYPE_PCIe_ENDPOINT:
1201 gdprintk(XENLOG_INFO VTDPREFIX,
1202 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1203 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1204 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1205 break;
1207 case DEV_TYPE_PCI:
1208 gdprintk(XENLOG_INFO VTDPREFIX,
1209 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1210 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1212 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1213 if ( ret )
1214 break;
1216 secbus = bus;
1217 secdevfn = devfn;
1218 /* dependent devices mapping */
1219 while ( bus2bridge[bus].map )
1221 secbus = bus;
1222 secdevfn = devfn;
1223 devfn = bus2bridge[bus].devfn;
1224 bus = bus2bridge[bus].bus;
1225 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1226 if ( ret )
1227 return ret;
1230 if ( (secbus != bus) && (secdevfn != 0) )
1231 /*
1232 * The source-id for transactions on non-PCIe buses seems
1233 * to originate from devfn=0 on the secondary bus behind
1234 * the bridge. Map that id as well. The id to use in
1236 * these scenarios is not particularly well documented
1236 * anywhere.
1237 */
1238 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1239 break;
1241 default:
1242 gdprintk(XENLOG_ERR VTDPREFIX,
1243 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1244 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1245 ret = -EINVAL;
1246 break;
1249 return ret;
1252 static int domain_context_unmap_one(
1253 struct domain *domain,
1254 struct iommu *iommu,
1255 u8 bus, u8 devfn)
1257 struct context_entry *context, *context_entries;
1258 u64 maddr;
1260 ASSERT(spin_is_locked(&pcidevs_lock));
1261 spin_lock(&iommu->lock);
1263 maddr = bus_to_context_maddr(iommu, bus);
1264 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1265 context = &context_entries[devfn];
1267 if ( !context_present(*context) )
1268 {
1269 spin_unlock(&iommu->lock);
1270 unmap_vtd_domain_page(context_entries);
1271 return 0;
1272 }
1274 context_clear_present(*context);
1275 context_clear_entry(*context);
1276 iommu_flush_cache_entry(context);
1278 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1279 (((u16)bus) << 8) | devfn,
1280 DMA_CCMD_MASK_NOBIT, 0) )
1281 iommu_flush_write_buffer(iommu);
1282 else
1283 {
1284 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1285 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1286 }
1288 spin_unlock(&iommu->lock);
1289 unmap_vtd_domain_page(context_entries);
1291 return 0;
1294 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1296 struct acpi_drhd_unit *drhd;
1297 int ret = 0;
1298 u32 type;
1299 u8 secbus, secdevfn;
1300 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1302 BUG_ON(!pdev);
1304 drhd = acpi_find_matched_drhd_unit(pdev);
1305 if ( !drhd )
1306 return -ENODEV;
1308 type = pdev_type(bus, devfn);
1309 switch ( type )
1311 case DEV_TYPE_PCIe_BRIDGE:
1312 case DEV_TYPE_PCI_BRIDGE:
1313 break;
1315 case DEV_TYPE_PCIe_ENDPOINT:
1316 gdprintk(XENLOG_INFO VTDPREFIX,
1317 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1318 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1319 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1320 break;
1322 case DEV_TYPE_PCI:
1323 gdprintk(XENLOG_INFO VTDPREFIX,
1324 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1325 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1326 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1327 if ( ret )
1328 break;
1330 secbus = bus;
1331 secdevfn = devfn;
1332 /* dependent devices unmapping */
1333 while ( bus2bridge[bus].map )
1335 secbus = bus;
1336 secdevfn = devfn;
1337 devfn = bus2bridge[bus].devfn;
1338 bus = bus2bridge[bus].bus;
1339 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1340 if ( ret )
1341 return ret;
1344 if ( (secbus != bus) && (secdevfn != 0) )
1345 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1346 break;
1348 default:
1349 gdprintk(XENLOG_ERR VTDPREFIX,
1350 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1351 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1352 ret = -EINVAL;
1353 break;
1356 return ret;
1359 static int reassign_device_ownership(
1360 struct domain *source,
1361 struct domain *target,
1362 u8 bus, u8 devfn)
1364 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1365 struct pci_dev *pdev;
1366 struct acpi_drhd_unit *drhd;
1367 struct iommu *pdev_iommu;
1368 int ret, found = 0;
1370 ASSERT(spin_is_locked(&pcidevs_lock));
1371 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1373 if (!pdev)
1374 return -ENODEV;
1376 drhd = acpi_find_matched_drhd_unit(pdev);
1377 pdev_iommu = drhd->iommu;
1378 domain_context_unmap(source, bus, devfn);
1380 ret = domain_context_mapping(target, bus, devfn);
1381 if ( ret )
1382 return ret;
1384 list_move(&pdev->domain_list, &target->arch.pdev_list);
1385 pdev->domain = target;
1387 for_each_pdev ( source, pdev )
1389 drhd = acpi_find_matched_drhd_unit(pdev);
1390 if ( drhd->iommu == pdev_iommu )
1392 found = 1;
1393 break;
1397 if ( !found )
1398 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1400 return ret;
1403 void iommu_domain_teardown(struct domain *d)
1405 struct hvm_iommu *hd = domain_hvm_iommu(d);
1407 if ( list_empty(&acpi_drhd_units) )
1408 return;
1410 spin_lock(&hd->mapping_lock);
1411 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1412 hd->pgd_maddr = 0;
1413 spin_unlock(&hd->mapping_lock);
1415 iommu_domid_release(d);
1418 int intel_iommu_map_page(
1419 struct domain *d, unsigned long gfn, unsigned long mfn)
1421 struct hvm_iommu *hd = domain_hvm_iommu(d);
1422 struct acpi_drhd_unit *drhd;
1423 struct iommu *iommu;
1424 struct dma_pte *page = NULL, *pte = NULL;
1425 u64 pg_maddr;
1426 int pte_present;
1427 int flush_dev_iotlb;
1429 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1430 iommu = drhd->iommu;
1432 /* do nothing if dom0 and iommu supports pass thru */
1433 if ( iommu_passthrough && (d->domain_id == 0) )
1434 return 0;
1436 spin_lock(&hd->mapping_lock);
1438 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1439 if ( pg_maddr == 0 )
1440 {
1441 spin_unlock(&hd->mapping_lock);
1442 return -ENOMEM;
1443 }
1444 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1445 pte = page + (gfn & LEVEL_MASK);
1446 pte_present = dma_pte_present(*pte);
1447 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1448 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1450 /* Set the SNP on leaf page table if Snoop Control available */
1451 if ( iommu_snoop )
1452 dma_set_pte_snp(*pte);
1454 iommu_flush_cache_entry(pte);
1455 spin_unlock(&hd->mapping_lock);
1456 unmap_vtd_domain_page(page);
1458 /*
1459 * No need for pcidevs_lock here because we flush
1460 * when a device is assigned/deassigned
1461 */
1462 for_each_drhd_unit ( drhd )
1464 iommu = drhd->iommu;
1466 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1467 continue;
1469 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1470 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1471 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1472 !pte_present, flush_dev_iotlb) )
1473 iommu_flush_write_buffer(iommu);
1476 return 0;
1479 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1481 struct acpi_drhd_unit *drhd;
1482 struct iommu *iommu;
1484 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1485 iommu = drhd->iommu;
1487 /* do nothing if dom0 and iommu supports pass thru */
1488 if ( iommu_passthrough && (d->domain_id == 0) )
1489 return 0;
1491 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1493 return 0;
1496 static int iommu_prepare_rmrr_dev(struct domain *d,
1497 struct acpi_rmrr_unit *rmrr,
1498 u8 bus, u8 devfn)
1500 int ret = 0;
1501 u64 base, end;
1502 unsigned long base_pfn, end_pfn;
1504 ASSERT(spin_is_locked(&pcidevs_lock));
1505 ASSERT(rmrr->base_address < rmrr->end_address);
1507 base = rmrr->base_address & PAGE_MASK_4K;
1508 base_pfn = base >> PAGE_SHIFT_4K;
1509 end = PAGE_ALIGN_4K(rmrr->end_address);
1510 end_pfn = end >> PAGE_SHIFT_4K;
1512 while ( base_pfn < end_pfn )
1514 intel_iommu_map_page(d, base_pfn, base_pfn);
1515 base_pfn++;
1518 ret = domain_context_mapping(d, bus, devfn);
1520 return ret;
1523 static int intel_iommu_add_device(struct pci_dev *pdev)
1525 struct acpi_rmrr_unit *rmrr;
1526 u16 bdf;
1527 int ret, i;
1529 ASSERT(spin_is_locked(&pcidevs_lock));
1531 if ( !pdev->domain )
1532 return -EINVAL;
1534 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1535 if ( ret )
1537 gdprintk(XENLOG_ERR VTDPREFIX,
1538 "intel_iommu_add_device: context mapping failed\n");
1539 return ret;
1542 for_each_rmrr_device ( rmrr, bdf, i )
1544 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1546 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1547 pdev->bus, pdev->devfn);
1548 if ( ret )
1549 gdprintk(XENLOG_ERR VTDPREFIX,
1550 "intel_iommu_add_device: RMRR mapping failed\n");
1551 break;
1555 return ret;
1558 static int intel_iommu_remove_device(struct pci_dev *pdev)
1560 struct acpi_rmrr_unit *rmrr;
1561 u16 bdf;
1562 int i;
1564 if ( !pdev->domain )
1565 return -EINVAL;
1567 /* If the device belongs to dom0, and it has RMRR, don't remove it
1568 * from dom0, because BIOS may use RMRR at booting time.
1569 */
1570 if ( pdev->domain->domain_id == 0 )
1572 for_each_rmrr_device ( rmrr, bdf, i )
1574 if ( PCI_BUS(bdf) == pdev->bus &&
1575 PCI_DEVFN2(bdf) == pdev->devfn )
1576 return 0;
1580 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1583 static void setup_dom0_devices(struct domain *d)
1585 struct hvm_iommu *hd;
1586 struct pci_dev *pdev;
1587 int bus, dev, func;
1588 u32 l;
1590 hd = domain_hvm_iommu(d);
1592 spin_lock(&pcidevs_lock);
1593 for ( bus = 0; bus < 256; bus++ )
1595 for ( dev = 0; dev < 32; dev++ )
1597 for ( func = 0; func < 8; func++ )
1599 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1600 /* some broken boards return 0 or ~0 if a slot is empty: */
1601 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1602 (l == 0x0000ffff) || (l == 0xffff0000) )
1603 continue;
1605 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1606 pdev->domain = d;
1607 list_add(&pdev->domain_list, &d->arch.pdev_list);
1608 domain_context_mapping(d, pdev->bus, pdev->devfn);
1609 if ( ats_device(0, pdev->bus, pdev->devfn) )
1610 enable_ats_device(0, pdev->bus, pdev->devfn);
1614 spin_unlock(&pcidevs_lock);
1617 void clear_fault_bits(struct iommu *iommu)
1619 u64 val;
1621 val = dmar_readq(
1622 iommu->reg,
1623 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1624 dmar_writeq(
1625 iommu->reg,
1626 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1627 val);
1628 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1631 static int init_vtd_hw(void)
1633 struct acpi_drhd_unit *drhd;
1634 struct iommu *iommu;
1635 struct iommu_flush *flush = NULL;
1636 int vector;
1637 int ret;
1639 for_each_drhd_unit ( drhd )
1641 iommu = drhd->iommu;
1642 ret = iommu_set_root_entry(iommu);
1643 if ( ret )
1645 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1646 return -EIO;
1649 if ( iommu->vector < 0 )
1651 vector = iommu_set_interrupt(iommu);
1652 if ( vector < 0 )
1654 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1655 return vector;
1657 iommu->vector = vector;
1659 dma_msi_data_init(iommu, iommu->vector);
1660 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1661 clear_fault_bits(iommu);
1662 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1664 /* initialize flush functions */
1665 flush = iommu_get_flush(iommu);
1666 flush->context = flush_context_reg;
1667 flush->iotlb = flush_iotlb_reg;
1670 if ( iommu_qinval )
1672 for_each_drhd_unit ( drhd )
1674 iommu = drhd->iommu;
1675 if ( enable_qinval(iommu) != 0 )
1677 dprintk(XENLOG_INFO VTDPREFIX,
1678 "Failed to enable Queued Invalidation!\n");
1679 break;
1684 if ( iommu_intremap )
1686 for_each_drhd_unit ( drhd )
1688 iommu = drhd->iommu;
1689 if ( enable_intremap(iommu) != 0 )
1691 dprintk(XENLOG_INFO VTDPREFIX,
1692 "Failed to enable Interrupt Remapping!\n");
1693 break;
1698 return 0;
1701 static void setup_dom0_rmrr(struct domain *d)
1703 struct acpi_rmrr_unit *rmrr;
1704 u16 bdf;
1705 int ret, i;
1707 spin_lock(&pcidevs_lock);
1708 for_each_rmrr_device ( rmrr, bdf, i )
1709 {
1710 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1711 if ( ret )
1712 gdprintk(XENLOG_ERR VTDPREFIX,
1713 "IOMMU: mapping reserved region failed\n");
1714 }
1715 spin_unlock(&pcidevs_lock);
1718 static void platform_quirks(void)
1720 u32 id;
1722 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1723 id = pci_conf_read32(0, 0, 0, 0);
1724 if ( id == 0x2a408086 )
1725 {
1726 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1727 rwbf_quirk = 1;
1728 }
1729 }
1731 int intel_vtd_setup(void)
1733 struct acpi_drhd_unit *drhd;
1734 struct iommu *iommu;
1736 if ( !iommu_enabled )
1737 return -ENODEV;
1739 platform_quirks();
1741 spin_lock_init(&domid_bitmap_lock);
1742 clflush_size = get_cache_line_size();
1744 /* We enable the following features only if they are supported by all VT-d
1745 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1746 * Interrupt Remapping.
1747 */
1748 for_each_drhd_unit ( drhd )
1750 if ( iommu_alloc(drhd) != 0 )
1751 goto error;
1753 iommu = drhd->iommu;
1755 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1756 iommu_snoop = 0;
1758 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1759 iommu_passthrough = 0;
1761 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1762 iommu_qinval = 0;
1764 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1765 iommu_intremap = 0;
1768 if ( !iommu_qinval && iommu_intremap )
1770 iommu_intremap = 0;
1771 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1772 "since Queued Invalidation isn't supported or enabled.\n");
1775 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1776 P(iommu_snoop, "Snoop Control");
1777 P(iommu_passthrough, "DMA Passthrough");
1778 P(iommu_qinval, "Queued Invalidation");
1779 P(iommu_intremap, "Interrupt Remapping");
1780 #undef P
1782 /* Allocate IO page directory page for the domain. */
1783 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1784 iommu = drhd->iommu;
1786 /* Allocate domain id bitmap, and set bit 0 as reserved */
1787 domid_bitmap_size = cap_ndoms(iommu->cap);
1788 domid_bitmap = xmalloc_array(unsigned long,
1789 BITS_TO_LONGS(domid_bitmap_size));
1790 if ( domid_bitmap == NULL )
1791 goto error;
1792 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1793 set_bit(0, domid_bitmap);
1795 if ( init_vtd_hw() )
1796 goto error;
1798 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1800 return 0;
1802 error:
1803 for_each_drhd_unit ( drhd )
1804 iommu_free(drhd);
1805 iommu_enabled = 0;
1806 iommu_snoop = 0;
1807 iommu_passthrough = 0;
1808 iommu_qinval = 0;
1809 iommu_intremap = 0;
1810 return -ENOMEM;
1813 /*
1814 * If the device isn't owned by dom0, it means it already
1815 * has been assigned to another domain, or it does not exist.
1816 */
1817 int device_assigned(u8 bus, u8 devfn)
1819 struct pci_dev *pdev;
1821 spin_lock(&pcidevs_lock);
1822 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1823 if (!pdev)
1824 {
1825 spin_unlock(&pcidevs_lock);
1826 return -1;
1827 }
1829 spin_unlock(&pcidevs_lock);
1830 return 0;
1833 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1835 struct acpi_rmrr_unit *rmrr;
1836 int ret = 0, i;
1837 struct pci_dev *pdev;
1838 u16 bdf;
1840 if ( list_empty(&acpi_drhd_units) )
1841 return -ENODEV;
1843 ASSERT(spin_is_locked(&pcidevs_lock));
1844 pdev = pci_get_pdev(bus, devfn);
1845 if (!pdev)
1846 return -ENODEV;
1848 if (pdev->domain != dom0)
1849 {
1850 gdprintk(XENLOG_ERR VTDPREFIX,
1851 "IOMMU: assign a assigned device\n");
1852 return -EBUSY;
1853 }
1855 ret = reassign_device_ownership(dom0, d, bus, devfn);
1856 if ( ret )
1857 goto done;
1859 /* Setup rmrr identity mapping */
1860 for_each_rmrr_device( rmrr, bdf, i )
1862 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1864 /* FIXME: Because USB RMRR conflicts with guest bios region,
1865 * ignore USB RMRR temporarily.
1866 */
1867 if ( is_usb_device(bus, devfn) )
1869 ret = 0;
1870 goto done;
1873 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1874 if ( ret )
1875 gdprintk(XENLOG_ERR VTDPREFIX,
1876 "IOMMU: mapping reserved region failed\n");
1877 goto done;
1881 done:
1882 return ret;
1885 static int intel_iommu_group_id(u8 bus, u8 devfn)
1887 u8 secbus;
1888 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1889 return PCI_BDF2(bus, devfn);
1890 else
1891 return -1;
1894 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1895 void iommu_suspend(void)
1897 struct acpi_drhd_unit *drhd;
1898 struct iommu *iommu;
1899 u32 i;
1901 if ( !iommu_enabled )
1902 return;
1904 iommu_flush_all();
1906 for_each_drhd_unit ( drhd )
1908 iommu = drhd->iommu;
1909 i = iommu->index;
1911 iommu_state[i][DMAR_FECTL_REG] =
1912 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1913 iommu_state[i][DMAR_FEDATA_REG] =
1914 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1915 iommu_state[i][DMAR_FEADDR_REG] =
1916 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1917 iommu_state[i][DMAR_FEUADDR_REG] =
1918 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1920 iommu_disable_translation(iommu);
1922 if ( iommu_intremap )
1923 disable_intremap(iommu);
1925 if ( iommu_qinval )
1926 disable_qinval(iommu);
1930 void iommu_resume(void)
1932 struct acpi_drhd_unit *drhd;
1933 struct iommu *iommu;
1934 struct iommu_flush *flush;
1935 u32 i;
1937 if ( !iommu_enabled )
1938 return;
1940 /* Re-initialize the register-based flush functions.
1941 * In iommu_flush_all(), we invoke iommu_flush_{context,iotlb}_global(),
1942 * but at this point, on hosts that support QI (Queued Invalidation), QI
1943 * hasn't been re-enabled yet, so for now let's use the register-based
1944 * invalidation method before invoking init_vtd_hw().
1945 */
1946 if ( iommu_qinval )
1948 for_each_drhd_unit ( drhd )
1950 iommu = drhd->iommu;
1951 flush = iommu_get_flush(iommu);
1952 flush->context = flush_context_reg;
1953 flush->iotlb = flush_iotlb_reg;
1957 /* Not sure whether the flush operation is required to meet iommu
1958 * specification. Note that BIOS also executes in S3 resume and iommu may
1959 * be touched again, so let us do the flush operation for safety.
1960 */
1961 iommu_flush_all();
1963 if ( init_vtd_hw() != 0 && force_iommu )
1964 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1966 for_each_drhd_unit ( drhd )
1968 iommu = drhd->iommu;
1969 i = iommu->index;
1971 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1972 (u32) iommu_state[i][DMAR_FECTL_REG]);
1973 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1974 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1975 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1976 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1977 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1978 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1980 iommu_enable_translation(iommu);
1984 struct iommu_ops intel_iommu_ops = {
1985 .init = intel_iommu_domain_init,
1986 .add_device = intel_iommu_add_device,
1987 .remove_device = intel_iommu_remove_device,
1988 .assign_device = intel_iommu_assign_device,
1989 .teardown = iommu_domain_teardown,
1990 .map_page = intel_iommu_map_page,
1991 .unmap_page = intel_iommu_unmap_page,
1992 .reassign_device = reassign_device_ownership,
1993 .get_device_group_id = intel_iommu_group_id,
1994 .update_ire_from_apic = io_apic_write_remap_rte,
1995 .update_ire_from_msi = msi_msg_write_remap_rte,
1996 };
1998 /*
1999 * Local variables:
2000 * mode: C
2001 * c-set-style: "BSD"
2002 * c-basic-offset: 4
2003 * tab-width: 4
2004 * indent-tabs-mode: nil
2005 * End:
2006 */