ia64/xen-unstable

view xen/drivers/passthrough/vtd/iommu.c @ 19697:42fe00c6f8b4

Enable pci mmcfg and ATS for x86_64

This patch enables PCI MMCONFIG in xen and turns on hooks for ATS.

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jun 02 11:49:34 2009 +0100 (2009-06-02)
parents f3bed18decfc
children 931dbe86e5f3
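In the code below, the ATS hook shows up as the flush_dev_iotlb flag threaded through the IOTLB flush helpers, for example:

    flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
    iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);

as the CONTEXT_TT_DEV_IOTLB translation type selected when ats_enabled and ecap_dev_iotlb(iommu->ecap) are both set, and as the ats_device()/enable_ats_device() calls made while setup_dom0_devices() enumerates dom0's PCI devices.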
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
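/*
 * For reference: context_set_domain_id() above stores the allocated id in
 * bits DID_HIGH_OFFSET .. DID_HIGH_OFFSET + DID_FIELD_WIDTH - 1 (i.e.
 * bits 8..23) of the upper qword of the context entry.  A rough sketch of
 * reading the id back (illustrative only; no such helper exists in this
 * file) would be:
 *
 *     did = (context->hi >> DID_HIGH_OFFSET) &
 *               ((1UL << DID_FIELD_WIDTH) - 1);
 */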
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
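/*
 * A quick worked example of the helpers above: on IOMMUs that are not
 * cache coherent (iommus_incoherent set), every updated descriptor must be
 * flushed out of the CPU caches.  Assuming a typical clflush_size of 64
 * bytes, iommu_flush_cache_page(vaddr, 1) issues 4096 / 64 = 64
 * cacheline_flush() calls, while iommu_flush_cache_entry() only touches
 * the single cache line holding the 8-byte entry.
 */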
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * Higher-level tables always set read/write; only the last-level
200 * page table entries control the actual read/write permissions.
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
233 s_time_t start_time;
235 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
236 return;
237 val = iommu->gcmd | DMA_GCMD_WBF;
239 spin_lock_irqsave(&iommu->register_lock, flag);
240 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
242 /* Make sure the hardware completes it */
243 start_time = NOW();
244 for ( ; ; )
245 {
246 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
247 if ( !(val & DMA_GSTS_WBFS) )
248 break;
249 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
250 panic("%s: DMAR hardware is malfunctioning,"
251 " please disable IOMMU\n", __func__);
252 cpu_relax();
253 }
254 spin_unlock_irqrestore(&iommu->register_lock, flag);
255 }
257 /* return value determines whether we need a write-buffer flush */
258 static int flush_context_reg(
259 void *_iommu,
260 u16 did, u16 source_id, u8 function_mask, u64 type,
261 int flush_non_present_entry)
262 {
263 struct iommu *iommu = (struct iommu *) _iommu;
264 u64 val = 0;
265 unsigned long flag;
266 s_time_t start_time;
268 /*
269 * In the non-present entry flush case: if the hardware does not cache
270 * non-present entries, there is nothing to do; if it does, flush the
271 * entries of domain 0 (the domain id under which any non-present
272 * entries are cached).
273 */
274 if ( flush_non_present_entry )
275 {
276 if ( !cap_caching_mode(iommu->cap) )
277 return 1;
278 else
279 did = 0;
280 }
282 /* use register invalidation */
283 switch ( type )
284 {
285 case DMA_CCMD_GLOBAL_INVL:
286 val = DMA_CCMD_GLOBAL_INVL;
287 break;
288 case DMA_CCMD_DOMAIN_INVL:
289 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
290 break;
291 case DMA_CCMD_DEVICE_INVL:
292 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
293 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
294 break;
295 default:
296 BUG();
297 }
298 val |= DMA_CCMD_ICC;
300 spin_lock_irqsave(&iommu->register_lock, flag);
301 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
303 /* Make sure the hardware completes it */
304 start_time = NOW();
305 for ( ; ; )
306 {
307 val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
308 if ( !(val & DMA_CCMD_ICC) )
309 break;
310 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
311 panic("%s: DMAR hardware is malfunctioning,"
312 " please disable IOMMU\n", __func__);
313 cpu_relax();
314 }
315 spin_unlock_irqrestore(&iommu->register_lock, flag);
316 /* flushing the context entry implicitly flushes the write buffer */
317 return 0;
318 }
320 static int inline iommu_flush_context_global(
321 struct iommu *iommu, int flush_non_present_entry)
322 {
323 struct iommu_flush *flush = iommu_get_flush(iommu);
324 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
325 flush_non_present_entry);
326 }
328 static int inline iommu_flush_context_domain(
329 struct iommu *iommu, u16 did, int flush_non_present_entry)
330 {
331 struct iommu_flush *flush = iommu_get_flush(iommu);
332 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
333 flush_non_present_entry);
334 }
336 static int inline iommu_flush_context_device(
337 struct iommu *iommu, u16 did, u16 source_id,
338 u8 function_mask, int flush_non_present_entry)
339 {
340 struct iommu_flush *flush = iommu_get_flush(iommu);
341 return flush->context(iommu, did, source_id, function_mask,
342 DMA_CCMD_DEVICE_INVL,
343 flush_non_present_entry);
344 }
346 /* return value determines whether we need a write-buffer flush */
347 static int flush_iotlb_reg(void *_iommu, u16 did,
348 u64 addr, unsigned int size_order, u64 type,
349 int flush_non_present_entry, int flush_dev_iotlb)
350 {
351 struct iommu *iommu = (struct iommu *) _iommu;
352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
353 u64 val = 0, val_iva = 0;
354 unsigned long flag;
355 s_time_t start_time;
357 /*
358 * In the non-present entry flush case: if the hardware does not cache
359 * non-present entries, there is nothing to do; if it does, flush the
360 * entries of domain 0 (the domain id under which any non-present
361 * entries are cached).
362 */
363 if ( flush_non_present_entry )
364 {
365 if ( !cap_caching_mode(iommu->cap) )
366 return 1;
367 else
368 did = 0;
369 }
371 /* use register invalidation */
372 switch ( type )
373 {
374 case DMA_TLB_GLOBAL_FLUSH:
375 /* a global flush does not need to set IVA_REG */
376 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
377 break;
378 case DMA_TLB_DSI_FLUSH:
379 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
380 break;
381 case DMA_TLB_PSI_FLUSH:
382 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
383 /* Note: always flush non-leaf currently */
384 val_iva = size_order | addr;
385 break;
386 default:
387 BUG();
388 }
389 /* Note: set drain read/write */
390 if ( cap_read_drain(iommu->cap) )
391 val |= DMA_TLB_READ_DRAIN;
392 if ( cap_write_drain(iommu->cap) )
393 val |= DMA_TLB_WRITE_DRAIN;
395 spin_lock_irqsave(&iommu->register_lock, flag);
396 /* Note: Only uses first TLB reg currently */
397 if ( val_iva )
398 dmar_writeq(iommu->reg, tlb_offset, val_iva);
399 dmar_writeq(iommu->reg, tlb_offset + 8, val);
401 /* Make sure the hardware completes it */
402 start_time = NOW();
403 for ( ; ; )
404 {
405 val = dmar_readq(iommu->reg, tlb_offset + 8);
406 if ( !(val & DMA_TLB_IVT) )
407 break;
408 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
409 panic("%s: DMAR hardware is malfunctioning,"
410 " please disable IOMMU\n", __func__);
411 cpu_relax();
412 }
413 spin_unlock_irqrestore(&iommu->register_lock, flag);
415 /* check IOTLB invalidation granularity */
416 if ( DMA_TLB_IAIG(val) == 0 )
417 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
419 /* flushing the IOTLB entry implicitly flushes the write buffer */
420 return 0;
421 }
423 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
424 int flush_non_present_entry, int flush_dev_iotlb)
425 {
426 struct iommu_flush *flush = iommu_get_flush(iommu);
427 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
428 flush_non_present_entry, flush_dev_iotlb);
429 }
431 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
432 int flush_non_present_entry, int flush_dev_iotlb)
433 {
434 struct iommu_flush *flush = iommu_get_flush(iommu);
435 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
436 flush_non_present_entry, flush_dev_iotlb);
437 }
439 static int inline get_alignment(u64 base, unsigned int size)
440 {
441 int t = 0;
442 u64 end;
444 end = base + size - 1;
445 while ( base != end )
446 {
447 t++;
448 base >>= 1;
449 end >>= 1;
450 }
451 return t;
452 }
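/*
 * A short worked example of get_alignment(), as used by the PSI path
 * below: a 2-page flush starting at page frame 0x3 spans frames 0x3..0x4,
 * and the smallest naturally aligned power-of-two block covering both is
 * 8 pages, so get_alignment(0x3, 2) returns 3 (an order of 2^3 pages).
 * iommu_flush_iotlb_psi() falls back to a domain-selective flush whenever
 * this order exceeds cap_max_amask_val(iommu->cap).
 */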
454 static int inline iommu_flush_iotlb_psi(
455 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
456 int flush_non_present_entry, int flush_dev_iotlb)
457 {
458 unsigned int align;
459 struct iommu_flush *flush = iommu_get_flush(iommu);
461 ASSERT(!(addr & (~PAGE_MASK_4K)));
462 ASSERT(pages > 0);
464 /* Fallback to domain selective flush if no PSI support */
465 if ( !cap_pgsel_inv(iommu->cap) )
466 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
468 /*
469 * PSI requires the region size to be a power of two (2^x pages) and
470 * the base address to be naturally aligned to that size.
471 */
472 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
473 /* Fallback to domain selective flush if size is too big */
474 if ( align > cap_max_amask_val(iommu->cap) )
475 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
477 addr >>= PAGE_SHIFT_4K + align;
478 addr <<= PAGE_SHIFT_4K + align;
480 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
481 flush_non_present_entry, flush_dev_iotlb);
482 }
484 void iommu_flush_all(void)
485 {
486 struct acpi_drhd_unit *drhd;
487 struct iommu *iommu;
488 int flush_dev_iotlb;
490 flush_all_cache();
491 for_each_drhd_unit ( drhd )
492 {
493 iommu = drhd->iommu;
494 iommu_flush_context_global(iommu, 0);
495 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
496 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
497 }
498 }
500 /* clear one page's page table */
501 static void dma_pte_clear_one(struct domain *domain, u64 addr)
502 {
503 struct hvm_iommu *hd = domain_hvm_iommu(domain);
504 struct acpi_drhd_unit *drhd;
505 struct iommu *iommu;
506 struct dma_pte *page = NULL, *pte = NULL;
507 u64 pg_maddr;
508 int flush_dev_iotlb;
510 spin_lock(&hd->mapping_lock);
511 /* get last level pte */
512 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
513 if ( pg_maddr == 0 )
514 {
515 spin_unlock(&hd->mapping_lock);
516 return;
517 }
519 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
520 pte = page + address_level_offset(addr, 1);
522 if ( !dma_pte_present(*pte) )
523 {
524 spin_unlock(&hd->mapping_lock);
525 unmap_vtd_domain_page(page);
526 return;
527 }
529 dma_clear_pte(*pte);
530 spin_unlock(&hd->mapping_lock);
531 iommu_flush_cache_entry(pte);
533 /* No need for pcidevs_lock here; flushing is done on device assign/deassign. */
534 for_each_drhd_unit ( drhd )
535 {
536 iommu = drhd->iommu;
537 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
538 {
539 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
540 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
541 addr, 1, 0, flush_dev_iotlb) )
542 iommu_flush_write_buffer(iommu);
543 }
544 }
546 unmap_vtd_domain_page(page);
547 }
549 static void iommu_free_pagetable(u64 pt_maddr, int level)
550 {
551 int i;
552 struct dma_pte *pt_vaddr, *pte;
553 int next_level = level - 1;
555 if ( pt_maddr == 0 )
556 return;
558 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
560 for ( i = 0; i < PTE_NUM; i++ )
561 {
562 pte = &pt_vaddr[i];
563 if ( !dma_pte_present(*pte) )
564 continue;
566 if ( next_level >= 1 )
567 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
569 dma_clear_pte(*pte);
570 iommu_flush_cache_entry(pte);
571 }
573 unmap_vtd_domain_page(pt_vaddr);
574 free_pgtable_maddr(pt_maddr);
575 }
577 static int iommu_set_root_entry(struct iommu *iommu)
578 {
579 u32 cmd, sts;
580 unsigned long flags;
581 s_time_t start_time;
583 spin_lock(&iommu->lock);
585 if ( iommu->root_maddr == 0 )
586 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
587 if ( iommu->root_maddr == 0 )
588 {
589 spin_unlock(&iommu->lock);
590 return -ENOMEM;
591 }
593 spin_unlock(&iommu->lock);
594 spin_lock_irqsave(&iommu->register_lock, flags);
595 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
596 cmd = iommu->gcmd | DMA_GCMD_SRTP;
597 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
599 /* Make sure the hardware completes it */
600 start_time = NOW();
601 for ( ; ; )
602 {
603 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
604 if ( sts & DMA_GSTS_RTPS )
605 break;
606 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
607 panic("%s: DMAR hardware is malfunctioning,"
608 " please disable IOMMU\n", __func__);
609 cpu_relax();
610 }
612 spin_unlock_irqrestore(&iommu->register_lock, flags);
614 return 0;
615 }
617 static void iommu_enable_translation(struct iommu *iommu)
618 {
619 u32 sts;
620 unsigned long flags;
621 s_time_t start_time;
623 dprintk(XENLOG_INFO VTDPREFIX,
624 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
625 spin_lock_irqsave(&iommu->register_lock, flags);
626 iommu->gcmd |= DMA_GCMD_TE;
627 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
628 /* Make sure the hardware completes it */
629 start_time = NOW();
630 for ( ; ; )
631 {
632 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
633 if ( sts & DMA_GSTS_TES )
634 break;
635 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
636 panic("%s: DMAR hardware is malfunctioning,"
637 " please disable IOMMU\n", __func__);
638 cpu_relax();
639 }
641 /* Per the spec, disable PMRs once the VT-d engine takes effect */
642 disable_pmr(iommu);
643 spin_unlock_irqrestore(&iommu->register_lock, flags);
644 }
646 static void iommu_disable_translation(struct iommu *iommu)
647 {
648 u32 sts;
649 unsigned long flags;
650 s_time_t start_time;
652 spin_lock_irqsave(&iommu->register_lock, flags);
653 iommu->gcmd &= ~ DMA_GCMD_TE;
654 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
656 /* Make sure the hardware completes it */
657 start_time = NOW();
658 for ( ; ; )
659 {
660 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
661 if ( !(sts & DMA_GSTS_TES) )
662 break;
663 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
664 panic("%s: DMAR hardware is malfunctioning,"
665 " please disable IOMMU\n", __func__);
666 cpu_relax();
667 }
668 spin_unlock_irqrestore(&iommu->register_lock, flags);
669 }
671 static struct iommu *vector_to_iommu[NR_VECTORS];
672 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
673 u8 fault_reason, u16 source_id, u64 addr)
674 {
675 dprintk(XENLOG_WARNING VTDPREFIX,
676 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
677 "iommu->reg = %p\n",
678 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
679 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
680 fault_reason, iommu->reg);
682 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
683 if ( fault_reason < 0x20 )
684 print_vtd_entries(iommu, (source_id >> 8),
685 (source_id & 0xff), (addr >> PAGE_SHIFT));
686 #endif
688 return 0;
689 }
691 static void iommu_fault_status(u32 fault_status)
692 {
693 if ( fault_status & DMA_FSTS_PFO )
694 dprintk(XENLOG_ERR VTDPREFIX,
695 "iommu_fault_status: Fault Overflow\n");
696 if ( fault_status & DMA_FSTS_PPF )
697 dprintk(XENLOG_ERR VTDPREFIX,
698 "iommu_fault_status: Primary Pending Fault\n");
699 if ( fault_status & DMA_FSTS_AFO )
700 dprintk(XENLOG_ERR VTDPREFIX,
701 "iommu_fault_status: Advanced Fault Overflow\n");
702 if ( fault_status & DMA_FSTS_APF )
703 dprintk(XENLOG_ERR VTDPREFIX,
704 "iommu_fault_status: Advanced Pending Fault\n");
705 if ( fault_status & DMA_FSTS_IQE )
706 dprintk(XENLOG_ERR VTDPREFIX,
707 "iommu_fault_status: Invalidation Queue Error\n");
708 if ( fault_status & DMA_FSTS_ICE )
709 dprintk(XENLOG_ERR VTDPREFIX,
710 "iommu_fault_status: Invalidation Completion Error\n");
711 if ( fault_status & DMA_FSTS_ITE )
712 dprintk(XENLOG_ERR VTDPREFIX,
713 "iommu_fault_status: Invalidation Time-out Error\n");
714 }
716 #define PRIMARY_FAULT_REG_LEN (16)
717 static void iommu_page_fault(int vector, void *dev_id,
718 struct cpu_user_regs *regs)
719 {
720 struct iommu *iommu = dev_id;
721 int reg, fault_index;
722 u32 fault_status;
723 unsigned long flags;
725 dprintk(XENLOG_WARNING VTDPREFIX,
726 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
728 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
730 iommu_fault_status(fault_status);
732 /* FIXME: ignore advanced fault log */
733 if ( !(fault_status & DMA_FSTS_PPF) )
734 goto clear_overflow;
736 fault_index = dma_fsts_fault_record_index(fault_status);
737 reg = cap_fault_reg_offset(iommu->cap);
738 while (1)
739 {
740 u8 fault_reason;
741 u16 source_id;
742 u32 data;
743 u64 guest_addr;
744 int type;
746 /* highest 32 bits */
747 spin_lock_irqsave(&iommu->register_lock, flags);
748 data = dmar_readl(iommu->reg, reg +
749 fault_index * PRIMARY_FAULT_REG_LEN + 12);
750 if ( !(data & DMA_FRCD_F) )
751 {
752 spin_unlock_irqrestore(&iommu->register_lock, flags);
753 break;
754 }
756 fault_reason = dma_frcd_fault_reason(data);
757 type = dma_frcd_type(data);
759 data = dmar_readl(iommu->reg, reg +
760 fault_index * PRIMARY_FAULT_REG_LEN + 8);
761 source_id = dma_frcd_source_id(data);
763 guest_addr = dmar_readq(iommu->reg, reg +
764 fault_index * PRIMARY_FAULT_REG_LEN);
765 guest_addr = dma_frcd_page_addr(guest_addr);
766 /* clear the fault */
767 dmar_writel(iommu->reg, reg +
768 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
769 spin_unlock_irqrestore(&iommu->register_lock, flags);
771 iommu_page_fault_do_one(iommu, type, fault_reason,
772 source_id, guest_addr);
774 fault_index++;
775 if ( fault_index > cap_num_fault_regs(iommu->cap) )
776 fault_index = 0;
777 }
778 clear_overflow:
779 /* clear primary fault overflow */
780 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
781 if ( fault_status & DMA_FSTS_PFO )
782 {
783 spin_lock_irqsave(&iommu->register_lock, flags);
784 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
785 spin_unlock_irqrestore(&iommu->register_lock, flags);
786 }
787 }
789 static void dma_msi_unmask(unsigned int vector)
790 {
791 struct iommu *iommu = vector_to_iommu[vector];
792 unsigned long flags;
794 /* unmask it */
795 spin_lock_irqsave(&iommu->register_lock, flags);
796 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
797 spin_unlock_irqrestore(&iommu->register_lock, flags);
798 }
800 static void dma_msi_mask(unsigned int vector)
801 {
802 unsigned long flags;
803 struct iommu *iommu = vector_to_iommu[vector];
805 /* mask it */
806 spin_lock_irqsave(&iommu->register_lock, flags);
807 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
808 spin_unlock_irqrestore(&iommu->register_lock, flags);
809 }
811 static unsigned int dma_msi_startup(unsigned int vector)
812 {
813 dma_msi_unmask(vector);
814 return 0;
815 }
817 static void dma_msi_end(unsigned int vector)
818 {
819 dma_msi_unmask(vector);
820 ack_APIC_irq();
821 }
823 static void dma_msi_data_init(struct iommu *iommu, int vector)
824 {
825 u32 msi_data = 0;
826 unsigned long flags;
828 /* Fixed, edge, assert mode. Follow MSI setting */
829 msi_data |= vector & 0xff;
830 msi_data |= 1 << 14;
832 spin_lock_irqsave(&iommu->register_lock, flags);
833 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
834 spin_unlock_irqrestore(&iommu->register_lock, flags);
835 }
837 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
838 {
839 u64 msi_address;
840 unsigned long flags;
842 /* Physical, dedicated cpu. Follow MSI setting */
843 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
844 msi_address |= MSI_PHYSICAL_MODE << 2;
845 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
846 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
848 spin_lock_irqsave(&iommu->register_lock, flags);
849 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
850 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
851 spin_unlock_irqrestore(&iommu->register_lock, flags);
852 }
854 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
855 {
856 struct iommu *iommu = vector_to_iommu[vector];
857 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
858 }
860 static struct hw_interrupt_type dma_msi_type = {
861 .typename = "DMA_MSI",
862 .startup = dma_msi_startup,
863 .shutdown = dma_msi_mask,
864 .enable = dma_msi_unmask,
865 .disable = dma_msi_mask,
866 .ack = dma_msi_mask,
867 .end = dma_msi_end,
868 .set_affinity = dma_msi_set_affinity,
869 };
871 static int iommu_set_interrupt(struct iommu *iommu)
872 {
873 int vector, ret;
875 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
876 if ( vector <= 0 )
877 {
878 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
879 return -EINVAL;
880 }
882 irq_desc[vector].handler = &dma_msi_type;
883 vector_to_iommu[vector] = iommu;
884 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
885 if ( ret )
886 {
887 irq_desc[vector].handler = &no_irq_type;
888 vector_to_iommu[vector] = NULL;
889 free_irq_vector(vector);
890 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
891 return ret;
892 }
894 /* Make sure that vector is never re-used. */
895 vector_irq[vector] = NEVER_ASSIGN_IRQ;
897 return vector;
898 }
900 static int iommu_alloc(struct acpi_drhd_unit *drhd)
901 {
902 struct iommu *iommu;
903 unsigned long sagaw;
904 int agaw;
906 if ( nr_iommus > MAX_IOMMUS )
907 {
908 gdprintk(XENLOG_ERR VTDPREFIX,
909 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
910 return -ENOMEM;
911 }
913 iommu = xmalloc(struct iommu);
914 if ( iommu == NULL )
915 return -ENOMEM;
916 memset(iommu, 0, sizeof(struct iommu));
918 iommu->vector = -1; /* No vector assigned yet. */
920 iommu->intel = alloc_intel_iommu();
921 if ( iommu->intel == NULL )
922 {
923 xfree(iommu);
924 return -ENOMEM;
925 }
927 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
928 iommu->index = nr_iommus++;
930 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
931 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
933 gdprintk(XENLOG_INFO VTDPREFIX,
934 "drhd->address = %"PRIx64"\n", drhd->address);
935 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
937 /* Calculate number of pagetable levels: between 2 and 4. */
938 sagaw = cap_sagaw(iommu->cap);
939 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
940 if ( test_bit(agaw, &sagaw) )
941 break;
942 if ( agaw < 0 )
943 {
944 gdprintk(XENLOG_ERR VTDPREFIX,
945 "IOMMU: unsupported sagaw %lx\n", sagaw);
946 xfree(iommu);
947 return -ENODEV;
948 }
949 iommu->nr_pt_levels = agaw_to_level(agaw);
951 if ( !ecap_coherent(iommu->ecap) )
952 iommus_incoherent = 1;
954 spin_lock_init(&iommu->lock);
955 spin_lock_init(&iommu->register_lock);
957 drhd->iommu = iommu;
958 return 0;
959 }
961 static void iommu_free(struct acpi_drhd_unit *drhd)
962 {
963 struct iommu *iommu = drhd->iommu;
965 if ( iommu == NULL )
966 return;
968 if ( iommu->root_maddr != 0 )
969 {
970 free_pgtable_maddr(iommu->root_maddr);
971 iommu->root_maddr = 0;
972 }
974 if ( iommu->reg )
975 iounmap(iommu->reg);
977 free_intel_iommu(iommu->intel);
978 release_irq_vector(iommu->vector);
979 xfree(iommu);
981 drhd->iommu = NULL;
982 }
984 #define guestwidth_to_adjustwidth(gaw) ({ \
985 int agaw, r = (gaw - 12) % 9; \
986 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
987 if ( agaw > 64 ) \
988 agaw = 64; \
989 agaw; })
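/*
 * A quick note on the macro above: guestwidth_to_adjustwidth() rounds a
 * guest address width up to the next adjusted width supported by the
 * 9-bit-per-level page tables (12 + 9*n bits), capped at 64.  For
 * example, an input of 48 stays 48 ((48 - 12) % 9 == 0), while an input
 * of 32 is rounded up to 39.
 */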
991 static int intel_iommu_domain_init(struct domain *d)
992 {
993 struct hvm_iommu *hd = domain_hvm_iommu(d);
994 struct iommu *iommu = NULL;
995 struct acpi_drhd_unit *drhd;
997 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
998 iommu = drhd->iommu;
1000 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1002 if ( d->domain_id == 0 )
1004 /* Set up 1:1 page table for dom0 */
1005 iommu_set_dom0_mapping(d);
1007 setup_dom0_devices(d);
1008 setup_dom0_rmrr(d);
1010 iommu_flush_all();
1012 for_each_drhd_unit ( drhd )
1014 iommu = drhd->iommu;
1015 iommu_enable_translation(iommu);
1019 return 0;
1022 static int domain_context_mapping_one(
1023 struct domain *domain,
1024 struct iommu *iommu,
1025 u8 bus, u8 devfn)
1027 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1028 struct context_entry *context, *context_entries;
1029 u64 maddr, pgd_maddr;
1030 struct pci_dev *pdev = NULL;
1031 int agaw;
1033 ASSERT(spin_is_locked(&pcidevs_lock));
1034 spin_lock(&iommu->lock);
1035 maddr = bus_to_context_maddr(iommu, bus);
1036 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1037 context = &context_entries[devfn];
1039 if ( context_present(*context) )
1041 int res = 0;
1043 pdev = pci_get_pdev(bus, devfn);
1044 if (!pdev)
1045 res = -ENODEV;
1046 else if (pdev->domain != domain)
1047 res = -EINVAL;
1048 unmap_vtd_domain_page(context_entries);
1049 spin_unlock(&iommu->lock);
1050 return res;
1053 if ( iommu_passthrough && (domain->domain_id == 0) )
1055 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1056 agaw = level_to_agaw(iommu->nr_pt_levels);
1058 else
1060 spin_lock(&hd->mapping_lock);
1062 /* Ensure we have pagetables allocated down to leaf PTE. */
1063 if ( hd->pgd_maddr == 0 )
1065 addr_to_dma_page_maddr(domain, 0, 1);
1066 if ( hd->pgd_maddr == 0 )
1068 nomem:
1069 spin_unlock(&hd->mapping_lock);
1070 spin_unlock(&iommu->lock);
1071 unmap_vtd_domain_page(context_entries);
1072 return -ENOMEM;
1076 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1077 pgd_maddr = hd->pgd_maddr;
1078 for ( agaw = level_to_agaw(4);
1079 agaw != level_to_agaw(iommu->nr_pt_levels);
1080 agaw-- )
1082 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1083 pgd_maddr = dma_pte_addr(*p);
1084 unmap_vtd_domain_page(p);
1085 if ( pgd_maddr == 0 )
1086 goto nomem;
1089 context_set_address_root(*context, pgd_maddr);
1090 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1091 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1092 else
1093 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1095 spin_unlock(&hd->mapping_lock);
1098 /*
1099 * domain_id 0 is not valid on Intel IOMMUs; force domain ids to be
1100 * 1-based, as required by the hardware.
1101 */
1102 context_set_domain_id(context, domain);
1103 context_set_address_width(*context, agaw);
1104 context_set_fault_enable(*context);
1105 context_set_present(*context);
1106 iommu_flush_cache_entry(context);
1107 spin_unlock(&iommu->lock);
1109 /* Context entry was previously non-present (with domid 0). */
1110 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1111 DMA_CCMD_MASK_NOBIT, 1) )
1112 iommu_flush_write_buffer(iommu);
1113 else
1115 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1116 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1119 set_bit(iommu->index, &hd->iommu_bitmap);
1121 unmap_vtd_domain_page(context_entries);
1123 return 0;
1126 #define PCI_BASE_CLASS_BRIDGE 0x06
1127 #define PCI_CLASS_BRIDGE_PCI 0x0604
1129 enum {
1130 DEV_TYPE_PCIe_ENDPOINT,
1131 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1132 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1133 DEV_TYPE_PCI,
1134 };
1136 int pdev_type(u8 bus, u8 devfn)
1138 u16 class_device;
1139 u16 status, creg;
1140 int pos;
1141 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1143 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1144 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1146 pos = pci_find_next_cap(bus, devfn,
1147 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1148 if ( !pos )
1149 return DEV_TYPE_PCI_BRIDGE;
1150 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1151 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1152 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1155 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1156 if ( !(status & PCI_STATUS_CAP_LIST) )
1157 return DEV_TYPE_PCI;
1159 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1160 return DEV_TYPE_PCIe_ENDPOINT;
1162 return DEV_TYPE_PCI;
1165 #define MAX_BUSES 256
1166 static DEFINE_SPINLOCK(bus2bridge_lock);
1167 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1169 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1171 int cnt = 0;
1172 *secbus = *bus;
1174 ASSERT(spin_is_locked(&bus2bridge_lock));
1175 if ( !bus2bridge[*bus].map )
1176 return 0;
1178 while ( bus2bridge[*bus].map )
1180 *secbus = *bus;
1181 *devfn = bus2bridge[*bus].devfn;
1182 *bus = bus2bridge[*bus].bus;
1183 if ( cnt++ >= MAX_BUSES )
1184 return 0;
1187 return 1;
1190 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1192 int ret = 0;
1194 if ( *bus == 0 )
1195 /* assume integrated PCI devices in RC have valid requester-id */
1196 return 1;
1198 spin_lock(&bus2bridge_lock);
1199 ret = _find_pcie_endpoint(bus, devfn, secbus);
1200 spin_unlock(&bus2bridge_lock);
1202 return ret;
1205 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1207 struct acpi_drhd_unit *drhd;
1208 int ret = 0;
1209 u16 sec_bus, sub_bus;
1210 u32 type;
1211 u8 secbus, secdevfn;
1212 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1214 if ( pdev == NULL )
1216 /* We can reach here via setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1217 * -> domain_context_mapping().
1218 * If a user enables VT-d but disables USB (which usually needs an
1219 * RMRR) in the BIOS, we cannot discover the BDF of the USB controller
1220 * in setup_dom0_devices(), yet the ACPI RMRR structures may still
1221 * contain that BDF, so pci_get_pdev() ends up returning NULL here.
1222 */
1223 gdprintk(XENLOG_WARNING VTDPREFIX,
1224 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1225 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1226 return 0;
1229 drhd = acpi_find_matched_drhd_unit(pdev);
1230 if ( !drhd )
1231 return -ENODEV;
1233 ASSERT(spin_is_locked(&pcidevs_lock));
1235 type = pdev_type(bus, devfn);
1236 switch ( type )
1238 case DEV_TYPE_PCIe_BRIDGE:
1239 break;
1241 case DEV_TYPE_PCI_BRIDGE:
1242 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1243 PCI_SECONDARY_BUS);
1244 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1245 PCI_SUBORDINATE_BUS);
1247 spin_lock(&bus2bridge_lock);
1248 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1250 bus2bridge[sec_bus].map = 1;
1251 bus2bridge[sec_bus].bus = bus;
1252 bus2bridge[sec_bus].devfn = devfn;
1254 spin_unlock(&bus2bridge_lock);
1255 break;
1257 case DEV_TYPE_PCIe_ENDPOINT:
1258 gdprintk(XENLOG_INFO VTDPREFIX,
1259 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1260 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1261 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1262 break;
1264 case DEV_TYPE_PCI:
1265 gdprintk(XENLOG_INFO VTDPREFIX,
1266 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1267 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1269 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1270 if ( ret )
1271 break;
1273 secbus = bus;
1274 secdevfn = devfn;
1275 /* dependent devices mapping */
1276 while ( bus2bridge[bus].map )
1278 secbus = bus;
1279 secdevfn = devfn;
1280 devfn = bus2bridge[bus].devfn;
1281 bus = bus2bridge[bus].bus;
1282 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1283 if ( ret )
1284 return ret;
1287 if ( (secbus != bus) && (secdevfn != 0) )
1288 /*
1289 * The source-id for transactions on non-PCIe buses seems
1290 * to originate from devfn=0 on the secondary bus behind
1291 * the bridge. Map that id as well. The id to use in
1292 * these scenarios is not particularly well documented
1293 * anywhere.
1294 */
1295 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1296 break;
1298 default:
1299 gdprintk(XENLOG_ERR VTDPREFIX,
1300 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1301 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1302 ret = -EINVAL;
1303 break;
1306 return ret;
1309 static int domain_context_unmap_one(
1310 struct domain *domain,
1311 struct iommu *iommu,
1312 u8 bus, u8 devfn)
1314 struct context_entry *context, *context_entries;
1315 u64 maddr;
1317 ASSERT(spin_is_locked(&pcidevs_lock));
1318 spin_lock(&iommu->lock);
1320 maddr = bus_to_context_maddr(iommu, bus);
1321 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1322 context = &context_entries[devfn];
1324 if ( !context_present(*context) )
1326 spin_unlock(&iommu->lock);
1327 unmap_vtd_domain_page(context_entries);
1328 return 0;
1331 context_clear_present(*context);
1332 context_clear_entry(*context);
1333 iommu_flush_cache_entry(context);
1335 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1336 (((u16)bus) << 8) | devfn,
1337 DMA_CCMD_MASK_NOBIT, 0) )
1338 iommu_flush_write_buffer(iommu);
1339 else
1341 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1342 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1345 spin_unlock(&iommu->lock);
1346 unmap_vtd_domain_page(context_entries);
1348 return 0;
1351 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1353 struct acpi_drhd_unit *drhd;
1354 int ret = 0;
1355 u32 type;
1356 u8 secbus, secdevfn;
1357 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1359 BUG_ON(!pdev);
1361 drhd = acpi_find_matched_drhd_unit(pdev);
1362 if ( !drhd )
1363 return -ENODEV;
1365 type = pdev_type(bus, devfn);
1366 switch ( type )
1368 case DEV_TYPE_PCIe_BRIDGE:
1369 case DEV_TYPE_PCI_BRIDGE:
1370 break;
1372 case DEV_TYPE_PCIe_ENDPOINT:
1373 gdprintk(XENLOG_INFO VTDPREFIX,
1374 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1375 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1376 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1377 break;
1379 case DEV_TYPE_PCI:
1380 gdprintk(XENLOG_INFO VTDPREFIX,
1381 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1382 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1383 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1384 if ( ret )
1385 break;
1387 secbus = bus;
1388 secdevfn = devfn;
1389 /* dependent devices unmapping */
1390 while ( bus2bridge[bus].map )
1392 secbus = bus;
1393 secdevfn = devfn;
1394 devfn = bus2bridge[bus].devfn;
1395 bus = bus2bridge[bus].bus;
1396 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1397 if ( ret )
1398 return ret;
1401 if ( (secbus != bus) && (secdevfn != 0) )
1402 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1403 break;
1405 default:
1406 gdprintk(XENLOG_ERR VTDPREFIX,
1407 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1408 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1409 ret = -EINVAL;
1410 break;
1413 return ret;
1416 static int reassign_device_ownership(
1417 struct domain *source,
1418 struct domain *target,
1419 u8 bus, u8 devfn)
1421 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1422 struct pci_dev *pdev;
1423 struct acpi_drhd_unit *drhd;
1424 struct iommu *pdev_iommu;
1425 int ret, found = 0;
1427 ASSERT(spin_is_locked(&pcidevs_lock));
1428 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1430 if (!pdev)
1431 return -ENODEV;
1433 drhd = acpi_find_matched_drhd_unit(pdev);
1434 pdev_iommu = drhd->iommu;
1435 domain_context_unmap(source, bus, devfn);
1437 ret = domain_context_mapping(target, bus, devfn);
1438 if ( ret )
1439 return ret;
1441 list_move(&pdev->domain_list, &target->arch.pdev_list);
1442 pdev->domain = target;
1444 for_each_pdev ( source, pdev )
1446 drhd = acpi_find_matched_drhd_unit(pdev);
1447 if ( drhd->iommu == pdev_iommu )
1449 found = 1;
1450 break;
1454 if ( !found )
1455 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1457 return ret;
1460 void iommu_domain_teardown(struct domain *d)
1462 struct hvm_iommu *hd = domain_hvm_iommu(d);
1464 if ( list_empty(&acpi_drhd_units) )
1465 return;
1467 spin_lock(&hd->mapping_lock);
1468 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1469 hd->pgd_maddr = 0;
1470 spin_unlock(&hd->mapping_lock);
1472 iommu_domid_release(d);
1475 int intel_iommu_map_page(
1476 struct domain *d, unsigned long gfn, unsigned long mfn)
1478 struct hvm_iommu *hd = domain_hvm_iommu(d);
1479 struct acpi_drhd_unit *drhd;
1480 struct iommu *iommu;
1481 struct dma_pte *page = NULL, *pte = NULL;
1482 u64 pg_maddr;
1483 int pte_present;
1484 int flush_dev_iotlb;
1486 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1487 iommu = drhd->iommu;
1489 /* do nothing if dom0 and iommu supports pass thru */
1490 if ( iommu_passthrough && (d->domain_id == 0) )
1491 return 0;
1493 spin_lock(&hd->mapping_lock);
1495 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1496 if ( pg_maddr == 0 )
1498 spin_unlock(&hd->mapping_lock);
1499 return -ENOMEM;
1501 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1502 pte = page + (gfn & LEVEL_MASK);
1503 pte_present = dma_pte_present(*pte);
1504 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1505 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1507 /* Set the SNP on leaf page table if Snoop Control available */
1508 if ( iommu_snoop )
1509 dma_set_pte_snp(*pte);
1511 iommu_flush_cache_entry(pte);
1512 spin_unlock(&hd->mapping_lock);
1513 unmap_vtd_domain_page(page);
1515 /*
1516 * No need for pcidevs_lock here because we flush when
1517 * assigning/deassigning a device.
1518 */
1519 for_each_drhd_unit ( drhd )
1521 iommu = drhd->iommu;
1523 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1524 continue;
1526 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1527 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1528 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1529 !pte_present, flush_dev_iotlb) )
1530 iommu_flush_write_buffer(iommu);
1533 return 0;
1536 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1538 struct acpi_drhd_unit *drhd;
1539 struct iommu *iommu;
1541 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1542 iommu = drhd->iommu;
1544 /* do nothing if dom0 and iommu supports pass thru */
1545 if ( iommu_passthrough && (d->domain_id == 0) )
1546 return 0;
1548 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1550 return 0;
1553 static int iommu_prepare_rmrr_dev(struct domain *d,
1554 struct acpi_rmrr_unit *rmrr,
1555 u8 bus, u8 devfn)
1557 int ret = 0;
1558 u64 base, end;
1559 unsigned long base_pfn, end_pfn;
1561 ASSERT(spin_is_locked(&pcidevs_lock));
1562 ASSERT(rmrr->base_address < rmrr->end_address);
1564 base = rmrr->base_address & PAGE_MASK_4K;
1565 base_pfn = base >> PAGE_SHIFT_4K;
1566 end = PAGE_ALIGN_4K(rmrr->end_address);
1567 end_pfn = end >> PAGE_SHIFT_4K;
1569 while ( base_pfn < end_pfn )
1571 intel_iommu_map_page(d, base_pfn, base_pfn);
1572 base_pfn++;
1575 ret = domain_context_mapping(d, bus, devfn);
1577 return ret;
1580 static int intel_iommu_add_device(struct pci_dev *pdev)
1582 struct acpi_rmrr_unit *rmrr;
1583 u16 bdf;
1584 int ret, i;
1586 ASSERT(spin_is_locked(&pcidevs_lock));
1588 if ( !pdev->domain )
1589 return -EINVAL;
1591 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1592 if ( ret )
1594 gdprintk(XENLOG_ERR VTDPREFIX,
1595 "intel_iommu_add_device: context mapping failed\n");
1596 return ret;
1599 for_each_rmrr_device ( rmrr, bdf, i )
1601 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1603 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1604 pdev->bus, pdev->devfn);
1605 if ( ret )
1606 gdprintk(XENLOG_ERR VTDPREFIX,
1607 "intel_iommu_add_device: RMRR mapping failed\n");
1608 break;
1612 return ret;
1615 static int intel_iommu_remove_device(struct pci_dev *pdev)
1617 struct acpi_rmrr_unit *rmrr;
1618 u16 bdf;
1619 int i;
1621 if ( !pdev->domain )
1622 return -EINVAL;
1624 /* If the device belongs to dom0 and has an RMRR, don't remove it
1625 * from dom0, because the BIOS may use the RMRR at boot time.
1626 */
1627 if ( pdev->domain->domain_id == 0 )
1629 for_each_rmrr_device ( rmrr, bdf, i )
1631 if ( PCI_BUS(bdf) == pdev->bus &&
1632 PCI_DEVFN2(bdf) == pdev->devfn )
1633 return 0;
1637 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1640 static void setup_dom0_devices(struct domain *d)
1642 struct hvm_iommu *hd;
1643 struct pci_dev *pdev;
1644 int bus, dev, func;
1645 u32 l;
1647 hd = domain_hvm_iommu(d);
1649 spin_lock(&pcidevs_lock);
1650 for ( bus = 0; bus < 256; bus++ )
1652 for ( dev = 0; dev < 32; dev++ )
1654 for ( func = 0; func < 8; func++ )
1656 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1657 /* some broken boards return 0 or ~0 if a slot is empty: */
1658 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1659 (l == 0x0000ffff) || (l == 0xffff0000) )
1660 continue;
1662 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1663 pdev->domain = d;
1664 list_add(&pdev->domain_list, &d->arch.pdev_list);
1665 domain_context_mapping(d, pdev->bus, pdev->devfn);
1666 if ( ats_device(0, pdev->bus, pdev->devfn) )
1667 enable_ats_device(0, pdev->bus, pdev->devfn);
1671 spin_unlock(&pcidevs_lock);
1674 void clear_fault_bits(struct iommu *iommu)
1676 u64 val;
1678 val = dmar_readq(
1679 iommu->reg,
1680 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1681 dmar_writeq(
1682 iommu->reg,
1683 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1684 val);
1685 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1688 static int init_vtd_hw(void)
1690 struct acpi_drhd_unit *drhd;
1691 struct iommu *iommu;
1692 struct iommu_flush *flush = NULL;
1693 int vector;
1694 int ret;
1696 for_each_drhd_unit ( drhd )
1698 iommu = drhd->iommu;
1699 ret = iommu_set_root_entry(iommu);
1700 if ( ret )
1702 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1703 return -EIO;
1706 if ( iommu->vector < 0 )
1708 vector = iommu_set_interrupt(iommu);
1709 if ( vector < 0 )
1711 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1712 return vector;
1714 iommu->vector = vector;
1716 dma_msi_data_init(iommu, iommu->vector);
1717 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1718 clear_fault_bits(iommu);
1719 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1721 /* initialize flush functions */
1722 flush = iommu_get_flush(iommu);
1723 flush->context = flush_context_reg;
1724 flush->iotlb = flush_iotlb_reg;
1727 if ( iommu_qinval )
1729 for_each_drhd_unit ( drhd )
1731 iommu = drhd->iommu;
1732 if ( enable_qinval(iommu) != 0 )
1734 dprintk(XENLOG_INFO VTDPREFIX,
1735 "Failed to enable Queued Invalidation!\n");
1736 break;
1741 if ( iommu_intremap )
1743 for_each_drhd_unit ( drhd )
1745 iommu = drhd->iommu;
1746 if ( enable_intremap(iommu) != 0 )
1748 dprintk(XENLOG_INFO VTDPREFIX,
1749 "Failed to enable Interrupt Remapping!\n");
1750 break;
1755 return 0;
1758 static void setup_dom0_rmrr(struct domain *d)
1760 struct acpi_rmrr_unit *rmrr;
1761 u16 bdf;
1762 int ret, i;
1764 spin_lock(&pcidevs_lock);
1765 for_each_rmrr_device ( rmrr, bdf, i )
1767 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1768 if ( ret )
1769 gdprintk(XENLOG_ERR VTDPREFIX,
1770 "IOMMU: mapping reserved region failed\n");
1772 spin_unlock(&pcidevs_lock);
1775 static void platform_quirks(void)
1777 u32 id;
1779 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1780 id = pci_conf_read32(0, 0, 0, 0);
1781 if ( id == 0x2a408086 )
1783 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1784 rwbf_quirk = 1;
1788 int intel_vtd_setup(void)
1790 struct acpi_drhd_unit *drhd;
1791 struct iommu *iommu;
1793 if ( !vtd_enabled )
1794 return -ENODEV;
1796 platform_quirks();
1798 spin_lock_init(&domid_bitmap_lock);
1799 clflush_size = get_cache_line_size();
1801 /* We enable the following features only if they are supported by all VT-d
1802 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1803 * Interrupt Remapping.
1804 */
1805 for_each_drhd_unit ( drhd )
1807 if ( iommu_alloc(drhd) != 0 )
1808 goto error;
1810 iommu = drhd->iommu;
1812 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1813 iommu_snoop = 0;
1815 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1816 iommu_passthrough = 0;
1818 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1819 iommu_qinval = 0;
1821 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1822 iommu_intremap = 0;
1825 if ( !iommu_qinval && iommu_intremap )
1827 iommu_intremap = 0;
1828 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1829 "since Queued Invalidation isn't supported or enabled.\n");
1832 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1833 P(iommu_snoop, "Snoop Control");
1834 P(iommu_passthrough, "DMA Passthrough");
1835 P(iommu_qinval, "Queued Invalidation");
1836 P(iommu_intremap, "Interrupt Remapping");
1837 #undef P
1839 /* Allocate IO page directory page for the domain. */
1840 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1841 iommu = drhd->iommu;
1843 /* Allocate domain id bitmap, and set bit 0 as reserved */
1844 domid_bitmap_size = cap_ndoms(iommu->cap);
1845 domid_bitmap = xmalloc_array(unsigned long,
1846 BITS_TO_LONGS(domid_bitmap_size));
1847 if ( domid_bitmap == NULL )
1848 goto error;
1849 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1850 set_bit(0, domid_bitmap);
1852 if ( init_vtd_hw() )
1853 goto error;
1855 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1857 return 0;
1859 error:
1860 for_each_drhd_unit ( drhd )
1861 iommu_free(drhd);
1862 vtd_enabled = 0;
1863 iommu_snoop = 0;
1864 iommu_passthrough = 0;
1865 iommu_qinval = 0;
1866 iommu_intremap = 0;
1867 return -ENOMEM;
1870 /*
1871 * If the device isn't owned by dom0, it has already been
1872 * assigned to another domain, or it does not exist.
1873 */
1874 int device_assigned(u8 bus, u8 devfn)
1876 struct pci_dev *pdev;
1878 spin_lock(&pcidevs_lock);
1879 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1880 if (!pdev)
1882 spin_unlock(&pcidevs_lock);
1883 return -1;
1886 spin_unlock(&pcidevs_lock);
1887 return 0;
1890 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1892 struct acpi_rmrr_unit *rmrr;
1893 int ret = 0, i;
1894 struct pci_dev *pdev;
1895 u16 bdf;
1897 if ( list_empty(&acpi_drhd_units) )
1898 return -ENODEV;
1900 ASSERT(spin_is_locked(&pcidevs_lock));
1901 pdev = pci_get_pdev(bus, devfn);
1902 if (!pdev)
1903 return -ENODEV;
1905 if (pdev->domain != dom0)
1907 gdprintk(XENLOG_ERR VTDPREFIX,
1908 "IOMMU: assign a assigned device\n");
1909 return -EBUSY;
1912 ret = reassign_device_ownership(dom0, d, bus, devfn);
1913 if ( ret )
1914 goto done;
1916 /* Setup rmrr identity mapping */
1917 for_each_rmrr_device( rmrr, bdf, i )
1919 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1921 /* FIXME: Because USB RMRR conflicts with guest bios region,
1922 * ignore USB RMRR temporarily.
1923 */
1924 if ( is_usb_device(bus, devfn) )
1926 ret = 0;
1927 goto done;
1930 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1931 if ( ret )
1932 gdprintk(XENLOG_ERR VTDPREFIX,
1933 "IOMMU: mapping reserved region failed\n");
1934 goto done;
1938 done:
1939 return ret;
1942 static int intel_iommu_group_id(u8 bus, u8 devfn)
1944 u8 secbus;
1945 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1946 return PCI_BDF2(bus, devfn);
1947 else
1948 return -1;
1951 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1952 void iommu_suspend(void)
1954 struct acpi_drhd_unit *drhd;
1955 struct iommu *iommu;
1956 u32 i;
1958 if ( !vtd_enabled )
1959 return;
1961 iommu_flush_all();
1963 for_each_drhd_unit ( drhd )
1965 iommu = drhd->iommu;
1966 i = iommu->index;
1968 iommu_state[i][DMAR_FECTL_REG] =
1969 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1970 iommu_state[i][DMAR_FEDATA_REG] =
1971 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1972 iommu_state[i][DMAR_FEADDR_REG] =
1973 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1974 iommu_state[i][DMAR_FEUADDR_REG] =
1975 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1977 iommu_disable_translation(iommu);
1979 if ( iommu_intremap )
1980 disable_intremap(iommu);
1982 if ( iommu_qinval )
1983 disable_qinval(iommu);
1987 void iommu_resume(void)
1989 struct acpi_drhd_unit *drhd;
1990 struct iommu *iommu;
1991 struct iommu_flush *flush;
1992 u32 i;
1994 if ( !vtd_enabled )
1995 return;
1997 /* Re-initialize the register-based flush functions.
1998 * In iommu_flush_all(), we invoke iommu_flush_{context,iotlb}_global(),
1999 * but at this point, on hosts that support QI (Queued Invalidation), QI
2000 * hasn't been re-enabled yet, so use the register-based invalidation
2001 * method until init_vtd_hw() has run.
2002 */
2003 if ( iommu_qinval )
2005 for_each_drhd_unit ( drhd )
2007 iommu = drhd->iommu;
2008 flush = iommu_get_flush(iommu);
2009 flush->context = flush_context_reg;
2010 flush->iotlb = flush_iotlb_reg;
2014 /* It is not clear whether this flush is required by the IOMMU
2015 * specification. However, the BIOS also runs during S3 resume and may
2016 * have touched the IOMMU again, so flush here for safety.
2017 */
2018 iommu_flush_all();
2020 if ( init_vtd_hw() != 0 && force_iommu )
2021 panic("IOMMU setup failed, crash Xen for security purpose!\n");
2023 for_each_drhd_unit ( drhd )
2025 iommu = drhd->iommu;
2026 i = iommu->index;
2028 dmar_writel(iommu->reg, DMAR_FECTL_REG,
2029 (u32) iommu_state[i][DMAR_FECTL_REG]);
2030 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
2031 (u32) iommu_state[i][DMAR_FEDATA_REG]);
2032 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
2033 (u32) iommu_state[i][DMAR_FEADDR_REG]);
2034 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
2035 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
2037 iommu_enable_translation(iommu);
2041 struct iommu_ops intel_iommu_ops = {
2042 .init = intel_iommu_domain_init,
2043 .add_device = intel_iommu_add_device,
2044 .remove_device = intel_iommu_remove_device,
2045 .assign_device = intel_iommu_assign_device,
2046 .teardown = iommu_domain_teardown,
2047 .map_page = intel_iommu_map_page,
2048 .unmap_page = intel_iommu_unmap_page,
2049 .reassign_device = reassign_device_ownership,
2050 .get_device_group_id = intel_iommu_group_id,
2051 .update_ire_from_apic = io_apic_write_remap_rte,
2052 .update_ire_from_msi = msi_msg_write_remap_rte,
2053 };
2055 /*
2056 * Local variables:
2057 * mode: C
2058 * c-set-style: "BSD"
2059 * c-basic-offset: 4
2060 * tab-width: 4
2061 * indent-tabs-mode: nil
2062 * End:
2063 */