ia64/xen-unstable

xen/drivers/passthrough/vtd/iommu.c @ 19673:f3bed18decfc

[VTD] laying the groundwork for ATS

These changes lay the groundwork for enabling ATS in Xen. They will be
followed by a patch that enables PCI MMCFG, which is needed to actually
enable ATS functionality.

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Fri May 29 09:19:30 2009 +0100
Parents:  6ba4e34d21d3
Children: 42fe00c6f8b4
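
The groundwork consists chiefly of threading a per-IOMMU "flush device IOTLB" flag through the IOTLB flush paths. A minimal sketch of the pattern used throughout the file below, using only names that appear in this file (here "domain" stands for whatever struct domain is in scope):

    int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
    iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);

Note that enable_ats_device() in setup_dom0_devices() is still guarded by NOT_YET, so find_ats_dev_drhd() is not expected to report any ATS devices until the follow-up MMCFG patch is applied.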
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
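/*
 * The IOMMU domain id is allocated lazily, on first use, from domid_bitmap.
 * Bit 0 of the bitmap is reserved in intel_vtd_setup(), so an id of 0 means
 * "not yet allocated" and valid ids are 1-based.  The allocated id is stored
 * in the context entry's high word starting at bit DID_HIGH_OFFSET.
 */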
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
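/*
 * iommus_incoherent is set in iommu_alloc() when any IOMMU lacks coherent
 * page-walk support (ecap_coherent() clear).  In that case CPU updates to
 * root, context and page-table entries must be flushed from the cache, one
 * cache line (clflush_size bytes) at a time, before the hardware can see
 * them; on fully coherent systems these flushes are skipped.
 */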
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
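/*
 * Walk (and, when 'alloc' is set, build) the I/O page table for 'addr',
 * starting from the level implied by the domain's agaw and stopping at
 * level 2.  Returns the machine address of the last-level (4K leaf) page
 * table covering 'addr', or 0 on failure.
 */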
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * higher-level tables always set r/w; the last-level
200 * page table controls read/write permissions
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
233 s_time_t start_time;
235 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
236 return;
237 val = iommu->gcmd | DMA_GCMD_WBF;
239 spin_lock_irqsave(&iommu->register_lock, flag);
240 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
242 /* Make sure the hardware has completed it */
243 start_time = NOW();
244 for ( ; ; )
245 {
246 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
247 if ( !(val & DMA_GSTS_WBFS) )
248 break;
249 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
250 panic("%s: DMAR hardware is malfunctioning,"
251 " please disable IOMMU\n", __func__);
252 cpu_relax();
253 }
254 spin_unlock_irqrestore(&iommu->register_lock, flag);
255 }
257 /* the return value determines whether we need a write buffer flush */
258 static int flush_context_reg(
259 void *_iommu,
260 u16 did, u16 source_id, u8 function_mask, u64 type,
261 int flush_non_present_entry)
262 {
263 struct iommu *iommu = (struct iommu *) _iommu;
264 u64 val = 0;
265 unsigned long flag;
266 s_time_t start_time;
268 /*
269 * In the non-present entry flush case: if the hardware does not cache
270 * non-present entries, we do nothing; if it does, we flush the entries
271 * of domain 0 (that domain id is used to cache any non-present
272 * entries).
273 */
274 if ( flush_non_present_entry )
275 {
276 if ( !cap_caching_mode(iommu->cap) )
277 return 1;
278 else
279 did = 0;
280 }
282 /* use register invalidation */
283 switch ( type )
284 {
285 case DMA_CCMD_GLOBAL_INVL:
286 val = DMA_CCMD_GLOBAL_INVL;
287 break;
288 case DMA_CCMD_DOMAIN_INVL:
289 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
290 break;
291 case DMA_CCMD_DEVICE_INVL:
292 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
293 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
294 break;
295 default:
296 BUG();
297 }
298 val |= DMA_CCMD_ICC;
300 spin_lock_irqsave(&iommu->register_lock, flag);
301 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
303 /* Make sure the hardware has completed it */
304 start_time = NOW();
305 for ( ; ; )
306 {
307 val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
308 if ( !(val & DMA_CCMD_ICC) )
309 break;
310 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
311 panic("%s: DMAR hardware is malfunctioning,"
312 " please disable IOMMU\n", __func__);
313 cpu_relax();
314 }
315 spin_unlock_irqrestore(&iommu->register_lock, flag);
316 /* flushing a context entry implicitly flushes the write buffer */
317 return 0;
318 }
320 static int inline iommu_flush_context_global(
321 struct iommu *iommu, int flush_non_present_entry)
322 {
323 struct iommu_flush *flush = iommu_get_flush(iommu);
324 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
325 flush_non_present_entry);
326 }
328 static int inline iommu_flush_context_domain(
329 struct iommu *iommu, u16 did, int flush_non_present_entry)
330 {
331 struct iommu_flush *flush = iommu_get_flush(iommu);
332 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
333 flush_non_present_entry);
334 }
336 static int inline iommu_flush_context_device(
337 struct iommu *iommu, u16 did, u16 source_id,
338 u8 function_mask, int flush_non_present_entry)
339 {
340 struct iommu_flush *flush = iommu_get_flush(iommu);
341 return flush->context(iommu, did, source_id, function_mask,
342 DMA_CCMD_DEVICE_INVL,
343 flush_non_present_entry);
344 }
346 /* the return value determines whether we need a write buffer flush */
347 static int flush_iotlb_reg(void *_iommu, u16 did,
348 u64 addr, unsigned int size_order, u64 type,
349 int flush_non_present_entry, int flush_dev_iotlb)
350 {
351 struct iommu *iommu = (struct iommu *) _iommu;
352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
353 u64 val = 0, val_iva = 0;
354 unsigned long flag;
355 s_time_t start_time;
357 /*
358 * In the non-present entry flush case: if the hardware does not cache
359 * non-present entries, we do nothing; if it does, we flush the entries
360 * of domain 0 (that domain id is used to cache any non-present
361 * entries).
362 */
363 if ( flush_non_present_entry )
364 {
365 if ( !cap_caching_mode(iommu->cap) )
366 return 1;
367 else
368 did = 0;
369 }
371 /* use register invalidation */
372 switch ( type )
373 {
374 case DMA_TLB_GLOBAL_FLUSH:
375 /* a global flush doesn't need IVA_REG to be set */
376 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
377 break;
378 case DMA_TLB_DSI_FLUSH:
379 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
380 break;
381 case DMA_TLB_PSI_FLUSH:
382 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
383 /* Note: always flush non-leaf currently */
384 val_iva = size_order | addr;
385 break;
386 default:
387 BUG();
388 }
389 /* Note: set drain read/write */
390 if ( cap_read_drain(iommu->cap) )
391 val |= DMA_TLB_READ_DRAIN;
392 if ( cap_write_drain(iommu->cap) )
393 val |= DMA_TLB_WRITE_DRAIN;
395 spin_lock_irqsave(&iommu->register_lock, flag);
396 /* Note: Only uses first TLB reg currently */
397 if ( val_iva )
398 dmar_writeq(iommu->reg, tlb_offset, val_iva);
399 dmar_writeq(iommu->reg, tlb_offset + 8, val);
401 /* Make sure the hardware has completed it */
402 start_time = NOW();
403 for ( ; ; )
404 {
405 val = dmar_readq(iommu->reg, tlb_offset + 8);
406 if ( !(val & DMA_TLB_IVT) )
407 break;
408 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
409 panic("%s: DMAR hardware is malfunctioning,"
410 " please disable IOMMU\n", __func__);
411 cpu_relax();
412 }
413 spin_unlock_irqrestore(&iommu->register_lock, flag);
415 /* check IOTLB invalidation granularity */
416 if ( DMA_TLB_IAIG(val) == 0 )
417 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
419 /* flushing an IOTLB entry implicitly flushes the write buffer */
420 return 0;
421 }
423 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
424 int flush_non_present_entry, int flush_dev_iotlb)
425 {
426 struct iommu_flush *flush = iommu_get_flush(iommu);
427 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
428 flush_non_present_entry, flush_dev_iotlb);
429 }
431 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
432 int flush_non_present_entry, int flush_dev_iotlb)
433 {
434 struct iommu_flush *flush = iommu_get_flush(iommu);
435 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
436 flush_non_present_entry, flush_dev_iotlb);
437 }
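/*
 * get_alignment() returns the order of the smallest naturally aligned
 * power-of-two region containing [base, base + size).  For example,
 * base = 0x1234 and size = 2 give end = 0x1235; base and end become equal
 * after one right shift, so the result is 1 (a two-page region).
 * iommu_flush_iotlb_psi() uses this as the address mask for PSI flushes.
 */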
439 static int inline get_alignment(u64 base, unsigned int size)
440 {
441 int t = 0;
442 u64 end;
444 end = base + size - 1;
445 while ( base != end )
446 {
447 t++;
448 base >>= 1;
449 end >>= 1;
450 }
451 return t;
452 }
454 static int inline iommu_flush_iotlb_psi(
455 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
456 int flush_non_present_entry, int flush_dev_iotlb)
457 {
458 unsigned int align;
459 struct iommu_flush *flush = iommu_get_flush(iommu);
461 ASSERT(!(addr & (~PAGE_MASK_4K)));
462 ASSERT(pages > 0);
464 /* Fall back to a domain-selective flush if there is no PSI support */
465 if ( !cap_pgsel_inv(iommu->cap) )
466 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
468 /*
469 * PSI requires the number of pages to be a power of two (2^x) and the
470 * base address to be naturally aligned to that size
471 */
472 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
473 /* Fall back to a domain-selective flush if the size is too big */
474 if ( align > cap_max_amask_val(iommu->cap) )
475 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
477 addr >>= PAGE_SHIFT_4K + align;
478 addr <<= PAGE_SHIFT_4K + align;
480 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
481 flush_non_present_entry, flush_dev_iotlb);
482 }
484 void iommu_flush_all(void)
485 {
486 struct acpi_drhd_unit *drhd;
487 struct iommu *iommu;
488 int flush_dev_iotlb;
490 flush_all_cache();
491 for_each_drhd_unit ( drhd )
492 {
493 iommu = drhd->iommu;
494 iommu_flush_context_global(iommu, 0);
495 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
496 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
497 }
498 }
500 /* clear the last-level page-table entry for one page */
501 static void dma_pte_clear_one(struct domain *domain, u64 addr)
502 {
503 struct hvm_iommu *hd = domain_hvm_iommu(domain);
504 struct acpi_drhd_unit *drhd;
505 struct iommu *iommu;
506 struct dma_pte *page = NULL, *pte = NULL;
507 u64 pg_maddr;
508 int flush_dev_iotlb;
510 spin_lock(&hd->mapping_lock);
511 /* get last level pte */
512 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
513 if ( pg_maddr == 0 )
514 {
515 spin_unlock(&hd->mapping_lock);
516 return;
517 }
519 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
520 pte = page + address_level_offset(addr, 1);
522 if ( !dma_pte_present(*pte) )
523 {
524 spin_unlock(&hd->mapping_lock);
525 unmap_vtd_domain_page(page);
526 return;
527 }
529 dma_clear_pte(*pte);
530 spin_unlock(&hd->mapping_lock);
531 iommu_flush_cache_entry(pte);
533 /* No need for pcidevs_lock here; it is taken on device assign/deassign. */
534 for_each_drhd_unit ( drhd )
535 {
536 iommu = drhd->iommu;
537 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
538 {
539 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
540 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
541 addr, 1, 0, flush_dev_iotlb) )
542 iommu_flush_write_buffer(iommu);
543 }
544 }
546 unmap_vtd_domain_page(page);
547 }
549 static void iommu_free_pagetable(u64 pt_maddr, int level)
550 {
551 int i;
552 struct dma_pte *pt_vaddr, *pte;
553 int next_level = level - 1;
555 if ( pt_maddr == 0 )
556 return;
558 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
560 for ( i = 0; i < PTE_NUM; i++ )
561 {
562 pte = &pt_vaddr[i];
563 if ( !dma_pte_present(*pte) )
564 continue;
566 if ( next_level >= 1 )
567 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
569 dma_clear_pte(*pte);
570 iommu_flush_cache_entry(pte);
571 }
573 unmap_vtd_domain_page(pt_vaddr);
574 free_pgtable_maddr(pt_maddr);
575 }
577 static int iommu_set_root_entry(struct iommu *iommu)
578 {
579 u32 cmd, sts;
580 unsigned long flags;
581 s_time_t start_time;
583 spin_lock(&iommu->lock);
585 if ( iommu->root_maddr == 0 )
586 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
587 if ( iommu->root_maddr == 0 )
588 {
589 spin_unlock(&iommu->lock);
590 return -ENOMEM;
591 }
593 spin_unlock(&iommu->lock);
594 spin_lock_irqsave(&iommu->register_lock, flags);
595 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
596 cmd = iommu->gcmd | DMA_GCMD_SRTP;
597 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
599 /* Make sure the hardware has completed it */
600 start_time = NOW();
601 for ( ; ; )
602 {
603 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
604 if ( sts & DMA_GSTS_RTPS )
605 break;
606 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
607 panic("%s: DMAR hardware is malfunctioning,"
608 " please disable IOMMU\n", __func__);
609 cpu_relax();
610 }
612 spin_unlock_irqrestore(&iommu->register_lock, flags);
614 return 0;
615 }
617 static void iommu_enable_translation(struct iommu *iommu)
618 {
619 u32 sts;
620 unsigned long flags;
621 s_time_t start_time;
623 dprintk(XENLOG_INFO VTDPREFIX,
624 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
625 spin_lock_irqsave(&iommu->register_lock, flags);
626 iommu->gcmd |= DMA_GCMD_TE;
627 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
628 /* Make sure the hardware has completed it */
629 start_time = NOW();
630 for ( ; ; )
631 {
632 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
633 if ( sts & DMA_GSTS_TES )
634 break;
635 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
636 panic("%s: DMAR hardware is malfunctioning,"
637 " please disable IOMMU\n", __func__);
638 cpu_relax();
639 }
641 /* Disable PMRs when VT-d engine takes effect per spec definition */
642 disable_pmr(iommu);
643 spin_unlock_irqrestore(&iommu->register_lock, flags);
644 }
646 static void iommu_disable_translation(struct iommu *iommu)
647 {
648 u32 sts;
649 unsigned long flags;
650 s_time_t start_time;
652 spin_lock_irqsave(&iommu->register_lock, flags);
653 iommu->gcmd &= ~ DMA_GCMD_TE;
654 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
656 /* Make sure the hardware has completed it */
657 start_time = NOW();
658 for ( ; ; )
659 {
660 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
661 if ( !(sts & DMA_GSTS_TES) )
662 break;
663 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
664 panic("%s: DMAR hardware is malfunctioning,"
665 " please disable IOMMU\n", __func__);
666 cpu_relax();
667 }
668 spin_unlock_irqrestore(&iommu->register_lock, flags);
669 }
671 static struct iommu *vector_to_iommu[NR_VECTORS];
672 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
673 u8 fault_reason, u16 source_id, u64 addr)
674 {
675 dprintk(XENLOG_WARNING VTDPREFIX,
676 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
677 "iommu->reg = %p\n",
678 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
679 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
680 fault_reason, iommu->reg);
682 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
683 if ( fault_reason < 0x20 )
684 print_vtd_entries(iommu, (source_id >> 8),
685 (source_id & 0xff), (addr >> PAGE_SHIFT));
686 #endif
688 return 0;
689 }
691 static void iommu_fault_status(u32 fault_status)
692 {
693 if ( fault_status & DMA_FSTS_PFO )
694 dprintk(XENLOG_ERR VTDPREFIX,
695 "iommu_fault_status: Fault Overflow\n");
696 if ( fault_status & DMA_FSTS_PPF )
697 dprintk(XENLOG_ERR VTDPREFIX,
698 "iommu_fault_status: Primary Pending Fault\n");
699 if ( fault_status & DMA_FSTS_AFO )
700 dprintk(XENLOG_ERR VTDPREFIX,
701 "iommu_fault_status: Advanced Fault Overflow\n");
702 if ( fault_status & DMA_FSTS_APF )
703 dprintk(XENLOG_ERR VTDPREFIX,
704 "iommu_fault_status: Advanced Pending Fault\n");
705 if ( fault_status & DMA_FSTS_IQE )
706 dprintk(XENLOG_ERR VTDPREFIX,
707 "iommu_fault_status: Invalidation Queue Error\n");
708 if ( fault_status & DMA_FSTS_ICE )
709 dprintk(XENLOG_ERR VTDPREFIX,
710 "iommu_fault_status: Invalidation Completion Error\n");
711 if ( fault_status & DMA_FSTS_ITE )
712 dprintk(XENLOG_ERR VTDPREFIX,
713 "iommu_fault_status: Invalidation Time-out Error\n");
714 }
716 #define PRIMARY_FAULT_REG_LEN (16)
717 static void iommu_page_fault(int vector, void *dev_id,
718 struct cpu_user_regs *regs)
719 {
720 struct iommu *iommu = dev_id;
721 int reg, fault_index;
722 u32 fault_status;
723 unsigned long flags;
725 dprintk(XENLOG_WARNING VTDPREFIX,
726 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
728 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
730 iommu_fault_status(fault_status);
732 /* FIXME: ignore advanced fault log */
733 if ( !(fault_status & DMA_FSTS_PPF) )
734 goto clear_overflow;
736 fault_index = dma_fsts_fault_record_index(fault_status);
737 reg = cap_fault_reg_offset(iommu->cap);
738 while (1)
739 {
740 u8 fault_reason;
741 u16 source_id;
742 u32 data;
743 u64 guest_addr;
744 int type;
746 /* highest 32 bits */
747 spin_lock_irqsave(&iommu->register_lock, flags);
748 data = dmar_readl(iommu->reg, reg +
749 fault_index * PRIMARY_FAULT_REG_LEN + 12);
750 if ( !(data & DMA_FRCD_F) )
751 {
752 spin_unlock_irqrestore(&iommu->register_lock, flags);
753 break;
754 }
756 fault_reason = dma_frcd_fault_reason(data);
757 type = dma_frcd_type(data);
759 data = dmar_readl(iommu->reg, reg +
760 fault_index * PRIMARY_FAULT_REG_LEN + 8);
761 source_id = dma_frcd_source_id(data);
763 guest_addr = dmar_readq(iommu->reg, reg +
764 fault_index * PRIMARY_FAULT_REG_LEN);
765 guest_addr = dma_frcd_page_addr(guest_addr);
766 /* clear the fault */
767 dmar_writel(iommu->reg, reg +
768 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
769 spin_unlock_irqrestore(&iommu->register_lock, flags);
771 iommu_page_fault_do_one(iommu, type, fault_reason,
772 source_id, guest_addr);
774 fault_index++;
775 if ( fault_index > cap_num_fault_regs(iommu->cap) )
776 fault_index = 0;
777 }
778 clear_overflow:
779 /* clear primary fault overflow */
780 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
781 if ( fault_status & DMA_FSTS_PFO )
782 {
783 spin_lock_irqsave(&iommu->register_lock, flags);
784 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
785 spin_unlock_irqrestore(&iommu->register_lock, flags);
786 }
787 }
789 static void dma_msi_unmask(unsigned int vector)
790 {
791 struct iommu *iommu = vector_to_iommu[vector];
792 unsigned long flags;
794 /* unmask it */
795 spin_lock_irqsave(&iommu->register_lock, flags);
796 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
797 spin_unlock_irqrestore(&iommu->register_lock, flags);
798 }
800 static void dma_msi_mask(unsigned int vector)
801 {
802 unsigned long flags;
803 struct iommu *iommu = vector_to_iommu[vector];
805 /* mask it */
806 spin_lock_irqsave(&iommu->register_lock, flags);
807 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
808 spin_unlock_irqrestore(&iommu->register_lock, flags);
809 }
811 static unsigned int dma_msi_startup(unsigned int vector)
812 {
813 dma_msi_unmask(vector);
814 return 0;
815 }
817 static void dma_msi_end(unsigned int vector)
818 {
819 dma_msi_unmask(vector);
820 ack_APIC_irq();
821 }
823 static void dma_msi_data_init(struct iommu *iommu, int vector)
824 {
825 u32 msi_data = 0;
826 unsigned long flags;
828 /* Fixed, edge, assert mode. Follow MSI setting */
829 msi_data |= vector & 0xff;
830 msi_data |= 1 << 14;
832 spin_lock_irqsave(&iommu->register_lock, flags);
833 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
834 spin_unlock_irqrestore(&iommu->register_lock, flags);
835 }
837 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
838 {
839 u64 msi_address;
840 unsigned long flags;
842 /* Physical, dedicated cpu. Follow MSI setting */
843 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
844 msi_address |= MSI_PHYSICAL_MODE << 2;
845 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
846 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
848 spin_lock_irqsave(&iommu->register_lock, flags);
849 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
850 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
851 spin_unlock_irqrestore(&iommu->register_lock, flags);
852 }
854 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
855 {
856 struct iommu *iommu = vector_to_iommu[vector];
857 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
858 }
860 static struct hw_interrupt_type dma_msi_type = {
861 .typename = "DMA_MSI",
862 .startup = dma_msi_startup,
863 .shutdown = dma_msi_mask,
864 .enable = dma_msi_unmask,
865 .disable = dma_msi_mask,
866 .ack = dma_msi_mask,
867 .end = dma_msi_end,
868 .set_affinity = dma_msi_set_affinity,
869 };
871 static int iommu_set_interrupt(struct iommu *iommu)
872 {
873 int vector, ret;
875 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
876 if ( vector <= 0 )
877 {
878 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
879 return -EINVAL;
880 }
882 irq_desc[vector].handler = &dma_msi_type;
883 vector_to_iommu[vector] = iommu;
884 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
885 if ( ret )
886 {
887 irq_desc[vector].handler = &no_irq_type;
888 vector_to_iommu[vector] = NULL;
889 free_irq_vector(vector);
890 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
891 return ret;
892 }
894 /* Make sure that vector is never re-used. */
895 vector_irq[vector] = NEVER_ASSIGN_IRQ;
897 return vector;
898 }
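/*
 * iommu_alloc() maps the DRHD's register block, reads CAP/ECAP and derives
 * the number of page-table levels from the SAGAW field, picking the highest
 * supported AGAW no larger than the 4-level one.  It also records in
 * iommus_incoherent whether this unit lacks coherent page-walk support.
 */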
900 static int iommu_alloc(struct acpi_drhd_unit *drhd)
901 {
902 struct iommu *iommu;
903 unsigned long sagaw;
904 int agaw;
906 if ( nr_iommus > MAX_IOMMUS )
907 {
908 gdprintk(XENLOG_ERR VTDPREFIX,
909 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
910 return -ENOMEM;
911 }
913 iommu = xmalloc(struct iommu);
914 if ( iommu == NULL )
915 return -ENOMEM;
916 memset(iommu, 0, sizeof(struct iommu));
918 iommu->vector = -1; /* No vector assigned yet. */
920 iommu->intel = alloc_intel_iommu();
921 if ( iommu->intel == NULL )
922 {
923 xfree(iommu);
924 return -ENOMEM;
925 }
927 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
928 iommu->index = nr_iommus++;
930 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
931 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
933 gdprintk(XENLOG_INFO VTDPREFIX,
934 "drhd->address = %"PRIx64"\n", drhd->address);
935 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
937 /* Calculate number of pagetable levels: between 2 and 4. */
938 sagaw = cap_sagaw(iommu->cap);
939 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
940 if ( test_bit(agaw, &sagaw) )
941 break;
942 if ( agaw < 0 )
943 {
944 gdprintk(XENLOG_ERR VTDPREFIX,
945 "IOMMU: unsupported sagaw %lx\n", sagaw);
946 xfree(iommu);
947 return -ENODEV;
948 }
949 iommu->nr_pt_levels = agaw_to_level(agaw);
951 if ( !ecap_coherent(iommu->ecap) )
952 iommus_incoherent = 1;
954 spin_lock_init(&iommu->lock);
955 spin_lock_init(&iommu->register_lock);
957 drhd->iommu = iommu;
958 return 0;
959 }
961 static void iommu_free(struct acpi_drhd_unit *drhd)
962 {
963 struct iommu *iommu = drhd->iommu;
965 if ( iommu == NULL )
966 return;
968 if ( iommu->root_maddr != 0 )
969 {
970 free_pgtable_maddr(iommu->root_maddr);
971 iommu->root_maddr = 0;
972 }
974 if ( iommu->reg )
975 iounmap(iommu->reg);
977 free_intel_iommu(iommu->intel);
978 release_irq_vector(iommu->vector);
979 xfree(iommu);
981 drhd->iommu = NULL;
982 }
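/*
 * Round a guest address width up to the next adjusted guest address width
 * supported by VT-d (12 + 9*n bits), capped at 64.  For example, gaw = 39
 * satisfies (39 - 12) % 9 == 0 and stays 39, while gaw = 40 is rounded up
 * to 48.
 */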
984 #define guestwidth_to_adjustwidth(gaw) ({ \
985 int agaw, r = (gaw - 12) % 9; \
986 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
987 if ( agaw > 64 ) \
988 agaw = 64; \
989 agaw; })
991 static int intel_iommu_domain_init(struct domain *d)
992 {
993 struct hvm_iommu *hd = domain_hvm_iommu(d);
994 struct iommu *iommu = NULL;
995 struct acpi_drhd_unit *drhd;
997 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
998 iommu = drhd->iommu;
1000 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1002 if ( d->domain_id == 0 )
1004 /* Set up 1:1 page table for dom0 */
1005 iommu_set_dom0_mapping(d);
1007 setup_dom0_devices(d);
1008 setup_dom0_rmrr(d);
1010 iommu_flush_all();
1012 for_each_drhd_unit ( drhd )
1014 iommu = drhd->iommu;
1015 iommu_enable_translation(iommu);
1019 return 0;
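/*
 * Install a context entry for (bus, devfn) on this IOMMU.  Dom0 with
 * iommu_passthrough gets CONTEXT_TT_PASS_THRU; otherwise the entry points
 * at the domain's I/O page tables (skipping top levels on 2- and 3-level
 * hardware) and uses CONTEXT_TT_DEV_IOTLB when ATS is enabled and the unit
 * advertises Device-IOTLB support, or CONTEXT_TT_MULTI_LEVEL otherwise.
 * The entry is then flushed from the context cache and the IOTLB.
 */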
1022 static int domain_context_mapping_one(
1023 struct domain *domain,
1024 struct iommu *iommu,
1025 u8 bus, u8 devfn)
1027 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1028 struct context_entry *context, *context_entries;
1029 u64 maddr, pgd_maddr;
1030 struct pci_dev *pdev = NULL;
1031 int agaw;
1033 ASSERT(spin_is_locked(&pcidevs_lock));
1034 spin_lock(&iommu->lock);
1035 maddr = bus_to_context_maddr(iommu, bus);
1036 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1037 context = &context_entries[devfn];
1039 if ( context_present(*context) )
1041 int res = 0;
1043 pdev = pci_get_pdev(bus, devfn);
1044 if (!pdev)
1045 res = -ENODEV;
1046 else if (pdev->domain != domain)
1047 res = -EINVAL;
1048 unmap_vtd_domain_page(context_entries);
1049 spin_unlock(&iommu->lock);
1050 return res;
1053 if ( iommu_passthrough && (domain->domain_id == 0) )
1055 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1056 agaw = level_to_agaw(iommu->nr_pt_levels);
1058 else
1060 spin_lock(&hd->mapping_lock);
1062 /* Ensure we have pagetables allocated down to leaf PTE. */
1063 if ( hd->pgd_maddr == 0 )
1065 addr_to_dma_page_maddr(domain, 0, 1);
1066 if ( hd->pgd_maddr == 0 )
1068 nomem:
1069 spin_unlock(&hd->mapping_lock);
1070 spin_unlock(&iommu->lock);
1071 unmap_vtd_domain_page(context_entries);
1072 return -ENOMEM;
1076 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1077 pgd_maddr = hd->pgd_maddr;
1078 for ( agaw = level_to_agaw(4);
1079 agaw != level_to_agaw(iommu->nr_pt_levels);
1080 agaw-- )
1082 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1083 pgd_maddr = dma_pte_addr(*p);
1084 unmap_vtd_domain_page(p);
1085 if ( pgd_maddr == 0 )
1086 goto nomem;
1089 context_set_address_root(*context, pgd_maddr);
1090 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1091 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1092 else
1093 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1095 spin_unlock(&hd->mapping_lock);
1098 /*
1099 * domain id 0 is not valid on Intel IOMMUs, so IOMMU domain ids are
1100 * kept 1-based, as required by the hardware.
1101 */
1102 context_set_domain_id(context, domain);
1103 context_set_address_width(*context, agaw);
1104 context_set_fault_enable(*context);
1105 context_set_present(*context);
1106 iommu_flush_cache_entry(context);
1107 spin_unlock(&iommu->lock);
1109 /* Context entry was previously non-present (with domid 0). */
1110 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1111 DMA_CCMD_MASK_NOBIT, 1) )
1112 iommu_flush_write_buffer(iommu);
1113 else
1115 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1116 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1119 set_bit(iommu->index, &hd->iommu_bitmap);
1121 unmap_vtd_domain_page(context_entries);
1123 return 0;
1126 #define PCI_BASE_CLASS_BRIDGE 0x06
1127 #define PCI_CLASS_BRIDGE_PCI 0x0604
1129 enum {
1130 DEV_TYPE_PCIe_ENDPOINT,
1131 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1132 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1133 DEV_TYPE_PCI,
1134 };
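/*
 * Classify a device for context-mapping purposes.  A PCI-to-PCI bridge
 * class device without a PCI Express capability is a legacy PCI bridge;
 * with the capability, PCI_EXP_FLAGS_TYPE distinguishes PCIe-to-PCI(-X)
 * bridges from PCIe root ports and switches.  A non-bridge device with the
 * capability is a PCIe endpoint; everything else is treated as PCI.
 */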
1136 int pdev_type(u8 bus, u8 devfn)
1138 u16 class_device;
1139 u16 status, creg;
1140 int pos;
1141 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1143 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1144 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1146 pos = pci_find_next_cap(bus, devfn,
1147 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1148 if ( !pos )
1149 return DEV_TYPE_PCI_BRIDGE;
1150 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1151 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1152 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1155 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1156 if ( !(status & PCI_STATUS_CAP_LIST) )
1157 return DEV_TYPE_PCI;
1159 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1160 return DEV_TYPE_PCIe_ENDPOINT;
1162 return DEV_TYPE_PCI;
1165 #define MAX_BUSES 256
1166 static DEFINE_SPINLOCK(bus2bridge_lock);
1167 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
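/*
 * bus2bridge[] records, for each secondary bus, the (bus, devfn) of the
 * bridge above it; it is filled in by domain_context_mapping() for PCI
 * bridges.  Walking it upwards from a device's bus yields the requester id
 * the IOMMU actually sees for legacy PCI devices behind a bridge.
 */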
1169 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1171 int cnt = 0;
1172 *secbus = *bus;
1174 ASSERT(spin_is_locked(&bus2bridge_lock));
1175 if ( !bus2bridge[*bus].map )
1176 return 0;
1178 while ( bus2bridge[*bus].map )
1180 *secbus = *bus;
1181 *devfn = bus2bridge[*bus].devfn;
1182 *bus = bus2bridge[*bus].bus;
1183 if ( cnt++ >= MAX_BUSES )
1184 return 0;
1187 return 1;
1190 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1192 int ret = 0;
1194 if ( *bus == 0 )
1195 /* assume integrated PCI devices in RC have valid requester-id */
1196 return 1;
1198 spin_lock(&bus2bridge_lock);
1199 ret = _find_pcie_endpoint(bus, devfn, secbus);
1200 spin_unlock(&bus2bridge_lock);
1202 return ret;
1205 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1207 struct acpi_drhd_unit *drhd;
1208 int ret = 0;
1209 u16 sec_bus, sub_bus;
1210 u32 type;
1211 u8 secbus, secdevfn;
1212 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1214 if ( pdev == NULL )
1216 /* We can reach here via setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1217 * -> domain_context_mapping().
1218 * If the user enables VT-d but disables USB (which usually needs an
1219 * RMRR) in the BIOS, we cannot discover the BDF of the USB controller
1220 * in setup_dom0_devices(), yet the ACPI RMRR structures may still
1221 * contain that BDF, so pci_get_pdev() returns NULL here.
1222 */
1223 gdprintk(XENLOG_WARNING VTDPREFIX,
1224 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1225 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1226 return 0;
1229 drhd = acpi_find_matched_drhd_unit(pdev);
1230 if ( !drhd )
1231 return -ENODEV;
1233 ASSERT(spin_is_locked(&pcidevs_lock));
1235 type = pdev_type(bus, devfn);
1236 switch ( type )
1238 case DEV_TYPE_PCIe_BRIDGE:
1239 break;
1241 case DEV_TYPE_PCI_BRIDGE:
1242 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1243 PCI_SECONDARY_BUS);
1244 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1245 PCI_SUBORDINATE_BUS);
1247 spin_lock(&bus2bridge_lock);
1248 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1250 bus2bridge[sec_bus].map = 1;
1251 bus2bridge[sec_bus].bus = bus;
1252 bus2bridge[sec_bus].devfn = devfn;
1254 spin_unlock(&bus2bridge_lock);
1255 break;
1257 case DEV_TYPE_PCIe_ENDPOINT:
1258 gdprintk(XENLOG_INFO VTDPREFIX,
1259 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1260 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1261 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1262 break;
1264 case DEV_TYPE_PCI:
1265 gdprintk(XENLOG_INFO VTDPREFIX,
1266 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1267 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1269 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1270 if ( ret )
1271 break;
1273 secbus = bus;
1274 secdevfn = devfn;
1275 /* dependent devices mapping */
1276 while ( bus2bridge[bus].map )
1278 secbus = bus;
1279 secdevfn = devfn;
1280 devfn = bus2bridge[bus].devfn;
1281 bus = bus2bridge[bus].bus;
1282 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1283 if ( ret )
1284 return ret;
1287 if ( (secbus != bus) && (secdevfn != 0) )
1288 /*
1289 * The source-id for transactions on non-PCIe buses seems
1290 * to originate from devfn=0 on the secondary bus behind
1291 * the bridge. Map that id as well. The id to use in
1292 * these scenarios is not particularly well documented
1293 * anywhere.
1294 */
1295 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1296 break;
1298 default:
1299 gdprintk(XENLOG_ERR VTDPREFIX,
1300 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1301 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1302 ret = -EINVAL;
1303 break;
1306 return ret;
1309 static int domain_context_unmap_one(
1310 struct domain *domain,
1311 struct iommu *iommu,
1312 u8 bus, u8 devfn)
1314 struct context_entry *context, *context_entries;
1315 u64 maddr;
1317 ASSERT(spin_is_locked(&pcidevs_lock));
1318 spin_lock(&iommu->lock);
1320 maddr = bus_to_context_maddr(iommu, bus);
1321 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1322 context = &context_entries[devfn];
1324 if ( !context_present(*context) )
1326 spin_unlock(&iommu->lock);
1327 unmap_vtd_domain_page(context_entries);
1328 return 0;
1331 context_clear_present(*context);
1332 context_clear_entry(*context);
1333 iommu_flush_cache_entry(context);
1335 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1336 (((u16)bus) << 8) | devfn,
1337 DMA_CCMD_MASK_NOBIT, 0) )
1338 iommu_flush_write_buffer(iommu);
1339 else
1341 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1342 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1345 spin_unlock(&iommu->lock);
1346 unmap_vtd_domain_page(context_entries);
1348 return 0;
1351 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1353 struct acpi_drhd_unit *drhd;
1354 int ret = 0;
1355 u32 type;
1356 u8 secbus, secdevfn;
1357 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1359 BUG_ON(!pdev);
1361 drhd = acpi_find_matched_drhd_unit(pdev);
1362 if ( !drhd )
1363 return -ENODEV;
1365 type = pdev_type(bus, devfn);
1366 switch ( type )
1368 case DEV_TYPE_PCIe_BRIDGE:
1369 case DEV_TYPE_PCI_BRIDGE:
1370 break;
1372 case DEV_TYPE_PCIe_ENDPOINT:
1373 gdprintk(XENLOG_INFO VTDPREFIX,
1374 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1375 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1376 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1377 break;
1379 case DEV_TYPE_PCI:
1380 gdprintk(XENLOG_INFO VTDPREFIX,
1381 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1382 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1383 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1384 if ( ret )
1385 break;
1387 secbus = bus;
1388 secdevfn = devfn;
1389 /* dependent devices unmapping */
1390 while ( bus2bridge[bus].map )
1392 secbus = bus;
1393 secdevfn = devfn;
1394 devfn = bus2bridge[bus].devfn;
1395 bus = bus2bridge[bus].bus;
1396 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1397 if ( ret )
1398 return ret;
1401 if ( (secbus != bus) && (secdevfn != 0) )
1402 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1403 break;
1405 default:
1406 gdprintk(XENLOG_ERR VTDPREFIX,
1407 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1408 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1409 ret = -EINVAL;
1410 break;
1413 return ret;
1416 static int reassign_device_ownership(
1417 struct domain *source,
1418 struct domain *target,
1419 u8 bus, u8 devfn)
1421 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1422 struct pci_dev *pdev;
1423 struct acpi_drhd_unit *drhd;
1424 struct iommu *pdev_iommu;
1425 int ret, found = 0;
1427 ASSERT(spin_is_locked(&pcidevs_lock));
1428 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1430 if (!pdev)
1431 return -ENODEV;
1433 drhd = acpi_find_matched_drhd_unit(pdev);
1434 pdev_iommu = drhd->iommu;
1435 domain_context_unmap(source, bus, devfn);
1437 ret = domain_context_mapping(target, bus, devfn);
1438 if ( ret )
1439 return ret;
1441 list_move(&pdev->domain_list, &target->arch.pdev_list);
1442 pdev->domain = target;
1444 for_each_pdev ( source, pdev )
1446 drhd = acpi_find_matched_drhd_unit(pdev);
1447 if ( drhd->iommu == pdev_iommu )
1449 found = 1;
1450 break;
1454 if ( !found )
1455 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1457 return ret;
1460 void iommu_domain_teardown(struct domain *d)
1462 struct hvm_iommu *hd = domain_hvm_iommu(d);
1464 if ( list_empty(&acpi_drhd_units) )
1465 return;
1467 spin_lock(&hd->mapping_lock);
1468 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1469 hd->pgd_maddr = 0;
1470 spin_unlock(&hd->mapping_lock);
1472 iommu_domid_release(d);
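/*
 * intel_iommu_map_page() writes the leaf PTE (read/write, plus SNP when
 * Snoop Control is in use) and then issues a page-selective IOTLB flush on
 * every IOMMU whose bit is set in the domain's iommu_bitmap.  The
 * flush_non_present_entry argument is set only when the PTE was previously
 * non-present, matching the caching-mode handling in flush_iotlb_reg().
 */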
1475 int intel_iommu_map_page(
1476 struct domain *d, unsigned long gfn, unsigned long mfn)
1478 struct hvm_iommu *hd = domain_hvm_iommu(d);
1479 struct acpi_drhd_unit *drhd;
1480 struct iommu *iommu;
1481 struct dma_pte *page = NULL, *pte = NULL;
1482 u64 pg_maddr;
1483 int pte_present;
1484 int flush_dev_iotlb;
1486 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1487 iommu = drhd->iommu;
1489 /* do nothing for dom0 if the IOMMU supports pass-through */
1490 if ( iommu_passthrough && (d->domain_id == 0) )
1491 return 0;
1493 spin_lock(&hd->mapping_lock);
1495 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1496 if ( pg_maddr == 0 )
1498 spin_unlock(&hd->mapping_lock);
1499 return -ENOMEM;
1501 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1502 pte = page + (gfn & LEVEL_MASK);
1503 pte_present = dma_pte_present(*pte);
1504 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1505 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1507 /* Set the SNP on leaf page table if Snoop Control available */
1508 if ( iommu_snoop )
1509 dma_set_pte_snp(*pte);
1511 iommu_flush_cache_entry(pte);
1512 spin_unlock(&hd->mapping_lock);
1513 unmap_vtd_domain_page(page);
1515 /*
1516 * No need for pcidevs_lock here because we flush
1517 * when assigning/deassigning a device
1518 */
1519 for_each_drhd_unit ( drhd )
1521 iommu = drhd->iommu;
1523 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1524 continue;
1526 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1527 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1528 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1529 !pte_present, flush_dev_iotlb) )
1530 iommu_flush_write_buffer(iommu);
1533 return 0;
1536 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1538 struct acpi_drhd_unit *drhd;
1539 struct iommu *iommu;
1541 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1542 iommu = drhd->iommu;
1544 /* do nothing for dom0 if the IOMMU supports pass-through */
1545 if ( iommu_passthrough && (d->domain_id == 0) )
1546 return 0;
1548 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1550 return 0;
1553 static int iommu_prepare_rmrr_dev(struct domain *d,
1554 struct acpi_rmrr_unit *rmrr,
1555 u8 bus, u8 devfn)
1557 int ret = 0;
1558 u64 base, end;
1559 unsigned long base_pfn, end_pfn;
1561 ASSERT(spin_is_locked(&pcidevs_lock));
1562 ASSERT(rmrr->base_address < rmrr->end_address);
1564 base = rmrr->base_address & PAGE_MASK_4K;
1565 base_pfn = base >> PAGE_SHIFT_4K;
1566 end = PAGE_ALIGN_4K(rmrr->end_address);
1567 end_pfn = end >> PAGE_SHIFT_4K;
1569 while ( base_pfn < end_pfn )
1571 intel_iommu_map_page(d, base_pfn, base_pfn);
1572 base_pfn++;
1575 ret = domain_context_mapping(d, bus, devfn);
1577 return ret;
1580 static int intel_iommu_add_device(struct pci_dev *pdev)
1582 struct acpi_rmrr_unit *rmrr;
1583 u16 bdf;
1584 int ret, i;
1586 ASSERT(spin_is_locked(&pcidevs_lock));
1588 if ( !pdev->domain )
1589 return -EINVAL;
1591 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1592 if ( ret )
1594 gdprintk(XENLOG_ERR VTDPREFIX,
1595 "intel_iommu_add_device: context mapping failed\n");
1596 return ret;
1599 for_each_rmrr_device ( rmrr, bdf, i )
1601 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1603 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1604 pdev->bus, pdev->devfn);
1605 if ( ret )
1606 gdprintk(XENLOG_ERR VTDPREFIX,
1607 "intel_iommu_add_device: RMRR mapping failed\n");
1608 break;
1612 return ret;
1615 static int intel_iommu_remove_device(struct pci_dev *pdev)
1617 struct acpi_rmrr_unit *rmrr;
1618 u16 bdf;
1619 int i;
1621 if ( !pdev->domain )
1622 return -EINVAL;
1624 /* If the device belongs to dom0 and has an RMRR, don't remove it
1625 * from dom0, because the BIOS may use the RMRR at boot time.
1626 */
1627 if ( pdev->domain->domain_id == 0 )
1629 for_each_rmrr_device ( rmrr, bdf, i )
1631 if ( PCI_BUS(bdf) == pdev->bus &&
1632 PCI_DEVFN2(bdf) == pdev->devfn )
1633 return 0;
1637 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1640 static void setup_dom0_devices(struct domain *d)
1642 struct hvm_iommu *hd;
1643 struct pci_dev *pdev;
1644 int bus, dev, func;
1645 u32 l;
1647 hd = domain_hvm_iommu(d);
1649 spin_lock(&pcidevs_lock);
1650 for ( bus = 0; bus < 256; bus++ )
1652 for ( dev = 0; dev < 32; dev++ )
1654 for ( func = 0; func < 8; func++ )
1656 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1657 /* some broken boards return 0 or ~0 if a slot is empty: */
1658 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1659 (l == 0x0000ffff) || (l == 0xffff0000) )
1660 continue;
1662 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1663 pdev->domain = d;
1664 list_add(&pdev->domain_list, &d->arch.pdev_list);
1665 domain_context_mapping(d, pdev->bus, pdev->devfn);
1666 #if defined(NOT_YET)
1667 if ( ats_device(0, pdev->bus, pdev->devfn) )
1668 enable_ats_device(0, pdev->bus, pdev->devfn);
1669 #endif
1673 spin_unlock(&pcidevs_lock);
1676 void clear_fault_bits(struct iommu *iommu)
1678 u64 val;
1680 val = dmar_readq(
1681 iommu->reg,
1682 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1683 dmar_writeq(
1684 iommu->reg,
1685 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1686 val);
1687 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
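/*
 * init_vtd_hw() programs each unit in turn: root table pointer, the
 * fault-reporting MSI (allocated once and reused across S3 resume), fault
 * status cleanup, and the register-based flush hooks.  Queued Invalidation
 * and Interrupt Remapping, where configured, are then enabled in separate
 * passes.
 */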
1690 static int init_vtd_hw(void)
1692 struct acpi_drhd_unit *drhd;
1693 struct iommu *iommu;
1694 struct iommu_flush *flush = NULL;
1695 int vector;
1696 int ret;
1698 for_each_drhd_unit ( drhd )
1700 iommu = drhd->iommu;
1701 ret = iommu_set_root_entry(iommu);
1702 if ( ret )
1704 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1705 return -EIO;
1708 if ( iommu->vector < 0 )
1710 vector = iommu_set_interrupt(iommu);
1711 if ( vector < 0 )
1713 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1714 return vector;
1716 iommu->vector = vector;
1718 dma_msi_data_init(iommu, iommu->vector);
1719 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1720 clear_fault_bits(iommu);
1721 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1723 /* initialize flush functions */
1724 flush = iommu_get_flush(iommu);
1725 flush->context = flush_context_reg;
1726 flush->iotlb = flush_iotlb_reg;
1729 if ( iommu_qinval )
1731 for_each_drhd_unit ( drhd )
1733 iommu = drhd->iommu;
1734 if ( enable_qinval(iommu) != 0 )
1736 dprintk(XENLOG_INFO VTDPREFIX,
1737 "Failed to enable Queued Invalidation!\n");
1738 break;
1743 if ( iommu_intremap )
1745 for_each_drhd_unit ( drhd )
1747 iommu = drhd->iommu;
1748 if ( enable_intremap(iommu) != 0 )
1750 dprintk(XENLOG_INFO VTDPREFIX,
1751 "Failed to enable Interrupt Remapping!\n");
1752 break;
1757 return 0;
1760 static void setup_dom0_rmrr(struct domain *d)
1762 struct acpi_rmrr_unit *rmrr;
1763 u16 bdf;
1764 int ret, i;
1766 spin_lock(&pcidevs_lock);
1767 for_each_rmrr_device ( rmrr, bdf, i )
1769 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1770 if ( ret )
1771 gdprintk(XENLOG_ERR VTDPREFIX,
1772 "IOMMU: mapping reserved region failed\n");
1774 spin_unlock(&pcidevs_lock);
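/*
 * platform_quirks() reads the PCI id of device 0000:00:00.0; 0x2a408086 is
 * the Mobile 4 Series chipset (device 0x2a40, vendor 0x8086), which needs
 * explicit write-buffer flushing even though it does not advertise the RWBF
 * capability.  Setting rwbf_quirk makes iommu_flush_write_buffer() act even
 * when cap_rwbf() is clear.
 */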
1777 static void platform_quirks(void)
1779 u32 id;
1781 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1782 id = pci_conf_read32(0, 0, 0, 0);
1783 if ( id == 0x2a408086 )
1785 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1786 rwbf_quirk = 1;
1790 int intel_vtd_setup(void)
1792 struct acpi_drhd_unit *drhd;
1793 struct iommu *iommu;
1795 if ( !vtd_enabled )
1796 return -ENODEV;
1798 platform_quirks();
1800 spin_lock_init(&domid_bitmap_lock);
1801 clflush_size = get_cache_line_size();
1803 /* We enable the following features only if they are supported by all VT-d
1804 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1805 * Interrupt Remapping.
1806 */
1807 for_each_drhd_unit ( drhd )
1809 if ( iommu_alloc(drhd) != 0 )
1810 goto error;
1812 iommu = drhd->iommu;
1814 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1815 iommu_snoop = 0;
1817 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1818 iommu_passthrough = 0;
1820 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1821 iommu_qinval = 0;
1823 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1824 iommu_intremap = 0;
1827 if ( !iommu_qinval && iommu_intremap )
1829 iommu_intremap = 0;
1830 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1831 "since Queued Invalidation isn't supported or enabled.\n");
1834 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1835 P(iommu_snoop, "Snoop Control");
1836 P(iommu_passthrough, "DMA Passthrough");
1837 P(iommu_qinval, "Queued Invalidation");
1838 P(iommu_intremap, "Interrupt Remapping");
1839 #undef P
1841 /* Allocate IO page directory page for the domain. */
1842 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1843 iommu = drhd->iommu;
1845 /* Allocate domain id bitmap, and set bit 0 as reserved */
1846 domid_bitmap_size = cap_ndoms(iommu->cap);
1847 domid_bitmap = xmalloc_array(unsigned long,
1848 BITS_TO_LONGS(domid_bitmap_size));
1849 if ( domid_bitmap == NULL )
1850 goto error;
1851 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1852 set_bit(0, domid_bitmap);
1854 if ( init_vtd_hw() )
1855 goto error;
1857 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1859 return 0;
1861 error:
1862 for_each_drhd_unit ( drhd )
1863 iommu_free(drhd);
1864 vtd_enabled = 0;
1865 iommu_snoop = 0;
1866 iommu_passthrough = 0;
1867 iommu_qinval = 0;
1868 iommu_intremap = 0;
1869 return -ENOMEM;
1872 /*
1873 * If the device isn't owned by dom0, it has either already been
1874 * assigned to another domain or it does not exist.
1875 */
1876 int device_assigned(u8 bus, u8 devfn)
1878 struct pci_dev *pdev;
1880 spin_lock(&pcidevs_lock);
1881 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1882 if (!pdev)
1884 spin_unlock(&pcidevs_lock);
1885 return -1;
1888 spin_unlock(&pcidevs_lock);
1889 return 0;
1892 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1894 struct acpi_rmrr_unit *rmrr;
1895 int ret = 0, i;
1896 struct pci_dev *pdev;
1897 u16 bdf;
1899 if ( list_empty(&acpi_drhd_units) )
1900 return -ENODEV;
1902 ASSERT(spin_is_locked(&pcidevs_lock));
1903 pdev = pci_get_pdev(bus, devfn);
1904 if (!pdev)
1905 return -ENODEV;
1907 if (pdev->domain != dom0)
1909 gdprintk(XENLOG_ERR VTDPREFIX,
1910 "IOMMU: assign a assigned device\n");
1911 return -EBUSY;
1914 ret = reassign_device_ownership(dom0, d, bus, devfn);
1915 if ( ret )
1916 goto done;
1918 /* Setup rmrr identity mapping */
1919 for_each_rmrr_device( rmrr, bdf, i )
1921 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1923 /* FIXME: Because the USB RMRR conflicts with the guest BIOS region,
1924 * ignore USB RMRRs temporarily.
1925 */
1926 if ( is_usb_device(bus, devfn) )
1928 ret = 0;
1929 goto done;
1932 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1933 if ( ret )
1934 gdprintk(XENLOG_ERR VTDPREFIX,
1935 "IOMMU: mapping reserved region failed\n");
1936 goto done;
1940 done:
1941 return ret;
1944 static int intel_iommu_group_id(u8 bus, u8 devfn)
1946 u8 secbus;
1947 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1948 return PCI_BDF2(bus, devfn);
1949 else
1950 return -1;
1953 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1954 void iommu_suspend(void)
1956 struct acpi_drhd_unit *drhd;
1957 struct iommu *iommu;
1958 u32 i;
1960 if ( !vtd_enabled )
1961 return;
1963 iommu_flush_all();
1965 for_each_drhd_unit ( drhd )
1967 iommu = drhd->iommu;
1968 i = iommu->index;
1970 iommu_state[i][DMAR_FECTL_REG] =
1971 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1972 iommu_state[i][DMAR_FEDATA_REG] =
1973 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1974 iommu_state[i][DMAR_FEADDR_REG] =
1975 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1976 iommu_state[i][DMAR_FEUADDR_REG] =
1977 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1979 iommu_disable_translation(iommu);
1981 if ( iommu_intremap )
1982 disable_intremap(iommu);
1984 if ( iommu_qinval )
1985 disable_qinval(iommu);
1989 void iommu_resume(void)
1991 struct acpi_drhd_unit *drhd;
1992 struct iommu *iommu;
1993 struct iommu_flush *flush;
1994 u32 i;
1996 if ( !vtd_enabled )
1997 return;
1999 /* Re-initialize the register-based flush functions.
2000 * In iommu_flush_all() we invoke iommu_flush_{context,iotlb}_global(),
2001 * but at this point, on hosts that support QI (Queued Invalidation),
2002 * QI hasn't been re-enabled yet, so for now use the register-based
2003 * invalidation method before invoking init_vtd_hw().
2004 */
2005 if ( iommu_qinval )
2007 for_each_drhd_unit ( drhd )
2009 iommu = drhd->iommu;
2010 flush = iommu_get_flush(iommu);
2011 flush->context = flush_context_reg;
2012 flush->iotlb = flush_iotlb_reg;
2016 /* It is not clear whether this flush is required by the IOMMU
2017 * specification. The BIOS also executes during S3 resume and may touch
2018 * the IOMMU again, so do the flush for safety.
2019 */
2020 iommu_flush_all();
2022 if ( init_vtd_hw() != 0 && force_iommu )
2023 panic("IOMMU setup failed, crash Xen for security purpose!\n");
2025 for_each_drhd_unit ( drhd )
2027 iommu = drhd->iommu;
2028 i = iommu->index;
2030 dmar_writel(iommu->reg, DMAR_FECTL_REG,
2031 (u32) iommu_state[i][DMAR_FECTL_REG]);
2032 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
2033 (u32) iommu_state[i][DMAR_FEDATA_REG]);
2034 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
2035 (u32) iommu_state[i][DMAR_FEADDR_REG]);
2036 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
2037 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
2039 iommu_enable_translation(iommu);
2043 struct iommu_ops intel_iommu_ops = {
2044 .init = intel_iommu_domain_init,
2045 .add_device = intel_iommu_add_device,
2046 .remove_device = intel_iommu_remove_device,
2047 .assign_device = intel_iommu_assign_device,
2048 .teardown = iommu_domain_teardown,
2049 .map_page = intel_iommu_map_page,
2050 .unmap_page = intel_iommu_unmap_page,
2051 .reassign_device = reassign_device_ownership,
2052 .get_device_group_id = intel_iommu_group_id,
2053 .update_ire_from_apic = io_apic_write_remap_rte,
2054 .update_ire_from_msi = msi_msg_write_remap_rte,
2055 };
2057 /*
2058 * Local variables:
2059 * mode: C
2060 * c-set-style: "BSD"
2061 * c-basic-offset: 4
2062 * tab-width: 4
2063 * indent-tabs-mode: nil
2064 * End:
2065 */