ia64/xen-unstable

xen/drivers/passthrough/vtd/iommu.c @ 19402:f02a528d2e56

Xen: use proper device ID to search VT-d unit for ARI and SR-IOV device

The PCIe Alternative Routing-ID Interpretation (ARI) ECN defines the Extended
Function: a function whose function number is greater than 7 within an ARI
device. Intel VT-d spec 1.2, section 8.3.2 specifies that an Extended Function
is under the scope of the same remapping unit as the traditional functions of
the device. The hypervisor therefore needs to know whether a function is an
Extended Function so that it can find the proper DMAR unit for it.

Section 8.3.3 likewise specifies that an SR-IOV Virtual Function is under the
scope of the same remapping unit as its Physical Function. For the same
reason, the hypervisor also needs to know whether a function is a Virtual
Function and which Physical Function it is associated with.
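
As a minimal sketch of the lookup rule described above (the struct and field
names below are illustrative assumptions, not the actual Xen data structures),
the device ID used to search the DRHD scope for a function could be derived
like this:

    #include <stdint.h>

    /* Illustrative only: these fields are assumptions made for the sketch. */
    struct example_fn {
        uint8_t bus, devfn;        /* requester ID of the function itself */
        int     is_virtfn;         /* SR-IOV Virtual Function?            */
        int     is_extfn;          /* ARI Extended Function (fn > 7)?     */
        uint8_t pf_bus, pf_devfn;  /* owning PF, valid when is_virtfn     */
    };

    /* Return the bus/devfn to use when searching the VT-d (DMAR) scope. */
    static void example_search_id(const struct example_fn *fn,
                                  uint8_t *bus, uint8_t *devfn)
    {
        if ( fn->is_virtfn )
        {
            /* VT-d spec 1.2, 8.3.3: a VF is covered by its PF's unit. */
            *bus = fn->pf_bus;
            *devfn = fn->pf_devfn;
        }
        else if ( fn->is_extfn )
        {
            /* VT-d spec 1.2, 8.3.2: an Extended Function is covered by the
             * same unit as the device's traditional functions (function 0
             * is used here as a representative traditional function). */
            *bus = fn->bus;
            *devfn = 0;
        }
        else
        {
            *bus = fn->bus;
            *devfn = fn->devfn;
        }
    }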

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Mar 19 10:20:11 2009 +0000 (2009-03-19)
parents 6d65dc14d21b
children abb87a8387ac
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
42 static int domid_bitmap_size; /* domain id bitmap size in bits */
43 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static bool_t rwbf_quirk;
46 static void setup_dom0_devices(struct domain *d);
47 static void setup_dom0_rmrr(struct domain *d);
49 #define DID_FIELD_WIDTH 16
50 #define DID_HIGH_OFFSET 8
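/* Allocate an IOMMU domain id for the domain on first use and store it
 * in the domain-id field of the context entry. */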
51 static void context_set_domain_id(struct context_entry *context,
52 struct domain *d)
53 {
54 domid_t iommu_domid = domain_iommu_domid(d);
56 if ( iommu_domid == 0 )
57 {
58 spin_lock(&domid_bitmap_lock);
59 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
60 set_bit(iommu_domid, domid_bitmap);
61 spin_unlock(&domid_bitmap_lock);
62 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
63 }
65 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
66 context->hi |= iommu_domid << DID_HIGH_OFFSET;
67 }
69 static void iommu_domid_release(struct domain *d)
70 {
71 domid_t iommu_domid = domain_iommu_domid(d);
73 if ( iommu_domid != 0 )
74 {
75 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
76 clear_bit(iommu_domid, domid_bitmap);
77 }
78 }
80 static struct intel_iommu *alloc_intel_iommu(void)
81 {
82 struct intel_iommu *intel;
84 intel = xmalloc(struct intel_iommu);
85 if ( intel == NULL )
86 return NULL;
87 memset(intel, 0, sizeof(struct intel_iommu));
89 spin_lock_init(&intel->qi_ctrl.qinval_lock);
90 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
91 spin_lock_init(&intel->ir_ctrl.iremap_lock);
93 return intel;
94 }
96 static void free_intel_iommu(struct intel_iommu *intel)
97 {
98 xfree(intel);
99 }
101 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
102 {
103 return iommu ? &iommu->intel->qi_ctrl : NULL;
104 }
106 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
107 {
108 return iommu ? &iommu->intel->ir_ctrl : NULL;
109 }
111 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
112 {
113 return iommu ? &iommu->intel->flush : NULL;
114 }
116 static unsigned int clflush_size;
117 static int iommus_incoherent;
118 static void __iommu_flush_cache(void *addr, int size)
119 {
120 int i;
122 if ( !iommus_incoherent )
123 return;
125 for ( i = 0; i < size; i += clflush_size )
126 cacheline_flush((char *)addr + i);
127 }
129 void iommu_flush_cache_entry(void *addr)
130 {
131 __iommu_flush_cache(addr, 8);
132 }
134 void iommu_flush_cache_page(void *addr, unsigned long npages)
135 {
136 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
137 }
139 int nr_iommus;
140 /* context entry handling */
141 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
142 {
143 struct root_entry *root, *root_entries;
144 u64 maddr;
146 ASSERT(spin_is_locked(&iommu->lock));
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL, 1);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 return 0;
156 }
157 set_root_value(*root, maddr);
158 set_root_present(*root);
159 iommu_flush_cache_entry(root);
160 }
161 maddr = (u64) get_context_addr(*root);
162 unmap_vtd_domain_page(root_entries);
163 return maddr;
164 }
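/* Walk (and, when 'alloc' is set, build) the domain's VT-d page table and
 * return the machine address of the level-1 (leaf) page table covering
 * 'addr', or 0 on failure. */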
166 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
167 {
168 struct hvm_iommu *hd = domain_hvm_iommu(domain);
169 int addr_width = agaw_to_width(hd->agaw);
170 struct dma_pte *parent, *pte = NULL;
171 int level = agaw_to_level(hd->agaw);
172 int offset;
173 u64 pte_maddr = 0, maddr;
174 u64 *vaddr = NULL;
176 addr &= (((u64)1) << addr_width) - 1;
177 ASSERT(spin_is_locked(&hd->mapping_lock));
178 if ( hd->pgd_maddr == 0 )
179 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
180 goto out;
182 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
183 while ( level > 1 )
184 {
185 offset = address_level_offset(addr, level);
186 pte = &parent[offset];
188 if ( dma_pte_addr(*pte) == 0 )
189 {
190 if ( !alloc )
191 break;
192 maddr = alloc_pgtable_maddr(domain, 1);
193 if ( !maddr )
194 break;
195 dma_set_pte_addr(*pte, maddr);
196 vaddr = map_vtd_domain_page(maddr);
198 /*
199 * Higher-level tables always set read/write; the last-level
200 * page table controls the actual read/write permission.
201 */
202 dma_set_pte_readable(*pte);
203 dma_set_pte_writable(*pte);
204 iommu_flush_cache_entry(pte);
205 }
206 else
207 {
208 vaddr = map_vtd_domain_page(pte->val);
209 }
211 if ( level == 2 )
212 {
213 pte_maddr = pte->val & PAGE_MASK_4K;
214 unmap_vtd_domain_page(vaddr);
215 break;
216 }
218 unmap_vtd_domain_page(parent);
219 parent = (struct dma_pte *)vaddr;
220 vaddr = NULL;
221 level--;
222 }
224 unmap_vtd_domain_page(parent);
225 out:
226 return pte_maddr;
227 }
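/* Drain the IOMMU's internal write buffer; only needed when the hardware
 * reports the RWBF capability (or the rwbf quirk is active). */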
229 static void iommu_flush_write_buffer(struct iommu *iommu)
230 {
231 u32 val;
232 unsigned long flag;
233 s_time_t start_time;
235 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
236 return;
237 val = iommu->gcmd | DMA_GCMD_WBF;
239 spin_lock_irqsave(&iommu->register_lock, flag);
240 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
242 /* Make sure the hardware completes it */
243 start_time = NOW();
244 for ( ; ; )
245 {
246 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
247 if ( !(val & DMA_GSTS_WBFS) )
248 break;
249 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
250 panic("%s: DMAR hardware is malfunctional,"
251 " please disable IOMMU\n", __func__);
252 cpu_relax();
253 }
254 spin_unlock_irqrestore(&iommu->register_lock, flag);
255 }
257 /* Return value indicates whether the caller still needs a write buffer flush */
258 static int flush_context_reg(
259 void *_iommu,
260 u16 did, u16 source_id, u8 function_mask, u64 type,
261 int non_present_entry_flush)
262 {
263 struct iommu *iommu = (struct iommu *) _iommu;
264 u64 val = 0;
265 unsigned long flag;
266 s_time_t start_time;
268 /*
269 * In the non-present entry flush case: if the hardware doesn't cache
270 * non-present entries we do nothing; if it does cache them, we flush
271 * the entries of domain 0 (that domain id is used to cache any
272 * non-present entries).
273 */
274 if ( non_present_entry_flush )
275 {
276 if ( !cap_caching_mode(iommu->cap) )
277 return 1;
278 else
279 did = 0;
280 }
282 /* use register invalidation */
283 switch ( type )
284 {
285 case DMA_CCMD_GLOBAL_INVL:
286 val = DMA_CCMD_GLOBAL_INVL;
287 break;
288 case DMA_CCMD_DOMAIN_INVL:
289 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
290 break;
291 case DMA_CCMD_DEVICE_INVL:
292 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
293 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
294 break;
295 default:
296 BUG();
297 }
298 val |= DMA_CCMD_ICC;
300 spin_lock_irqsave(&iommu->register_lock, flag);
301 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
303 /* Make sure the hardware completes it */
304 start_time = NOW();
305 for ( ; ; )
306 {
307 val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
308 if ( !(val & DMA_CCMD_ICC) )
309 break;
310 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
311 panic("%s: DMAR hardware is malfunctional,"
312 " please disable IOMMU\n", __func__);
313 cpu_relax();
314 }
315 spin_unlock_irqrestore(&iommu->register_lock, flag);
316 /* flush context entry will implicitly flush write buffer */
317 return 0;
318 }
320 static int inline iommu_flush_context_global(
321 struct iommu *iommu, int non_present_entry_flush)
322 {
323 struct iommu_flush *flush = iommu_get_flush(iommu);
324 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
325 non_present_entry_flush);
326 }
328 static int inline iommu_flush_context_domain(
329 struct iommu *iommu, u16 did, int non_present_entry_flush)
330 {
331 struct iommu_flush *flush = iommu_get_flush(iommu);
332 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
333 non_present_entry_flush);
334 }
336 static int inline iommu_flush_context_device(
337 struct iommu *iommu, u16 did, u16 source_id,
338 u8 function_mask, int non_present_entry_flush)
339 {
340 struct iommu_flush *flush = iommu_get_flush(iommu);
341 return flush->context(iommu, did, source_id, function_mask,
342 DMA_CCMD_DEVICE_INVL,
343 non_present_entry_flush);
344 }
346 /* Return value indicates whether the caller still needs a write buffer flush */
347 static int flush_iotlb_reg(void *_iommu, u16 did,
348 u64 addr, unsigned int size_order, u64 type,
349 int non_present_entry_flush)
350 {
351 struct iommu *iommu = (struct iommu *) _iommu;
352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
353 u64 val = 0, val_iva = 0;
354 unsigned long flag;
355 s_time_t start_time;
357 /*
358 * In the non-present entry flush case: if the hardware doesn't cache
359 * non-present entries we do nothing; if it does cache them, we flush
360 * the entries of domain 0 (that domain id is used to cache any
361 * non-present entries).
362 */
363 if ( non_present_entry_flush )
364 {
365 if ( !cap_caching_mode(iommu->cap) )
366 return 1;
367 else
368 did = 0;
369 }
371 /* use register invalidation */
372 switch ( type )
373 {
374 case DMA_TLB_GLOBAL_FLUSH:
375 /* global flush doesn't need to set IVA_REG */
376 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
377 break;
378 case DMA_TLB_DSI_FLUSH:
379 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
380 break;
381 case DMA_TLB_PSI_FLUSH:
382 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
383 /* Note: always flush non-leaf currently */
384 val_iva = size_order | addr;
385 break;
386 default:
387 BUG();
388 }
389 /* Note: set drain read/write */
390 if ( cap_read_drain(iommu->cap) )
391 val |= DMA_TLB_READ_DRAIN;
392 if ( cap_write_drain(iommu->cap) )
393 val |= DMA_TLB_WRITE_DRAIN;
395 spin_lock_irqsave(&iommu->register_lock, flag);
396 /* Note: Only uses first TLB reg currently */
397 if ( val_iva )
398 dmar_writeq(iommu->reg, tlb_offset, val_iva);
399 dmar_writeq(iommu->reg, tlb_offset + 8, val);
401 /* Make sure the hardware completes it */
402 start_time = NOW();
403 for ( ; ; )
404 {
405 val = dmar_readq(iommu->reg, tlb_offset + 8);
406 if ( !(val & DMA_TLB_IVT) )
407 break;
408 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
409 panic("%s: DMAR hardware is malfunctional,"
410 " please disable IOMMU\n", __func__);
411 cpu_relax();
412 }
413 spin_unlock_irqrestore(&iommu->register_lock, flag);
415 /* check IOTLB invalidation granularity */
416 if ( DMA_TLB_IAIG(val) == 0 )
417 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
419 /* flush iotlb entry will implicitly flush write buffer */
420 return 0;
421 }
423 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
424 int non_present_entry_flush)
425 {
426 struct iommu_flush *flush = iommu_get_flush(iommu);
427 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
428 non_present_entry_flush);
429 }
431 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
432 int non_present_entry_flush)
433 {
434 struct iommu_flush *flush = iommu_get_flush(iommu);
435 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
436 non_present_entry_flush);
437 }
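/* Return the number of low-order address bits in which 'base' and
 * 'base + size - 1' differ, i.e. the order of the smallest naturally
 * aligned region containing the range. */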
439 static int inline get_alignment(u64 base, unsigned int size)
440 {
441 int t = 0;
442 u64 end;
444 end = base + size - 1;
445 while ( base != end )
446 {
447 t++;
448 base >>= 1;
449 end >>= 1;
450 }
451 return t;
452 }
454 static int inline iommu_flush_iotlb_psi(
455 struct iommu *iommu, u16 did,
456 u64 addr, unsigned int pages, int non_present_entry_flush)
457 {
458 unsigned int align;
459 struct iommu_flush *flush = iommu_get_flush(iommu);
461 ASSERT(!(addr & (~PAGE_MASK_4K)));
462 ASSERT(pages > 0);
464 /* Fallback to domain selective flush if no PSI support */
465 if ( !cap_pgsel_inv(iommu->cap) )
466 return iommu_flush_iotlb_dsi(iommu, did,
467 non_present_entry_flush);
469 /*
470 * PSI requires the region size to be a power-of-two number of pages,
471 * with the base address naturally aligned to that size.
472 */
473 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
474 /* Fallback to domain selective flush if size is too big */
475 if ( align > cap_max_amask_val(iommu->cap) )
476 return iommu_flush_iotlb_dsi(iommu, did,
477 non_present_entry_flush);
479 addr >>= PAGE_SHIFT_4K + align;
480 addr <<= PAGE_SHIFT_4K + align;
482 return flush->iotlb(iommu, did, addr, align,
483 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
484 }
486 void iommu_flush_all(void)
487 {
488 struct acpi_drhd_unit *drhd;
489 struct iommu *iommu;
491 flush_all_cache();
492 for_each_drhd_unit ( drhd )
493 {
494 iommu = drhd->iommu;
495 iommu_flush_context_global(iommu, 0);
496 iommu_flush_iotlb_global(iommu, 0);
497 }
498 }
500 /* Clear the leaf PTE that maps one page */
501 static void dma_pte_clear_one(struct domain *domain, u64 addr)
502 {
503 struct hvm_iommu *hd = domain_hvm_iommu(domain);
504 struct acpi_drhd_unit *drhd;
505 struct iommu *iommu;
506 struct dma_pte *page = NULL, *pte = NULL;
507 u64 pg_maddr;
509 spin_lock(&hd->mapping_lock);
510 /* get last level pte */
511 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
512 if ( pg_maddr == 0 )
513 {
514 spin_unlock(&hd->mapping_lock);
515 return;
516 }
518 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
519 pte = page + address_level_offset(addr, 1);
521 if ( !dma_pte_present(*pte) )
522 {
523 spin_unlock(&hd->mapping_lock);
524 unmap_vtd_domain_page(page);
525 return;
526 }
528 dma_clear_pte(*pte);
529 spin_unlock(&hd->mapping_lock);
530 iommu_flush_cache_entry(pte);
532 /* No need for pcidevs_lock here; flushing is done on device assign/deassign */
533 for_each_drhd_unit ( drhd )
534 {
535 iommu = drhd->iommu;
536 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
537 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
538 addr, 1, 0))
539 iommu_flush_write_buffer(iommu);
540 }
542 unmap_vtd_domain_page(page);
543 }
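/* Recursively free the VT-d page table rooted at 'pt_maddr', which is
 * 'level' levels deep. */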
545 static void iommu_free_pagetable(u64 pt_maddr, int level)
546 {
547 int i;
548 struct dma_pte *pt_vaddr, *pte;
549 int next_level = level - 1;
551 if ( pt_maddr == 0 )
552 return;
554 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
556 for ( i = 0; i < PTE_NUM; i++ )
557 {
558 pte = &pt_vaddr[i];
559 if ( !dma_pte_present(*pte) )
560 continue;
562 if ( next_level >= 1 )
563 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
565 dma_clear_pte(*pte);
566 iommu_flush_cache_entry(pte);
567 }
569 unmap_vtd_domain_page(pt_vaddr);
570 free_pgtable_maddr(pt_maddr);
571 }
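/* Allocate the root entry table if necessary, program it into the Root
 * Table Address register and latch it with the SRTP command. */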
573 static int iommu_set_root_entry(struct iommu *iommu)
574 {
575 u32 cmd, sts;
576 unsigned long flags;
577 s_time_t start_time;
579 spin_lock(&iommu->lock);
581 if ( iommu->root_maddr == 0 )
582 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
583 if ( iommu->root_maddr == 0 )
584 {
585 spin_unlock(&iommu->lock);
586 return -ENOMEM;
587 }
589 spin_unlock(&iommu->lock);
590 spin_lock_irqsave(&iommu->register_lock, flags);
591 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
592 cmd = iommu->gcmd | DMA_GCMD_SRTP;
593 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
595 /* Make sure the hardware completes it */
596 start_time = NOW();
597 for ( ; ; )
598 {
599 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
600 if ( sts & DMA_GSTS_RTPS )
601 break;
602 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
603 panic("%s: DMAR hardware is malfunctional,"
604 " please disable IOMMU\n", __func__);
605 cpu_relax();
606 }
608 spin_unlock_irqrestore(&iommu->register_lock, flags);
610 return 0;
611 }
613 static void iommu_enable_translation(struct iommu *iommu)
614 {
615 u32 sts;
616 unsigned long flags;
617 s_time_t start_time;
619 dprintk(XENLOG_INFO VTDPREFIX,
620 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
621 spin_lock_irqsave(&iommu->register_lock, flags);
622 iommu->gcmd |= DMA_GCMD_TE;
623 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
624 /* Make sure the hardware completes it */
625 start_time = NOW();
626 for ( ; ; )
627 {
628 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
629 if ( sts & DMA_GSTS_TES )
630 break;
631 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
632 panic("%s: DMAR hardware is malfunctional,"
633 " please disable IOMMU\n", __func__);
634 cpu_relax();
635 }
637 /* Disable PMRs when VT-d engine takes effect per spec definition */
638 disable_pmr(iommu);
639 spin_unlock_irqrestore(&iommu->register_lock, flags);
640 }
642 int iommu_disable_translation(struct iommu *iommu)
643 {
644 u32 sts;
645 unsigned long flags;
646 s_time_t start_time;
648 spin_lock_irqsave(&iommu->register_lock, flags);
649 iommu->gcmd &= ~ DMA_GCMD_TE;
650 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
652 /* Make sure the hardware completes it */
653 start_time = NOW();
654 for ( ; ; )
655 {
656 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
657 if ( !(sts & DMA_GSTS_TES) )
658 break;
659 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
660 panic("%s: DMAR hardware is malfunctional,"
661 " please disable IOMMU\n", __func__);
662 cpu_relax();
663 }
664 spin_unlock_irqrestore(&iommu->register_lock, flags);
665 return 0;
666 }
668 static struct iommu *vector_to_iommu[NR_VECTORS];
669 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
670 u8 fault_reason, u16 source_id, u64 addr)
671 {
672 dprintk(XENLOG_WARNING VTDPREFIX,
673 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
674 "iommu->reg = %p\n",
675 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
676 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
677 fault_reason, iommu->reg);
679 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
680 if ( fault_reason < 0x20 )
681 print_vtd_entries(iommu, (source_id >> 8),
682 (source_id & 0xff), (addr >> PAGE_SHIFT));
683 #endif
685 return 0;
686 }
688 static void iommu_fault_status(u32 fault_status)
689 {
690 if ( fault_status & DMA_FSTS_PFO )
691 dprintk(XENLOG_ERR VTDPREFIX,
692 "iommu_fault_status: Fault Overflow\n");
693 if ( fault_status & DMA_FSTS_PPF )
694 dprintk(XENLOG_ERR VTDPREFIX,
695 "iommu_fault_status: Primary Pending Fault\n");
696 if ( fault_status & DMA_FSTS_AFO )
697 dprintk(XENLOG_ERR VTDPREFIX,
698 "iommu_fault_status: Advanced Fault Overflow\n");
699 if ( fault_status & DMA_FSTS_APF )
700 dprintk(XENLOG_ERR VTDPREFIX,
701 "iommu_fault_status: Advanced Pending Fault\n");
702 if ( fault_status & DMA_FSTS_IQE )
703 dprintk(XENLOG_ERR VTDPREFIX,
704 "iommu_fault_status: Invalidation Queue Error\n");
705 if ( fault_status & DMA_FSTS_ICE )
706 dprintk(XENLOG_ERR VTDPREFIX,
707 "iommu_fault_status: Invalidation Completion Error\n");
708 if ( fault_status & DMA_FSTS_ITE )
709 dprintk(XENLOG_ERR VTDPREFIX,
710 "iommu_fault_status: Invalidation Time-out Error\n");
711 }
713 #define PRIMARY_FAULT_REG_LEN (16)
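/* Fault-event interrupt handler: walk the primary fault recording
 * registers, log each pending fault, and clear it. */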
714 static void iommu_page_fault(int vector, void *dev_id,
715 struct cpu_user_regs *regs)
716 {
717 struct iommu *iommu = dev_id;
718 int reg, fault_index;
719 u32 fault_status;
720 unsigned long flags;
722 dprintk(XENLOG_WARNING VTDPREFIX,
723 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
725 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
727 iommu_fault_status(fault_status);
729 /* FIXME: ignore advanced fault log */
730 if ( !(fault_status & DMA_FSTS_PPF) )
731 goto clear_overflow;
733 fault_index = dma_fsts_fault_record_index(fault_status);
734 reg = cap_fault_reg_offset(iommu->cap);
735 while (1)
736 {
737 u8 fault_reason;
738 u16 source_id;
739 u32 data;
740 u64 guest_addr;
741 int type;
743 /* highest 32 bits */
744 spin_lock_irqsave(&iommu->register_lock, flags);
745 data = dmar_readl(iommu->reg, reg +
746 fault_index * PRIMARY_FAULT_REG_LEN + 12);
747 if ( !(data & DMA_FRCD_F) )
748 {
749 spin_unlock_irqrestore(&iommu->register_lock, flags);
750 break;
751 }
753 fault_reason = dma_frcd_fault_reason(data);
754 type = dma_frcd_type(data);
756 data = dmar_readl(iommu->reg, reg +
757 fault_index * PRIMARY_FAULT_REG_LEN + 8);
758 source_id = dma_frcd_source_id(data);
760 guest_addr = dmar_readq(iommu->reg, reg +
761 fault_index * PRIMARY_FAULT_REG_LEN);
762 guest_addr = dma_frcd_page_addr(guest_addr);
763 /* clear the fault */
764 dmar_writel(iommu->reg, reg +
765 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
766 spin_unlock_irqrestore(&iommu->register_lock, flags);
768 iommu_page_fault_do_one(iommu, type, fault_reason,
769 source_id, guest_addr);
771 fault_index++;
772 if ( fault_index > cap_num_fault_regs(iommu->cap) )
773 fault_index = 0;
774 }
775 clear_overflow:
776 /* clear primary fault overflow */
777 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
778 if ( fault_status & DMA_FSTS_PFO )
779 {
780 spin_lock_irqsave(&iommu->register_lock, flags);
781 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
782 spin_unlock_irqrestore(&iommu->register_lock, flags);
783 }
784 }
786 static void dma_msi_unmask(unsigned int vector)
787 {
788 struct iommu *iommu = vector_to_iommu[vector];
789 unsigned long flags;
791 /* unmask it */
792 spin_lock_irqsave(&iommu->register_lock, flags);
793 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
794 spin_unlock_irqrestore(&iommu->register_lock, flags);
795 }
797 static void dma_msi_mask(unsigned int vector)
798 {
799 unsigned long flags;
800 struct iommu *iommu = vector_to_iommu[vector];
802 /* mask it */
803 spin_lock_irqsave(&iommu->register_lock, flags);
804 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
805 spin_unlock_irqrestore(&iommu->register_lock, flags);
806 }
808 static unsigned int dma_msi_startup(unsigned int vector)
809 {
810 dma_msi_unmask(vector);
811 return 0;
812 }
814 static void dma_msi_end(unsigned int vector)
815 {
816 dma_msi_unmask(vector);
817 ack_APIC_irq();
818 }
820 static void dma_msi_data_init(struct iommu *iommu, int vector)
821 {
822 u32 msi_data = 0;
823 unsigned long flags;
825 /* Fixed, edge, assert mode. Follow MSI setting */
826 msi_data |= vector & 0xff;
827 msi_data |= 1 << 14;
829 spin_lock_irqsave(&iommu->register_lock, flags);
830 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
831 spin_unlock_irqrestore(&iommu->register_lock, flags);
832 }
834 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
835 {
836 u64 msi_address;
837 unsigned long flags;
839 /* Physical, dedicated cpu. Follow MSI setting */
840 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
841 msi_address |= MSI_PHYSICAL_MODE << 2;
842 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
843 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
845 spin_lock_irqsave(&iommu->register_lock, flags);
846 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
847 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
848 spin_unlock_irqrestore(&iommu->register_lock, flags);
849 }
851 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
852 {
853 struct iommu *iommu = vector_to_iommu[vector];
854 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
855 }
857 static struct hw_interrupt_type dma_msi_type = {
858 .typename = "DMA_MSI",
859 .startup = dma_msi_startup,
860 .shutdown = dma_msi_mask,
861 .enable = dma_msi_unmask,
862 .disable = dma_msi_mask,
863 .ack = dma_msi_mask,
864 .end = dma_msi_end,
865 .set_affinity = dma_msi_set_affinity,
866 };
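/* Allocate a vector for the IOMMU's fault-event MSI and install the
 * handler; returns the vector on success or a negative error code. */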
868 static int iommu_set_interrupt(struct iommu *iommu)
869 {
870 int vector, ret;
872 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
873 if ( vector <= 0 )
874 {
875 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
876 return -EINVAL;
877 }
879 irq_desc[vector].handler = &dma_msi_type;
880 vector_to_iommu[vector] = iommu;
881 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
882 if ( ret )
883 {
884 irq_desc[vector].handler = &no_irq_type;
885 vector_to_iommu[vector] = NULL;
886 free_irq_vector(vector);
887 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
888 return ret;
889 }
891 /* Make sure that vector is never re-used. */
892 vector_irq[vector] = NEVER_ASSIGN_IRQ;
894 return vector;
895 }
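/* Allocate and initialise a struct iommu for this DRHD unit: map its
 * register block, read CAP/ECAP and derive the number of page-table
 * levels. */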
897 static int iommu_alloc(struct acpi_drhd_unit *drhd)
898 {
899 struct iommu *iommu;
900 unsigned long sagaw;
901 int agaw;
903 if ( nr_iommus > MAX_IOMMUS )
904 {
905 gdprintk(XENLOG_ERR VTDPREFIX,
906 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
907 return -ENOMEM;
908 }
910 iommu = xmalloc(struct iommu);
911 if ( iommu == NULL )
912 return -ENOMEM;
913 memset(iommu, 0, sizeof(struct iommu));
915 iommu->intel = alloc_intel_iommu();
916 if ( iommu->intel == NULL )
917 {
918 xfree(iommu);
919 return -ENOMEM;
920 }
922 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
923 iommu->index = nr_iommus++;
925 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
926 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
928 /* Calculate number of pagetable levels: between 2 and 4. */
929 sagaw = cap_sagaw(iommu->cap);
930 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
931 if ( test_bit(agaw, &sagaw) )
932 break;
933 if ( agaw < 0 )
934 {
935 gdprintk(XENLOG_ERR VTDPREFIX,
936 "IOMMU: unsupported sagaw %lx\n", sagaw);
937 xfree(iommu);
938 return -ENODEV;
939 }
940 iommu->nr_pt_levels = agaw_to_level(agaw);
942 if ( !ecap_coherent(iommu->ecap) )
943 iommus_incoherent = 1;
945 spin_lock_init(&iommu->lock);
946 spin_lock_init(&iommu->register_lock);
948 drhd->iommu = iommu;
949 return 0;
950 }
952 static void iommu_free(struct acpi_drhd_unit *drhd)
953 {
954 struct iommu *iommu = drhd->iommu;
956 if ( iommu == NULL )
957 return;
959 if ( iommu->root_maddr != 0 )
960 {
961 free_pgtable_maddr(iommu->root_maddr);
962 iommu->root_maddr = 0;
963 }
965 if ( iommu->reg )
966 iounmap(iommu->reg);
968 free_intel_iommu(iommu->intel);
969 release_irq_vector(iommu->vector);
970 xfree(iommu);
972 drhd->iommu = NULL;
973 }
975 #define guestwidth_to_adjustwidth(gaw) ({ \
976 int agaw, r = (gaw - 12) % 9; \
977 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
978 if ( agaw > 64 ) \
979 agaw = 64; \
980 agaw; })
982 static int intel_iommu_domain_init(struct domain *d)
983 {
984 struct hvm_iommu *hd = domain_hvm_iommu(d);
985 struct iommu *iommu = NULL;
986 struct acpi_drhd_unit *drhd;
988 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
989 iommu = drhd->iommu;
991 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
993 if ( d->domain_id == 0 )
994 {
995 /* Set up 1:1 page table for dom0 */
996 iommu_set_dom0_mapping(d);
998 setup_dom0_devices(d);
999 setup_dom0_rmrr(d);
1001 iommu_flush_all();
1003 for_each_drhd_unit ( drhd )
1005 iommu = drhd->iommu;
1006 iommu_enable_translation(iommu);
1010 return 0;
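/* Install the context entry for (bus, devfn) on this IOMMU, pointing it at
 * the domain's page tables (or pass-through for dom0), then flush the
 * context cache and IOTLB. */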
1013 static int domain_context_mapping_one(
1014 struct domain *domain,
1015 struct iommu *iommu,
1016 u8 bus, u8 devfn)
1018 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1019 struct context_entry *context, *context_entries;
1020 u64 maddr, pgd_maddr;
1021 struct pci_dev *pdev = NULL;
1022 int agaw;
1024 ASSERT(spin_is_locked(&pcidevs_lock));
1025 spin_lock(&iommu->lock);
1026 maddr = bus_to_context_maddr(iommu, bus);
1027 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1028 context = &context_entries[devfn];
1030 if ( context_present(*context) )
1032 int res = 0;
1034 pdev = pci_get_pdev(bus, devfn);
1035 if (!pdev)
1036 res = -ENODEV;
1037 else if (pdev->domain != domain)
1038 res = -EINVAL;
1039 unmap_vtd_domain_page(context_entries);
1040 spin_unlock(&iommu->lock);
1041 return res;
1044 if ( iommu_passthrough && (domain->domain_id == 0) )
1046 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1047 agaw = level_to_agaw(iommu->nr_pt_levels);
1049 else
1051 spin_lock(&hd->mapping_lock);
1053 /* Ensure we have pagetables allocated down to leaf PTE. */
1054 if ( hd->pgd_maddr == 0 )
1056 addr_to_dma_page_maddr(domain, 0, 1);
1057 if ( hd->pgd_maddr == 0 )
1059 nomem:
1060 spin_unlock(&hd->mapping_lock);
1061 spin_unlock(&iommu->lock);
1062 unmap_vtd_domain_page(context_entries);
1063 return -ENOMEM;
1067 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1068 pgd_maddr = hd->pgd_maddr;
1069 for ( agaw = level_to_agaw(4);
1070 agaw != level_to_agaw(iommu->nr_pt_levels);
1071 agaw-- )
1073 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1074 pgd_maddr = dma_pte_addr(*p);
1075 unmap_vtd_domain_page(p);
1076 if ( pgd_maddr == 0 )
1077 goto nomem;
1080 context_set_address_root(*context, pgd_maddr);
1081 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1082 spin_unlock(&hd->mapping_lock);
1085 /*
1086 * domain_id 0 is not valid on Intel's IOMMU: domain ids are forced to
1087 * be 1-based, as required by the Intel IOMMU hardware.
1088 */
1089 context_set_domain_id(context, domain);
1090 context_set_address_width(*context, agaw);
1091 context_set_fault_enable(*context);
1092 context_set_present(*context);
1093 iommu_flush_cache_entry(context);
1094 spin_unlock(&iommu->lock);
1096 /* Context entry was previously non-present (with domid 0). */
1097 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1098 DMA_CCMD_MASK_NOBIT, 1) )
1099 iommu_flush_write_buffer(iommu);
1100 else
1101 iommu_flush_iotlb_dsi(iommu, 0, 1);
1103 set_bit(iommu->index, &hd->iommu_bitmap);
1105 unmap_vtd_domain_page(context_entries);
1107 return 0;
1110 #define PCI_BASE_CLASS_BRIDGE 0x06
1111 #define PCI_CLASS_BRIDGE_PCI 0x0604
1113 enum {
1114 DEV_TYPE_PCIe_ENDPOINT,
1115 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1116 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1117 DEV_TYPE_PCI,
1118 };
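/* Classify a device as a PCIe endpoint, PCIe bridge, legacy PCI bridge or
 * plain PCI device from its class code and PCI Express capability. */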
1120 int pdev_type(u8 bus, u8 devfn)
1122 u16 class_device;
1123 u16 status, creg;
1124 int pos;
1125 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1127 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1128 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1130 pos = pci_find_next_cap(bus, devfn,
1131 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1132 if ( !pos )
1133 return DEV_TYPE_PCI_BRIDGE;
1134 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1135 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1136 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1139 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1140 if ( !(status & PCI_STATUS_CAP_LIST) )
1141 return DEV_TYPE_PCI;
1143 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1144 return DEV_TYPE_PCIe_ENDPOINT;
1146 return DEV_TYPE_PCI;
1149 #define MAX_BUSES 256
1150 static DEFINE_SPINLOCK(bus2bridge_lock);
1151 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1153 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1155 int cnt = 0;
1156 *secbus = *bus;
1158 ASSERT(spin_is_locked(&bus2bridge_lock));
1159 if ( !bus2bridge[*bus].map )
1160 return 0;
1162 while ( bus2bridge[*bus].map )
1164 *secbus = *bus;
1165 *devfn = bus2bridge[*bus].devfn;
1166 *bus = bus2bridge[*bus].bus;
1167 if ( cnt++ >= MAX_BUSES )
1168 return 0;
1171 return 1;
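/* For a device behind a PCI/PCIx bridge, rewrite *bus/*devfn to the bridge
 * recorded in bus2bridge[] that emits requests on its behalf, recording the
 * secondary bus in *secbus; devices on bus 0 are assumed to already have a
 * valid requester id. Returns 0 if no usable id is found. */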
1174 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1176 int ret = 0;
1178 if ( *bus == 0 )
1179 /* assume integrated PCI devices in RC have valid requester-id */
1180 return 1;
1182 spin_lock(&bus2bridge_lock);
1183 ret = _find_pcie_endpoint(bus, devfn, secbus);
1184 spin_unlock(&bus2bridge_lock);
1186 return ret;
1189 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1191 struct acpi_drhd_unit *drhd;
1192 int ret = 0;
1193 u16 sec_bus, sub_bus;
1194 u32 type;
1195 u8 secbus, secdevfn;
1196 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1198 BUG_ON(!pdev);
1200 drhd = acpi_find_matched_drhd_unit(pdev);
1201 if ( !drhd )
1202 return -ENODEV;
1204 ASSERT(spin_is_locked(&pcidevs_lock));
1206 type = pdev_type(bus, devfn);
1207 switch ( type )
1209 case DEV_TYPE_PCIe_BRIDGE:
1210 break;
1212 case DEV_TYPE_PCI_BRIDGE:
1213 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1214 PCI_SECONDARY_BUS);
1215 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1216 PCI_SUBORDINATE_BUS);
1218 spin_lock(&bus2bridge_lock);
1219 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1221 bus2bridge[sec_bus].map = 1;
1222 bus2bridge[sec_bus].bus = bus;
1223 bus2bridge[sec_bus].devfn = devfn;
1225 spin_unlock(&bus2bridge_lock);
1226 break;
1228 case DEV_TYPE_PCIe_ENDPOINT:
1229 gdprintk(XENLOG_INFO VTDPREFIX,
1230 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1231 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1232 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1233 break;
1235 case DEV_TYPE_PCI:
1236 gdprintk(XENLOG_INFO VTDPREFIX,
1237 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1238 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1240 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1241 if ( ret )
1242 break;
1244 secbus = bus;
1245 secdevfn = devfn;
1246 /* dependent devices mapping */
1247 while ( bus2bridge[bus].map )
1249 secbus = bus;
1250 secdevfn = devfn;
1251 devfn = bus2bridge[bus].devfn;
1252 bus = bus2bridge[bus].bus;
1253 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1254 if ( ret )
1255 return ret;
1258 if ( (secbus != bus) && (secdevfn != 0) )
1259 /*
1260 * The source-id for transactions on non-PCIe buses seems
1261 * to originate from devfn=0 on the secondary bus behind
1262 * the bridge. Map that id as well. The id to use in
1263 * these scenarios is not particularly well documented
1264 * anywhere.
1265 */
1266 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1267 break;
1269 default:
1270 gdprintk(XENLOG_ERR VTDPREFIX,
1271 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1272 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1273 ret = -EINVAL;
1274 break;
1277 return ret;
1280 static int domain_context_unmap_one(
1281 struct domain *domain,
1282 struct iommu *iommu,
1283 u8 bus, u8 devfn)
1285 struct context_entry *context, *context_entries;
1286 u64 maddr;
1288 ASSERT(spin_is_locked(&pcidevs_lock));
1289 spin_lock(&iommu->lock);
1291 maddr = bus_to_context_maddr(iommu, bus);
1292 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1293 context = &context_entries[devfn];
1295 if ( !context_present(*context) )
1297 spin_unlock(&iommu->lock);
1298 unmap_vtd_domain_page(context_entries);
1299 return 0;
1302 context_clear_present(*context);
1303 context_clear_entry(*context);
1304 iommu_flush_cache_entry(context);
1306 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1307 (((u16)bus) << 8) | devfn,
1308 DMA_CCMD_MASK_NOBIT, 0) )
1309 iommu_flush_write_buffer(iommu);
1310 else
1311 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
1313 spin_unlock(&iommu->lock);
1314 unmap_vtd_domain_page(context_entries);
1316 return 0;
1319 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1321 struct acpi_drhd_unit *drhd;
1322 int ret = 0;
1323 u32 type;
1324 u8 secbus, secdevfn;
1325 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1327 BUG_ON(!pdev);
1329 drhd = acpi_find_matched_drhd_unit(pdev);
1330 if ( !drhd )
1331 return -ENODEV;
1333 type = pdev_type(bus, devfn);
1334 switch ( type )
1336 case DEV_TYPE_PCIe_BRIDGE:
1337 case DEV_TYPE_PCI_BRIDGE:
1338 break;
1340 case DEV_TYPE_PCIe_ENDPOINT:
1341 gdprintk(XENLOG_INFO VTDPREFIX,
1342 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1343 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1344 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1345 break;
1347 case DEV_TYPE_PCI:
1348 gdprintk(XENLOG_INFO VTDPREFIX,
1349 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1350 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1351 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1352 if ( ret )
1353 break;
1355 secbus = bus;
1356 secdevfn = devfn;
1357 /* dependent devices unmapping */
1358 while ( bus2bridge[bus].map )
1360 secbus = bus;
1361 secdevfn = devfn;
1362 devfn = bus2bridge[bus].devfn;
1363 bus = bus2bridge[bus].bus;
1364 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1365 if ( ret )
1366 return ret;
1369 if ( (secbus != bus) && (secdevfn != 0) )
1370 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1371 break;
1373 default:
1374 gdprintk(XENLOG_ERR VTDPREFIX,
1375 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1376 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1377 ret = -EINVAL;
1378 break;
1381 return ret;
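/* Move (bus, devfn) from the source domain to the target domain: unmap its
 * context entry under the source, remap it under the target, and clear the
 * source's IOMMU bitmap bit if that IOMMU no longer serves any of its
 * devices. */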
1384 static int reassign_device_ownership(
1385 struct domain *source,
1386 struct domain *target,
1387 u8 bus, u8 devfn)
1389 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1390 struct pci_dev *pdev;
1391 struct acpi_drhd_unit *drhd;
1392 struct iommu *pdev_iommu;
1393 int ret, found = 0;
1395 ASSERT(spin_is_locked(&pcidevs_lock));
1396 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1398 if (!pdev)
1399 return -ENODEV;
1401 drhd = acpi_find_matched_drhd_unit(pdev);
1402 pdev_iommu = drhd->iommu;
1403 domain_context_unmap(source, bus, devfn);
1405 ret = domain_context_mapping(target, bus, devfn);
1406 if ( ret )
1407 return ret;
1409 list_move(&pdev->domain_list, &target->arch.pdev_list);
1410 pdev->domain = target;
1412 for_each_pdev ( source, pdev )
1414 drhd = acpi_find_matched_drhd_unit(pdev);
1415 if ( drhd->iommu == pdev_iommu )
1417 found = 1;
1418 break;
1422 if ( !found )
1423 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1425 return ret;
1428 void iommu_domain_teardown(struct domain *d)
1430 struct hvm_iommu *hd = domain_hvm_iommu(d);
1432 if ( list_empty(&acpi_drhd_units) )
1433 return;
1435 spin_lock(&hd->mapping_lock);
1436 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1437 hd->pgd_maddr = 0;
1438 spin_unlock(&hd->mapping_lock);
1440 iommu_domid_release(d);
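/* Map gfn -> mfn read/write in the domain's VT-d page table (a no-op for
 * dom0 when passthrough is enabled) and flush the IOTLB of every IOMMU
 * that serves this domain. */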
1443 int intel_iommu_map_page(
1444 struct domain *d, unsigned long gfn, unsigned long mfn)
1446 struct hvm_iommu *hd = domain_hvm_iommu(d);
1447 struct acpi_drhd_unit *drhd;
1448 struct iommu *iommu;
1449 struct dma_pte *page = NULL, *pte = NULL;
1450 u64 pg_maddr;
1451 int pte_present;
1453 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1454 iommu = drhd->iommu;
1456 /* do nothing if dom0 and iommu supports pass thru */
1457 if ( iommu_passthrough && (d->domain_id == 0) )
1458 return 0;
1460 spin_lock(&hd->mapping_lock);
1462 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1463 if ( pg_maddr == 0 )
1465 spin_unlock(&hd->mapping_lock);
1466 return -ENOMEM;
1468 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1469 pte = page + (gfn & LEVEL_MASK);
1470 pte_present = dma_pte_present(*pte);
1471 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1472 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1474 /* Set the SNP on leaf page table if Snoop Control available */
1475 if ( iommu_snoop )
1476 dma_set_pte_snp(*pte);
1478 iommu_flush_cache_entry(pte);
1479 spin_unlock(&hd->mapping_lock);
1480 unmap_vtd_domain_page(page);
1482 /*
1483 * No need for pcidevs_lock here because we flush
1484 * when a device is assigned or deassigned.
1485 */
1486 for_each_drhd_unit ( drhd )
1488 iommu = drhd->iommu;
1490 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1491 continue;
1493 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1494 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1495 !pte_present) )
1496 iommu_flush_write_buffer(iommu);
1499 return 0;
1502 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1504 struct acpi_drhd_unit *drhd;
1505 struct iommu *iommu;
1507 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1508 iommu = drhd->iommu;
1510 /* do nothing if dom0 and iommu supports pass thru */
1511 if ( iommu_passthrough && (d->domain_id == 0) )
1512 return 0;
1514 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1516 return 0;
1519 static int iommu_prepare_rmrr_dev(struct domain *d,
1520 struct acpi_rmrr_unit *rmrr,
1521 u8 bus, u8 devfn)
1523 int ret = 0;
1524 u64 base, end;
1525 unsigned long base_pfn, end_pfn;
1527 ASSERT(spin_is_locked(&pcidevs_lock));
1528 ASSERT(rmrr->base_address < rmrr->end_address);
1530 base = rmrr->base_address & PAGE_MASK_4K;
1531 base_pfn = base >> PAGE_SHIFT_4K;
1532 end = PAGE_ALIGN_4K(rmrr->end_address);
1533 end_pfn = end >> PAGE_SHIFT_4K;
1535 while ( base_pfn < end_pfn )
1537 intel_iommu_map_page(d, base_pfn, base_pfn);
1538 base_pfn++;
1541 ret = domain_context_mapping(d, bus, devfn);
1543 return ret;
1546 static int intel_iommu_add_device(struct pci_dev *pdev)
1548 struct acpi_rmrr_unit *rmrr;
1549 u16 bdf;
1550 int ret, i;
1552 ASSERT(spin_is_locked(&pcidevs_lock));
1554 if ( !pdev->domain )
1555 return -EINVAL;
1557 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1558 if ( ret )
1560 gdprintk(XENLOG_ERR VTDPREFIX,
1561 "intel_iommu_add_device: context mapping failed\n");
1562 return ret;
1565 for_each_rmrr_device ( rmrr, bdf, i )
1567 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1569 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1570 pdev->bus, pdev->devfn);
1571 if ( ret )
1572 gdprintk(XENLOG_ERR VTDPREFIX,
1573 "intel_iommu_add_device: RMRR mapping failed\n");
1574 break;
1578 return ret;
1581 static int intel_iommu_remove_device(struct pci_dev *pdev)
1583 struct acpi_rmrr_unit *rmrr;
1584 u16 bdf;
1585 int i;
1587 if ( !pdev->domain )
1588 return -EINVAL;
1590 /* If the device belongs to dom0 and it has an RMRR, don't remove it
1591 * from dom0, because the BIOS may use the RMRR at boot time.
1592 */
1593 if ( pdev->domain->domain_id == 0 )
1595 for_each_rmrr_device ( rmrr, bdf, i )
1597 if ( PCI_BUS(bdf) == pdev->bus &&
1598 PCI_DEVFN2(bdf) == pdev->devfn )
1599 return 0;
1603 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1606 static void setup_dom0_devices(struct domain *d)
1608 struct hvm_iommu *hd;
1609 struct pci_dev *pdev;
1610 int bus, dev, func;
1611 u32 l;
1613 hd = domain_hvm_iommu(d);
1615 spin_lock(&pcidevs_lock);
1616 for ( bus = 0; bus < 256; bus++ )
1618 for ( dev = 0; dev < 32; dev++ )
1620 for ( func = 0; func < 8; func++ )
1622 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1623 /* some broken boards return 0 or ~0 if a slot is empty: */
1624 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1625 (l == 0x0000ffff) || (l == 0xffff0000) )
1626 continue;
1628 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1629 pdev->domain = d;
1630 list_add(&pdev->domain_list, &d->arch.pdev_list);
1631 domain_context_mapping(d, pdev->bus, pdev->devfn);
1635 spin_unlock(&pcidevs_lock);
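/* Clear the first primary fault recording register and the fault status
 * bits so the IOMMU starts with a clean fault state. */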
1638 void clear_fault_bits(struct iommu *iommu)
1640 u64 val;
1642 val = dmar_readq(
1643 iommu->reg,
1644 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1645 dmar_writeq(
1646 iommu->reg,
1647 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1648 val);
1649 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
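/* Program the root table and fault-event MSI on every IOMMU, install the
 * register-based flush callbacks and, when enabled, set up queued
 * invalidation and interrupt remapping. */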
1652 static int init_vtd_hw(void)
1654 struct acpi_drhd_unit *drhd;
1655 struct iommu *iommu;
1656 struct iommu_flush *flush = NULL;
1657 int vector;
1658 int ret;
1660 for_each_drhd_unit ( drhd )
1662 iommu = drhd->iommu;
1663 ret = iommu_set_root_entry(iommu);
1664 if ( ret )
1666 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1667 return -EIO;
1670 vector = iommu_set_interrupt(iommu);
1671 if ( vector < 0 )
1673 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1674 return vector;
1676 dma_msi_data_init(iommu, vector);
1677 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1678 iommu->vector = vector;
1679 clear_fault_bits(iommu);
1680 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1682 /* initialize flush functions */
1683 flush = iommu_get_flush(iommu);
1684 flush->context = flush_context_reg;
1685 flush->iotlb = flush_iotlb_reg;
1688 if ( iommu_qinval )
1690 for_each_drhd_unit ( drhd )
1692 iommu = drhd->iommu;
1693 if ( qinval_setup(iommu) != 0 )
1695 dprintk(XENLOG_INFO VTDPREFIX,
1696 "Failed to enable Queued Invalidation!\n");
1697 break;
1702 if ( iommu_intremap )
1704 for_each_drhd_unit ( drhd )
1706 iommu = drhd->iommu;
1707 if ( intremap_setup(iommu) != 0 )
1709 dprintk(XENLOG_INFO VTDPREFIX,
1710 "Failed to enable Interrupt Remapping!\n");
1711 break;
1716 return 0;
1719 static void setup_dom0_rmrr(struct domain *d)
1721 struct acpi_rmrr_unit *rmrr;
1722 u16 bdf;
1723 int ret, i;
1725 spin_lock(&pcidevs_lock);
1726 for_each_rmrr_device ( rmrr, bdf, i )
1728 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1729 if ( ret )
1730 gdprintk(XENLOG_ERR VTDPREFIX,
1731 "IOMMU: mapping reserved region failed\n");
1733 spin_unlock(&pcidevs_lock);
1736 static void platform_quirks(void)
1738 u32 id;
1740 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1741 id = pci_conf_read32(0, 0, 0, 0);
1742 if ( id == 0x2a408086 )
1744 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1745 rwbf_quirk = 1;
1749 int intel_vtd_setup(void)
1751 struct acpi_drhd_unit *drhd;
1752 struct iommu *iommu;
1754 if ( !vtd_enabled )
1755 return -ENODEV;
1757 platform_quirks();
1759 spin_lock_init(&domid_bitmap_lock);
1760 clflush_size = get_cache_line_size();
1762 /* We enable the following features only if they are supported by all VT-d
1763 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1764 * Interrupt Remapping.
1765 */
1766 for_each_drhd_unit ( drhd )
1768 if ( iommu_alloc(drhd) != 0 )
1769 goto error;
1771 iommu = drhd->iommu;
1773 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1774 iommu_snoop = 0;
1776 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1777 iommu_passthrough = 0;
1779 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1780 iommu_qinval = 0;
1782 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1783 iommu_intremap = 0;
1785 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1786 P(iommu_snoop, "Snoop Control");
1787 P(iommu_passthrough, "DMA Passthrough");
1788 P(iommu_qinval, "Queued Invalidation");
1789 P(iommu_intremap, "Interrupt Remapping");
1790 #undef P
1792 /* Allocate IO page directory page for the domain. */
1793 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1794 iommu = drhd->iommu;
1796 /* Allocate domain id bitmap, and set bit 0 as reserved */
1797 domid_bitmap_size = cap_ndoms(iommu->cap);
1798 domid_bitmap = xmalloc_array(unsigned long,
1799 BITS_TO_LONGS(domid_bitmap_size));
1800 if ( domid_bitmap == NULL )
1801 goto error;
1802 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1803 set_bit(0, domid_bitmap);
1805 if ( init_vtd_hw() )
1806 goto error;
1808 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1810 return 0;
1812 error:
1813 for_each_drhd_unit ( drhd )
1814 iommu_free(drhd);
1815 vtd_enabled = 0;
1816 iommu_snoop = 0;
1817 iommu_passthrough = 0;
1818 iommu_qinval = 0;
1819 iommu_intremap = 0;
1820 return -ENOMEM;
1823 /*
1824 * If the device isn't owned by dom0, it has already been
1825 * assigned to another domain, or it does not exist.
1826 */
1827 int device_assigned(u8 bus, u8 devfn)
1829 struct pci_dev *pdev;
1831 spin_lock(&pcidevs_lock);
1832 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1833 if (!pdev)
1835 spin_unlock(&pcidevs_lock);
1836 return -1;
1839 spin_unlock(&pcidevs_lock);
1840 return 0;
1843 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1845 struct acpi_rmrr_unit *rmrr;
1846 int ret = 0, i;
1847 struct pci_dev *pdev;
1848 u16 bdf;
1850 if ( list_empty(&acpi_drhd_units) )
1851 return -ENODEV;
1853 ASSERT(spin_is_locked(&pcidevs_lock));
1854 pdev = pci_get_pdev(bus, devfn);
1855 if (!pdev)
1856 return -ENODEV;
1858 if (pdev->domain != dom0)
1860 gdprintk(XENLOG_ERR VTDPREFIX,
1861 "IOMMU: assign a assigned device\n");
1862 return -EBUSY;
1865 ret = reassign_device_ownership(dom0, d, bus, devfn);
1866 if ( ret )
1867 goto done;
1869 /* Set up RMRR identity mapping */
1870 for_each_rmrr_device( rmrr, bdf, i )
1872 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1874 /* FIXME: Because USB RMRR conflicts with guest bios region,
1875 * ignore USB RMRR temporarily.
1876 */
1877 if ( is_usb_device(bus, devfn) )
1879 ret = 0;
1880 goto done;
1883 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1884 if ( ret )
1885 gdprintk(XENLOG_ERR VTDPREFIX,
1886 "IOMMU: mapping reserved region failed\n");
1887 goto done;
1891 done:
1892 return ret;
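/* Return the BDF whose requester id this device shares (used to group
 * devices for assignment), or -1 if it cannot be determined. */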
1895 static int intel_iommu_group_id(u8 bus, u8 devfn)
1897 u8 secbus;
1898 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1899 return PCI_BDF2(bus, devfn);
1900 else
1901 return -1;
1904 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1905 void iommu_suspend(void)
1907 struct acpi_drhd_unit *drhd;
1908 struct iommu *iommu;
1909 u32 i;
1911 if ( !vtd_enabled )
1912 return;
1914 iommu_flush_all();
1916 for_each_drhd_unit ( drhd )
1918 iommu = drhd->iommu;
1919 i = iommu->index;
1921 iommu_state[i][DMAR_FECTL_REG] =
1922 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1923 iommu_state[i][DMAR_FEDATA_REG] =
1924 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1925 iommu_state[i][DMAR_FEADDR_REG] =
1926 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1927 iommu_state[i][DMAR_FEUADDR_REG] =
1928 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1932 void iommu_resume(void)
1934 struct acpi_drhd_unit *drhd;
1935 struct iommu *iommu;
1936 u32 i;
1938 if ( !vtd_enabled )
1939 return;
1941 iommu_flush_all();
1943 if ( init_vtd_hw() != 0 && force_iommu )
1944 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1946 for_each_drhd_unit ( drhd )
1948 iommu = drhd->iommu;
1949 i = iommu->index;
1951 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1952 (u32) iommu_state[i][DMAR_FECTL_REG]);
1953 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1954 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1955 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1956 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1957 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1958 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1959 iommu_enable_translation(iommu);
1963 struct iommu_ops intel_iommu_ops = {
1964 .init = intel_iommu_domain_init,
1965 .add_device = intel_iommu_add_device,
1966 .remove_device = intel_iommu_remove_device,
1967 .assign_device = intel_iommu_assign_device,
1968 .teardown = iommu_domain_teardown,
1969 .map_page = intel_iommu_map_page,
1970 .unmap_page = intel_iommu_unmap_page,
1971 .reassign_device = reassign_device_ownership,
1972 .get_device_group_id = intel_iommu_group_id,
1973 .update_ire_from_apic = io_apic_write_remap_rte,
1974 .update_ire_from_msi = msi_msg_write_remap_rte,
1975 };
1977 /*
1978 * Local variables:
1979 * mode: C
1980 * c-set-style: "BSD"
1981 * c-basic-offset: 4
1982 * tab-width: 4
1983 * indent-tabs-mode: nil
1984 * End:
1985 */