ia64/xen-unstable

xen/drivers/passthrough/vtd/iommu.c @ 19187:1eb6afcad849

vtd: adding support for multiple queued invalidation pages

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Mon Feb 09 14:23:51 2009 +0000
Parents:  ab514cfbcdc5
Children: 9e3be0660c1e
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include "iommu.h"
34 #include "dmar.h"
35 #include "extern.h"
36 #include "vtd.h"
38 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
40 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
41 static int domid_bitmap_size; /* domain id bitmap size in bits */
42 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static void setup_dom0_devices(struct domain *d);
45 static void setup_dom0_rmrr(struct domain *d);
47 #define DID_FIELD_WIDTH 16
48 #define DID_HIGH_OFFSET 8
49 static void context_set_domain_id(struct context_entry *context,
50 struct domain *d)
51 {
52 domid_t iommu_domid = domain_iommu_domid(d);
54 if ( iommu_domid == 0 )
55 {
56 spin_lock(&domid_bitmap_lock);
57 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
58 set_bit(iommu_domid, domid_bitmap);
59 spin_unlock(&domid_bitmap_lock);
60 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
61 }
63 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
64 context->hi |= iommu_domid << DID_HIGH_OFFSET;
65 }
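
/*
 * Illustrative sketch (not part of the original file): the hypothetical
 * helper below mirrors the bit manipulation done by context_set_domain_id()
 * above, showing how the allocated 16-bit iommu domain id is placed at
 * DID_HIGH_OFFSET in the context entry's high word.
 */
static inline u64 example_pack_context_domid(u64 hi, domid_t iommu_domid)
{
    hi &= (1 << DID_HIGH_OFFSET) - 1;          /* keep bits 0..7, clearing the old domain id */
    hi |= (u64)iommu_domid << DID_HIGH_OFFSET; /* insert the new 16-bit id at bit 8 */
    return hi;
}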
67 static void iommu_domid_release(struct domain *d)
68 {
69 domid_t iommu_domid = domain_iommu_domid(d);
71 if ( iommu_domid != 0 )
72 {
73 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
74 clear_bit(iommu_domid, domid_bitmap);
75 }
76 }
78 static struct intel_iommu *alloc_intel_iommu(void)
79 {
80 struct intel_iommu *intel;
82 intel = xmalloc(struct intel_iommu);
83 if ( intel == NULL )
84 return NULL;
85 memset(intel, 0, sizeof(struct intel_iommu));
87 spin_lock_init(&intel->qi_ctrl.qinval_lock);
88 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
89 spin_lock_init(&intel->ir_ctrl.iremap_lock);
91 return intel;
92 }
94 static void free_intel_iommu(struct intel_iommu *intel)
95 {
96 xfree(intel);
97 }
99 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
100 {
101 return iommu ? &iommu->intel->qi_ctrl : NULL;
102 }
104 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
105 {
106 return iommu ? &iommu->intel->ir_ctrl : NULL;
107 }
109 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
110 {
111 return iommu ? &iommu->intel->flush : NULL;
112 }
114 static unsigned int clflush_size;
115 static int iommus_incoherent;
116 static void __iommu_flush_cache(void *addr, int size)
117 {
118 int i;
120 if ( !iommus_incoherent )
121 return;
123 for ( i = 0; i < size; i += clflush_size )
124 cacheline_flush((char *)addr + i);
125 }
127 void iommu_flush_cache_entry(void *addr)
128 {
129 __iommu_flush_cache(addr, 8);
130 }
132 void iommu_flush_cache_page(void *addr, unsigned long npages)
133 {
134 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
135 }
137 int nr_iommus;
138 /* context entry handling */
139 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
140 {
141 struct root_entry *root, *root_entries;
142 u64 maddr;
144 ASSERT(spin_is_locked(&iommu->lock));
145 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
146 root = &root_entries[bus];
147 if ( !root_present(*root) )
148 {
149 maddr = alloc_pgtable_maddr(NULL, 1);
150 if ( maddr == 0 )
151 {
152 unmap_vtd_domain_page(root_entries);
153 return 0;
154 }
155 set_root_value(*root, maddr);
156 set_root_present(*root);
157 iommu_flush_cache_entry(root);
158 }
159 maddr = (u64) get_context_addr(*root);
160 unmap_vtd_domain_page(root_entries);
161 return maddr;
162 }
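
/*
 * The root table mapped above is indexed by PCI bus number (one root entry
 * per bus); each root entry in turn points to a context table that is
 * indexed by devfn, so every (bus, devfn) pair resolves to its own context
 * entry.
 */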
164 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
165 {
166 struct hvm_iommu *hd = domain_hvm_iommu(domain);
167 int addr_width = agaw_to_width(hd->agaw);
168 struct dma_pte *parent, *pte = NULL;
169 int level = agaw_to_level(hd->agaw);
170 int offset;
171 u64 pte_maddr = 0, maddr;
172 u64 *vaddr = NULL;
174 addr &= (((u64)1) << addr_width) - 1;
175 ASSERT(spin_is_locked(&hd->mapping_lock));
176 if ( hd->pgd_maddr == 0 )
177 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
178 goto out;
180 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
181 while ( level > 1 )
182 {
183 offset = address_level_offset(addr, level);
184 pte = &parent[offset];
186 if ( dma_pte_addr(*pte) == 0 )
187 {
188 if ( !alloc )
189 break;
190 maddr = alloc_pgtable_maddr(domain, 1);
191 if ( !maddr )
192 break;
193 dma_set_pte_addr(*pte, maddr);
194 vaddr = map_vtd_domain_page(maddr);
196 /*
197 * Higher-level tables always set r/w; the last-level
198 * page table controls read/write access.
199 */
200 dma_set_pte_readable(*pte);
201 dma_set_pte_writable(*pte);
202 iommu_flush_cache_entry(pte);
203 }
204 else
205 {
206 vaddr = map_vtd_domain_page(pte->val);
207 }
209 if ( level == 2 )
210 {
211 pte_maddr = pte->val & PAGE_MASK_4K;
212 unmap_vtd_domain_page(vaddr);
213 break;
214 }
216 unmap_vtd_domain_page(parent);
217 parent = (struct dma_pte *)vaddr;
218 vaddr = NULL;
219 level--;
220 }
222 unmap_vtd_domain_page(parent);
223 out:
224 return pte_maddr;
225 }
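
/*
 * Illustrative sketch (not from the original file): address_level_offset()
 * is defined elsewhere, but with 4KiB pages and 9 index bits per level the
 * walk above is assumed to select page-table entries as in the hypothetical
 * helper below.  For example, addr 0x12345678 at level 2 selects entry
 * (0x12345678 >> 21) & 0x1ff = 0x91.
 */
static inline unsigned int example_level_offset(u64 addr, int level)
{
    return (addr >> (PAGE_SHIFT_4K + 9 * (level - 1))) & 0x1ff;
}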
227 static void iommu_flush_write_buffer(struct iommu *iommu)
228 {
229 u32 val;
230 unsigned long flag;
231 s_time_t start_time;
233 if ( !cap_rwbf(iommu->cap) )
234 return;
235 val = iommu->gcmd | DMA_GCMD_WBF;
237 spin_lock_irqsave(&iommu->register_lock, flag);
238 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
240 /* Make sure the hardware completes it */
241 start_time = NOW();
242 for ( ; ; )
243 {
244 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
245 if ( !(val & DMA_GSTS_WBFS) )
246 break;
247 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
248 panic("%s: DMAR hardware is malfunctional,"
249 " please disable IOMMU\n", __func__);
250 cpu_relax();
251 }
252 spin_unlock_irqrestore(&iommu->register_lock, flag);
253 }
255 /* return value determines whether we need a write buffer flush */
256 static int flush_context_reg(
257 void *_iommu,
258 u16 did, u16 source_id, u8 function_mask, u64 type,
259 int non_present_entry_flush)
260 {
261 struct iommu *iommu = (struct iommu *) _iommu;
262 u64 val = 0;
263 unsigned long flag;
264 s_time_t start_time;
266 /*
267 * In the non-present entry flush case: if the hardware does not cache
268 * non-present entries we do nothing; if it does cache them, we flush
269 * the entries of domain 0 (whose domain id is used to tag any cached
270 * non-present entries).
271 */
272 if ( non_present_entry_flush )
273 {
274 if ( !cap_caching_mode(iommu->cap) )
275 return 1;
276 else
277 did = 0;
278 }
280 /* use register invalidation */
281 switch ( type )
282 {
283 case DMA_CCMD_GLOBAL_INVL:
284 val = DMA_CCMD_GLOBAL_INVL;
285 break;
286 case DMA_CCMD_DOMAIN_INVL:
287 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
288 break;
289 case DMA_CCMD_DEVICE_INVL:
290 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
291 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
292 break;
293 default:
294 BUG();
295 }
296 val |= DMA_CCMD_ICC;
298 spin_lock_irqsave(&iommu->register_lock, flag);
299 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
301 /* Make sure the hardware completes it */
302 start_time = NOW();
303 for ( ; ; )
304 {
305 val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
306 if ( !(val & DMA_CCMD_ICC) )
307 break;
308 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
309 panic("%s: DMAR hardware is malfunctional,"
310 " please disable IOMMU\n", __func__);
311 cpu_relax();
312 }
313 spin_unlock_irqrestore(&iommu->register_lock, flag);
314 /* flushing a context entry implicitly flushes the write buffer */
315 return 0;
316 }
318 static int inline iommu_flush_context_global(
319 struct iommu *iommu, int non_present_entry_flush)
320 {
321 struct iommu_flush *flush = iommu_get_flush(iommu);
322 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
323 non_present_entry_flush);
324 }
326 static int inline iommu_flush_context_domain(
327 struct iommu *iommu, u16 did, int non_present_entry_flush)
328 {
329 struct iommu_flush *flush = iommu_get_flush(iommu);
330 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
331 non_present_entry_flush);
332 }
334 static int inline iommu_flush_context_device(
335 struct iommu *iommu, u16 did, u16 source_id,
336 u8 function_mask, int non_present_entry_flush)
337 {
338 struct iommu_flush *flush = iommu_get_flush(iommu);
339 return flush->context(iommu, did, source_id, function_mask,
340 DMA_CCMD_DEVICE_INVL,
341 non_present_entry_flush);
342 }
344 /* return value determines whether we need a write buffer flush */
345 static int flush_iotlb_reg(void *_iommu, u16 did,
346 u64 addr, unsigned int size_order, u64 type,
347 int non_present_entry_flush)
348 {
349 struct iommu *iommu = (struct iommu *) _iommu;
350 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
351 u64 val = 0, val_iva = 0;
352 unsigned long flag;
353 s_time_t start_time;
355 /*
356 * In the non-present entry flush case: if the hardware does not cache
357 * non-present entries we do nothing; if it does cache them, we flush
358 * the entries of domain 0 (whose domain id is used to tag any cached
359 * non-present entries).
360 */
361 if ( non_present_entry_flush )
362 {
363 if ( !cap_caching_mode(iommu->cap) )
364 return 1;
365 else
366 did = 0;
367 }
369 /* use register invalidation */
370 switch ( type )
371 {
372 case DMA_TLB_GLOBAL_FLUSH:
373 /* a global flush does not need to set IVA_REG */
374 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
375 break;
376 case DMA_TLB_DSI_FLUSH:
377 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
378 break;
379 case DMA_TLB_PSI_FLUSH:
380 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
381 /* Note: always flush non-leaf currently */
382 val_iva = size_order | addr;
383 break;
384 default:
385 BUG();
386 }
387 /* Note: set drain read/write */
388 if ( cap_read_drain(iommu->cap) )
389 val |= DMA_TLB_READ_DRAIN;
390 if ( cap_write_drain(iommu->cap) )
391 val |= DMA_TLB_WRITE_DRAIN;
393 spin_lock_irqsave(&iommu->register_lock, flag);
394 /* Note: Only uses first TLB reg currently */
395 if ( val_iva )
396 dmar_writeq(iommu->reg, tlb_offset, val_iva);
397 dmar_writeq(iommu->reg, tlb_offset + 8, val);
399 /* Make sure the hardware completes it */
400 start_time = NOW();
401 for ( ; ; )
402 {
403 val = dmar_readq(iommu->reg, tlb_offset + 8);
404 if ( !(val & DMA_TLB_IVT) )
405 break;
406 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
407 panic("%s: DMAR hardware is malfunctional,"
408 " please disable IOMMU\n", __func__);
409 cpu_relax();
410 }
411 spin_unlock_irqrestore(&iommu->register_lock, flag);
413 /* check IOTLB invalidation granularity */
414 if ( DMA_TLB_IAIG(val) == 0 )
415 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
417 /* flushing an iotlb entry implicitly flushes the write buffer */
418 return 0;
419 }
421 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
422 int non_present_entry_flush)
423 {
424 struct iommu_flush *flush = iommu_get_flush(iommu);
425 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
426 non_present_entry_flush);
427 }
429 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
430 int non_present_entry_flush)
431 {
432 struct iommu_flush *flush = iommu_get_flush(iommu);
433 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
434 non_present_entry_flush);
435 }
437 static int inline get_alignment(u64 base, unsigned int size)
438 {
439 int t = 0;
440 u64 end;
442 end = base + size - 1;
443 while ( base != end )
444 {
445 t++;
446 base >>= 1;
447 end >>= 1;
448 }
449 return t;
450 }
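
/*
 * Worked example (added for illustration, not in the original file):
 * get_alignment(0x12, 4) considers pfns 0x12..0x15; base and end only
 * become equal after three right shifts, so the function returns 3 and
 * the caller flushes a naturally aligned 2^3 = 8 page region covering
 * the requested range.
 */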
452 static int inline iommu_flush_iotlb_psi(
453 struct iommu *iommu, u16 did,
454 u64 addr, unsigned int pages, int non_present_entry_flush)
455 {
456 unsigned int align;
457 struct iommu_flush *flush = iommu_get_flush(iommu);
459 ASSERT(!(addr & (~PAGE_MASK_4K)));
460 ASSERT(pages > 0);
462 /* Fall back to domain-selective flush if PSI is not supported */
463 if ( !cap_pgsel_inv(iommu->cap) )
464 return iommu_flush_iotlb_dsi(iommu, did,
465 non_present_entry_flush);
467 /*
468 * PSI requires that the flush size be a power of two (2^x pages) and
469 * that the base address be naturally aligned to that size.
470 */
471 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
472 /* Fall back to domain-selective flush if the size is too big */
473 if ( align > cap_max_amask_val(iommu->cap) )
474 return iommu_flush_iotlb_dsi(iommu, did,
475 non_present_entry_flush);
477 addr >>= PAGE_SHIFT_4K + align;
478 addr <<= PAGE_SHIFT_4K + align;
480 return flush->iotlb(iommu, did, addr, align,
481 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
482 }
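
/*
 * Continuing the worked example above (illustrative only): with align = 3
 * the shift-down/shift-up pair rounds addr down to a 2^(12+3) = 32KiB
 * boundary, and flush->iotlb() then writes that base together with the
 * order into the IVA register (val_iva = size_order | addr in
 * flush_iotlb_reg()).
 */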
484 void iommu_flush_all(void)
485 {
486 struct acpi_drhd_unit *drhd;
487 struct iommu *iommu;
489 flush_all_cache();
490 for_each_drhd_unit ( drhd )
491 {
492 iommu = drhd->iommu;
493 iommu_flush_context_global(iommu, 0);
494 iommu_flush_iotlb_global(iommu, 0);
495 }
496 }
498 /* clear one page's page table */
499 static void dma_pte_clear_one(struct domain *domain, u64 addr)
500 {
501 struct hvm_iommu *hd = domain_hvm_iommu(domain);
502 struct acpi_drhd_unit *drhd;
503 struct iommu *iommu;
504 struct dma_pte *page = NULL, *pte = NULL;
505 u64 pg_maddr;
507 spin_lock(&hd->mapping_lock);
508 /* get last level pte */
509 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
510 if ( pg_maddr == 0 )
511 {
512 spin_unlock(&hd->mapping_lock);
513 return;
514 }
516 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
517 pte = page + address_level_offset(addr, 1);
519 if ( !dma_pte_present(*pte) )
520 {
521 spin_unlock(&hd->mapping_lock);
522 unmap_vtd_domain_page(page);
523 return;
524 }
526 dma_clear_pte(*pte);
527 spin_unlock(&hd->mapping_lock);
528 iommu_flush_cache_entry(pte);
530 /* No need for pcidevs_lock here; flushing is done on device assign/deassign. */
531 for_each_drhd_unit ( drhd )
532 {
533 iommu = drhd->iommu;
534 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
535 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
536 addr, 1, 0))
537 iommu_flush_write_buffer(iommu);
538 }
540 unmap_vtd_domain_page(page);
541 }
543 static void iommu_free_pagetable(u64 pt_maddr, int level)
544 {
545 int i;
546 struct dma_pte *pt_vaddr, *pte;
547 int next_level = level - 1;
549 if ( pt_maddr == 0 )
550 return;
552 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
554 for ( i = 0; i < PTE_NUM; i++ )
555 {
556 pte = &pt_vaddr[i];
557 if ( !dma_pte_present(*pte) )
558 continue;
560 if ( next_level >= 1 )
561 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
563 dma_clear_pte(*pte);
564 iommu_flush_cache_entry(pte);
565 }
567 unmap_vtd_domain_page(pt_vaddr);
568 free_pgtable_maddr(pt_maddr);
569 }
571 static int iommu_set_root_entry(struct iommu *iommu)
572 {
573 u32 cmd, sts;
574 unsigned long flags;
575 s_time_t start_time;
577 spin_lock(&iommu->lock);
579 if ( iommu->root_maddr == 0 )
580 iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
581 if ( iommu->root_maddr == 0 )
582 {
583 spin_unlock(&iommu->lock);
584 return -ENOMEM;
585 }
587 spin_unlock(&iommu->lock);
588 spin_lock_irqsave(&iommu->register_lock, flags);
589 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
590 cmd = iommu->gcmd | DMA_GCMD_SRTP;
591 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
593 /* Make sure the hardware completes it */
594 start_time = NOW();
595 for ( ; ; )
596 {
597 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
598 if ( sts & DMA_GSTS_RTPS )
599 break;
600 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
601 panic("%s: DMAR hardware is malfunctional,"
602 " please disable IOMMU\n", __func__);
603 cpu_relax();
604 }
606 spin_unlock_irqrestore(&iommu->register_lock, flags);
608 return 0;
609 }
611 static void iommu_enable_translation(struct iommu *iommu)
612 {
613 u32 sts;
614 unsigned long flags;
615 s_time_t start_time;
617 dprintk(XENLOG_INFO VTDPREFIX,
618 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
619 spin_lock_irqsave(&iommu->register_lock, flags);
620 iommu->gcmd |= DMA_GCMD_TE;
621 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
622 /* Make sure the hardware completes it */
623 start_time = NOW();
624 for ( ; ; )
625 {
626 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
627 if ( sts & DMA_GSTS_TES )
628 break;
629 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
630 panic("%s: DMAR hardware is malfunctional,"
631 " please disable IOMMU\n", __func__);
632 cpu_relax();
633 }
635 /* Disable PMRs when VT-d engine takes effect per spec definition */
636 disable_pmr(iommu);
637 spin_unlock_irqrestore(&iommu->register_lock, flags);
638 }
640 int iommu_disable_translation(struct iommu *iommu)
641 {
642 u32 sts;
643 unsigned long flags;
644 s_time_t start_time;
646 spin_lock_irqsave(&iommu->register_lock, flags);
647 iommu->gcmd &= ~ DMA_GCMD_TE;
648 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
650 /* Make sure the hardware completes it */
651 start_time = NOW();
652 for ( ; ; )
653 {
654 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
655 if ( !(sts & DMA_GSTS_TES) )
656 break;
657 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
658 panic("%s: DMAR hardware is malfunctional,"
659 " please disable IOMMU\n", __func__);
660 cpu_relax();
661 }
662 spin_unlock_irqrestore(&iommu->register_lock, flags);
663 return 0;
664 }
666 static struct iommu *vector_to_iommu[NR_VECTORS];
667 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
668 u8 fault_reason, u16 source_id, u64 addr)
669 {
670 dprintk(XENLOG_WARNING VTDPREFIX,
671 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
672 "iommu->reg = %p\n",
673 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
674 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
675 fault_reason, iommu->reg);
677 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
678 if ( fault_reason < 0x20 )
679 print_vtd_entries(iommu, (source_id >> 8),
680 (source_id & 0xff), (addr >> PAGE_SHIFT));
681 #endif
683 return 0;
684 }
686 static void iommu_fault_status(u32 fault_status)
687 {
688 if ( fault_status & DMA_FSTS_PFO )
689 dprintk(XENLOG_ERR VTDPREFIX,
690 "iommu_fault_status: Fault Overflow\n");
691 if ( fault_status & DMA_FSTS_PPF )
692 dprintk(XENLOG_ERR VTDPREFIX,
693 "iommu_fault_status: Primary Pending Fault\n");
694 if ( fault_status & DMA_FSTS_AFO )
695 dprintk(XENLOG_ERR VTDPREFIX,
696 "iommu_fault_status: Advanced Fault Overflow\n");
697 if ( fault_status & DMA_FSTS_APF )
698 dprintk(XENLOG_ERR VTDPREFIX,
699 "iommu_fault_status: Advanced Pending Fault\n");
700 if ( fault_status & DMA_FSTS_IQE )
701 dprintk(XENLOG_ERR VTDPREFIX,
702 "iommu_fault_status: Invalidation Queue Error\n");
703 if ( fault_status & DMA_FSTS_ICE )
704 dprintk(XENLOG_ERR VTDPREFIX,
705 "iommu_fault_status: Invalidation Completion Error\n");
706 if ( fault_status & DMA_FSTS_ITE )
707 dprintk(XENLOG_ERR VTDPREFIX,
708 "iommu_fault_status: Invalidation Time-out Error\n");
709 }
711 #define PRIMARY_FAULT_REG_LEN (16)
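/*
 * Each primary fault recording register is a 16-byte record
 * (PRIMARY_FAULT_REG_LEN).  iommu_page_fault() below reads, per record,
 * the qword at offset 0 (faulting page address), the dword at offset +8
 * (carrying the source-id) and the dword at offset +12 (F bit, fault
 * reason and request type), and clears the record by writing the F bit
 * back to offset +12.
 */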
712 static void iommu_page_fault(int vector, void *dev_id,
713 struct cpu_user_regs *regs)
714 {
715 struct iommu *iommu = dev_id;
716 int reg, fault_index;
717 u32 fault_status;
718 unsigned long flags;
720 dprintk(XENLOG_WARNING VTDPREFIX,
721 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
723 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
725 iommu_fault_status(fault_status);
727 /* FIXME: ignore advanced fault log */
728 if ( !(fault_status & DMA_FSTS_PPF) )
729 goto clear_overflow;
731 fault_index = dma_fsts_fault_record_index(fault_status);
732 reg = cap_fault_reg_offset(iommu->cap);
733 while (1)
734 {
735 u8 fault_reason;
736 u16 source_id;
737 u32 data;
738 u64 guest_addr;
739 int type;
741 /* highest 32 bits */
742 spin_lock_irqsave(&iommu->register_lock, flags);
743 data = dmar_readl(iommu->reg, reg +
744 fault_index * PRIMARY_FAULT_REG_LEN + 12);
745 if ( !(data & DMA_FRCD_F) )
746 {
747 spin_unlock_irqrestore(&iommu->register_lock, flags);
748 break;
749 }
751 fault_reason = dma_frcd_fault_reason(data);
752 type = dma_frcd_type(data);
754 data = dmar_readl(iommu->reg, reg +
755 fault_index * PRIMARY_FAULT_REG_LEN + 8);
756 source_id = dma_frcd_source_id(data);
758 guest_addr = dmar_readq(iommu->reg, reg +
759 fault_index * PRIMARY_FAULT_REG_LEN);
760 guest_addr = dma_frcd_page_addr(guest_addr);
761 /* clear the fault */
762 dmar_writel(iommu->reg, reg +
763 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
764 spin_unlock_irqrestore(&iommu->register_lock, flags);
766 iommu_page_fault_do_one(iommu, type, fault_reason,
767 source_id, guest_addr);
769 fault_index++;
770 if ( fault_index >= cap_num_fault_regs(iommu->cap) )
771 fault_index = 0;
772 }
773 clear_overflow:
774 /* clear primary fault overflow */
775 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
776 if ( fault_status & DMA_FSTS_PFO )
777 {
778 spin_lock_irqsave(&iommu->register_lock, flags);
779 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
780 spin_unlock_irqrestore(&iommu->register_lock, flags);
781 }
782 }
784 static void dma_msi_unmask(unsigned int vector)
785 {
786 struct iommu *iommu = vector_to_iommu[vector];
787 unsigned long flags;
789 /* unmask it */
790 spin_lock_irqsave(&iommu->register_lock, flags);
791 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
792 spin_unlock_irqrestore(&iommu->register_lock, flags);
793 }
795 static void dma_msi_mask(unsigned int vector)
796 {
797 unsigned long flags;
798 struct iommu *iommu = vector_to_iommu[vector];
800 /* mask it */
801 spin_lock_irqsave(&iommu->register_lock, flags);
802 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
803 spin_unlock_irqrestore(&iommu->register_lock, flags);
804 }
806 static unsigned int dma_msi_startup(unsigned int vector)
807 {
808 dma_msi_unmask(vector);
809 return 0;
810 }
812 static void dma_msi_end(unsigned int vector)
813 {
814 dma_msi_unmask(vector);
815 ack_APIC_irq();
816 }
818 static void dma_msi_data_init(struct iommu *iommu, int vector)
819 {
820 u32 msi_data = 0;
821 unsigned long flags;
823 /* Fixed, edge, assert mode. Follow MSI setting */
824 msi_data |= vector & 0xff;
825 msi_data |= 1 << 14;
827 spin_lock_irqsave(&iommu->register_lock, flags);
828 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
829 spin_unlock_irqrestore(&iommu->register_lock, flags);
830 }
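
/*
 * Illustrative note (assumption based on the MSI data register format, not
 * stated in this file): in the value written above, bits 7:0 carry the
 * vector, bits 10:8 select the delivery mode (0 = fixed), and bit 14 is the
 * level-assert bit set by "msi_data |= 1 << 14"; bit 15 (trigger mode) is
 * left as 0, i.e. edge, matching the "Fixed, edge, assert mode" comment.
 */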
832 #ifdef SUPPORT_MSI_REMAPPING
833 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
834 {
835 u64 msi_address;
836 unsigned long flags;
838 /* Physical, dedicated cpu. Follow MSI setting */
839 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
840 msi_address |= MSI_PHYSICAL_MODE << 2;
841 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
842 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
844 spin_lock_irqsave(&iommu->register_lock, flags);
845 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
846 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
847 spin_unlock_irqrestore(&iommu->register_lock, flags);
848 }
849 #else
850 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
851 {
852 /* ia64: TODO */
853 }
854 #endif
856 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
857 {
858 struct iommu *iommu = vector_to_iommu[vector];
859 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
860 }
862 static struct hw_interrupt_type dma_msi_type = {
863 .typename = "DMA_MSI",
864 .startup = dma_msi_startup,
865 .shutdown = dma_msi_mask,
866 .enable = dma_msi_unmask,
867 .disable = dma_msi_mask,
868 .ack = dma_msi_mask,
869 .end = dma_msi_end,
870 .set_affinity = dma_msi_set_affinity,
871 };
873 int iommu_set_interrupt(struct iommu *iommu)
874 {
875 int vector, ret;
877 vector = assign_irq_vector(AUTO_ASSIGN);
878 if ( vector <= 0 )
879 {
880 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
881 return -EINVAL;
882 }
884 irq_desc[vector].handler = &dma_msi_type;
885 ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
886 if ( ret )
887 {
888 irq_desc[vector].handler = &no_irq_type;
889 free_irq_vector(vector);
890 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
891 return ret;
892 }
894 /* Make sure that vector is never re-used. */
895 vector_irq[vector] = NEVER_ASSIGN;
896 vector_to_iommu[vector] = iommu;
898 return vector;
899 }
901 static int iommu_alloc(struct acpi_drhd_unit *drhd)
902 {
903 struct iommu *iommu;
904 unsigned long sagaw;
905 int agaw;
907 if ( nr_iommus >= MAX_IOMMUS )
908 {
909 gdprintk(XENLOG_ERR VTDPREFIX,
910 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
911 return -ENOMEM;
912 }
914 iommu = xmalloc(struct iommu);
915 if ( iommu == NULL )
916 return -ENOMEM;
917 memset(iommu, 0, sizeof(struct iommu));
919 iommu->intel = alloc_intel_iommu();
920 if ( iommu->intel == NULL )
921 {
922 xfree(iommu);
923 return -ENOMEM;
924 }
926 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
927 iommu->index = nr_iommus++;
929 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
930 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
932 /* Calculate number of pagetable levels: between 2 and 4. */
933 sagaw = cap_sagaw(iommu->cap);
934 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
935 if ( test_bit(agaw, &sagaw) )
936 break;
937 if ( agaw < 0 )
938 {
939 gdprintk(XENLOG_ERR VTDPREFIX,
940 "IOMMU: unsupported sagaw %lx\n", sagaw);
941 xfree(iommu);
942 return -ENODEV;
943 }
944 iommu->nr_pt_levels = agaw_to_level(agaw);
946 if ( !ecap_coherent(iommu->ecap) )
947 iommus_incoherent = 1;
949 spin_lock_init(&iommu->lock);
950 spin_lock_init(&iommu->register_lock);
952 drhd->iommu = iommu;
953 return 0;
954 }
956 static void iommu_free(struct acpi_drhd_unit *drhd)
957 {
958 struct iommu *iommu = drhd->iommu;
960 if ( iommu == NULL )
961 return;
963 if ( iommu->root_maddr != 0 )
964 {
965 free_pgtable_maddr(iommu->root_maddr);
966 iommu->root_maddr = 0;
967 }
969 if ( iommu->reg )
970 iounmap(iommu->reg);
972 free_intel_iommu(iommu->intel);
973 free_irq(iommu->vector);
974 xfree(iommu);
976 drhd->iommu = NULL;
977 }
979 #define guestwidth_to_adjustwidth(gaw) ({ \
980 int agaw, r = (gaw - 12) % 9; \
981 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
982 if ( agaw > 64 ) \
983 agaw = 64; \
984 agaw; })
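
/*
 * Worked examples (added for illustration): guestwidth_to_adjustwidth()
 * rounds the guest address width up to the next value of the form 12 + 9*n,
 * capped at 64.  E.g. gaw 32 -> 39, gaw 35 -> 39, gaw 39 -> 39, gaw 48 -> 48.
 */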
986 static int intel_iommu_domain_init(struct domain *d)
987 {
988 struct hvm_iommu *hd = domain_hvm_iommu(d);
989 struct iommu *iommu = NULL;
990 u64 i, j, tmp;
991 struct acpi_drhd_unit *drhd;
993 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
994 iommu = drhd->iommu;
996 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
998 if ( d->domain_id == 0 )
999 {
1000 extern int xen_in_range(paddr_t start, paddr_t end);
1002 /* Set up 1:1 page table for dom0 for all RAM except Xen bits. */
1003 for ( i = 0; i < max_page; i++ )
1005 if ( !page_is_conventional_ram(i) ||
1006 xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) )
1007 continue;
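/*
 * PAGE_SHIFT is the hypervisor's page shift while PAGE_SHIFT_4K is the
 * IOMMU's 4KiB granularity, so tmp below is the number of 4KiB IOMMU
 * pages per hypervisor page (1 on x86; larger on ia64, which uses bigger
 * pages), and each hypervisor frame i is mapped as tmp consecutive 4KiB
 * mappings.
 */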
1009 tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K);
1010 for ( j = 0; j < tmp; j++ )
1011 iommu_map_page(d, (i*tmp+j), (i*tmp+j));
1014 setup_dom0_devices(d);
1015 setup_dom0_rmrr(d);
1017 iommu_flush_all();
1019 for_each_drhd_unit ( drhd )
1021 iommu = drhd->iommu;
1022 iommu_enable_translation(iommu);
1026 return 0;
1029 static int domain_context_mapping_one(
1030 struct domain *domain,
1031 struct iommu *iommu,
1032 u8 bus, u8 devfn)
1034 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1035 struct context_entry *context, *context_entries;
1036 u64 maddr, pgd_maddr;
1037 struct pci_dev *pdev = NULL;
1038 int agaw;
1040 ASSERT(spin_is_locked(&pcidevs_lock));
1041 spin_lock(&iommu->lock);
1042 maddr = bus_to_context_maddr(iommu, bus);
1043 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1044 context = &context_entries[devfn];
1046 if ( context_present(*context) )
1048 int res = 0;
1050 pdev = pci_get_pdev(bus, devfn);
1051 if (!pdev)
1052 res = -ENODEV;
1053 else if (pdev->domain != domain)
1054 res = -EINVAL;
1055 unmap_vtd_domain_page(context_entries);
1056 spin_unlock(&iommu->lock);
1057 return res;
1060 if ( iommu_passthrough &&
1061 ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
1063 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1064 agaw = level_to_agaw(iommu->nr_pt_levels);
1066 else
1068 spin_lock(&hd->mapping_lock);
1070 /* Ensure we have pagetables allocated down to leaf PTE. */
1071 if ( hd->pgd_maddr == 0 )
1073 addr_to_dma_page_maddr(domain, 0, 1);
1074 if ( hd->pgd_maddr == 0 )
1076 nomem:
1077 spin_unlock(&hd->mapping_lock);
1078 spin_unlock(&iommu->lock);
1079 unmap_vtd_domain_page(context_entries);
1080 return -ENOMEM;
1084 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1085 pgd_maddr = hd->pgd_maddr;
1086 for ( agaw = level_to_agaw(4);
1087 agaw != level_to_agaw(iommu->nr_pt_levels);
1088 agaw-- )
1090 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1091 pgd_maddr = dma_pte_addr(*p);
1092 unmap_vtd_domain_page(p);
1093 if ( pgd_maddr == 0 )
1094 goto nomem;
1097 context_set_address_root(*context, pgd_maddr);
1098 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1099 spin_unlock(&hd->mapping_lock);
1102 /*
1103 * Domain id 0 is not valid on Intel's IOMMU; force domain ids to
1104 * be 1-based, as required by the hardware.
1105 */
1106 context_set_domain_id(context, domain);
1107 context_set_address_width(*context, agaw);
1108 context_set_fault_enable(*context);
1109 context_set_present(*context);
1110 iommu_flush_cache_entry(context);
1111 spin_unlock(&iommu->lock);
1113 /* Context entry was previously non-present (with domid 0). */
1114 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1115 DMA_CCMD_MASK_NOBIT, 1) )
1116 iommu_flush_write_buffer(iommu);
1117 else
1118 iommu_flush_iotlb_dsi(iommu, 0, 1);
1120 set_bit(iommu->index, &hd->iommu_bitmap);
1122 unmap_vtd_domain_page(context_entries);
1124 return 0;
1127 #define PCI_BASE_CLASS_BRIDGE 0x06
1128 #define PCI_CLASS_BRIDGE_PCI 0x0604
1130 enum {
1131 DEV_TYPE_PCIe_ENDPOINT,
1132 DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch
1133 DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
1134 DEV_TYPE_PCI,
1135 };
1137 int pdev_type(u8 bus, u8 devfn)
1139 u16 class_device;
1140 u16 status, creg;
1141 int pos;
1142 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1144 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1145 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1147 pos = pci_find_next_cap(bus, devfn,
1148 PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1149 if ( !pos )
1150 return DEV_TYPE_PCI_BRIDGE;
1151 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1152 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1153 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1156 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1157 if ( !(status & PCI_STATUS_CAP_LIST) )
1158 return DEV_TYPE_PCI;
1160 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1161 return DEV_TYPE_PCIe_ENDPOINT;
1163 return DEV_TYPE_PCI;
1166 #define MAX_BUSES 256
1167 static DEFINE_SPINLOCK(bus2bridge_lock);
1168 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
1170 static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1172 int cnt = 0;
1173 *secbus = *bus;
1175 ASSERT(spin_is_locked(&bus2bridge_lock));
1176 if ( !bus2bridge[*bus].map )
1177 return 0;
1179 while ( bus2bridge[*bus].map )
1181 *secbus = *bus;
1182 *devfn = bus2bridge[*bus].devfn;
1183 *bus = bus2bridge[*bus].bus;
1184 if ( cnt++ >= MAX_BUSES )
1185 return 0;
1188 return 1;
1191 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1193 int ret = 0;
1195 if ( *bus == 0 )
1196 /* assume integrated PCI devices in RC have valid requester-id */
1197 return 1;
1199 spin_lock(&bus2bridge_lock);
1200 ret = _find_pcie_endpoint(bus, devfn, secbus);
1201 spin_unlock(&bus2bridge_lock);
1203 return ret;
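
/*
 * Worked example (illustrative, hypothetical topology): for a conventional
 * PCI device at 05:02.1 behind a PCIe-to-PCI bridge at 03:00.0, bus2bridge[5]
 * records that bridge, so find_pcie_endpoint(&bus, &devfn, &secbus) rewrites
 * bus/devfn to the bridge's 03:00.0 and returns secbus = 5.  Requests from
 * such devices arrive tagged with the bridge's id (or devfn 0 on the
 * secondary bus), which is why domain_context_mapping() below maps both.
 */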
1206 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1208 struct acpi_drhd_unit *drhd;
1209 int ret = 0;
1210 u16 sec_bus, sub_bus;
1211 u32 type;
1212 u8 secbus, secdevfn;
1214 drhd = acpi_find_matched_drhd_unit(bus, devfn);
1215 if ( !drhd )
1216 return -ENODEV;
1218 ASSERT(spin_is_locked(&pcidevs_lock));
1220 type = pdev_type(bus, devfn);
1221 switch ( type )
1223 case DEV_TYPE_PCIe_BRIDGE:
1224 break;
1226 case DEV_TYPE_PCI_BRIDGE:
1227 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1228 PCI_SECONDARY_BUS);
1229 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1230 PCI_SUBORDINATE_BUS);
1232 spin_lock(&bus2bridge_lock);
1233 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1235 bus2bridge[sec_bus].map = 1;
1236 bus2bridge[sec_bus].bus = bus;
1237 bus2bridge[sec_bus].devfn = devfn;
1239 spin_unlock(&bus2bridge_lock);
1240 break;
1242 case DEV_TYPE_PCIe_ENDPOINT:
1243 gdprintk(XENLOG_INFO VTDPREFIX,
1244 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1245 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1246 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1247 break;
1249 case DEV_TYPE_PCI:
1250 gdprintk(XENLOG_INFO VTDPREFIX,
1251 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1252 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1254 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1255 if ( ret )
1256 break;
1258 secbus = bus;
1259 secdevfn = devfn;
1260 /* dependent devices mapping */
1261 while ( bus2bridge[bus].map )
1263 secbus = bus;
1264 secdevfn = devfn;
1265 devfn = bus2bridge[bus].devfn;
1266 bus = bus2bridge[bus].bus;
1267 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1268 if ( ret )
1269 return ret;
1272 if ( (secbus != bus) && (secdevfn != 0) )
1273 /*
1274 * The source-id for transactions on non-PCIe buses seems
1275 * to originate from devfn=0 on the secondary bus behind
1276 * the bridge. Map that id as well. The id to use in
1277 * these scenarios is not particularly well documented
1278 * anywhere.
1279 */
1280 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1281 break;
1283 default:
1284 gdprintk(XENLOG_ERR VTDPREFIX,
1285 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1286 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1287 ret = -EINVAL;
1288 break;
1291 return ret;
1294 static int domain_context_unmap_one(
1295 struct domain *domain,
1296 struct iommu *iommu,
1297 u8 bus, u8 devfn)
1299 struct context_entry *context, *context_entries;
1300 u64 maddr;
1302 ASSERT(spin_is_locked(&pcidevs_lock));
1303 spin_lock(&iommu->lock);
1305 maddr = bus_to_context_maddr(iommu, bus);
1306 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1307 context = &context_entries[devfn];
1309 if ( !context_present(*context) )
1311 spin_unlock(&iommu->lock);
1312 unmap_vtd_domain_page(context_entries);
1313 return 0;
1316 context_clear_present(*context);
1317 context_clear_entry(*context);
1318 iommu_flush_cache_entry(context);
1320 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1321 (((u16)bus) << 8) | devfn,
1322 DMA_CCMD_MASK_NOBIT, 0) )
1323 iommu_flush_write_buffer(iommu);
1324 else
1325 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
1327 spin_unlock(&iommu->lock);
1328 unmap_vtd_domain_page(context_entries);
1330 return 0;
1333 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1335 struct acpi_drhd_unit *drhd;
1336 int ret = 0;
1337 u32 type;
1338 u8 secbus, secdevfn;
1340 drhd = acpi_find_matched_drhd_unit(bus, devfn);
1341 if ( !drhd )
1342 return -ENODEV;
1344 type = pdev_type(bus, devfn);
1345 switch ( type )
1347 case DEV_TYPE_PCIe_BRIDGE:
1348 case DEV_TYPE_PCI_BRIDGE:
1349 break;
1351 case DEV_TYPE_PCIe_ENDPOINT:
1352 gdprintk(XENLOG_INFO VTDPREFIX,
1353 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1354 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1355 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1356 break;
1358 case DEV_TYPE_PCI:
1359 gdprintk(XENLOG_INFO VTDPREFIX,
1360 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1361 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1362 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1363 if ( ret )
1364 break;
1366 secbus = bus;
1367 secdevfn = devfn;
1368 /* dependent devices unmapping */
1369 while ( bus2bridge[bus].map )
1371 secbus = bus;
1372 secdevfn = devfn;
1373 devfn = bus2bridge[bus].devfn;
1374 bus = bus2bridge[bus].bus;
1375 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1376 if ( ret )
1377 return ret;
1380 if ( (secbus != bus) && (secdevfn != 0) )
1381 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1382 break;
1384 default:
1385 gdprintk(XENLOG_ERR VTDPREFIX,
1386 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1387 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1388 ret = -EINVAL;
1389 break;
1392 return ret;
1395 static int reassign_device_ownership(
1396 struct domain *source,
1397 struct domain *target,
1398 u8 bus, u8 devfn)
1400 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1401 struct pci_dev *pdev;
1402 struct acpi_drhd_unit *drhd;
1403 struct iommu *pdev_iommu;
1404 int ret, found = 0;
1406 ASSERT(spin_is_locked(&pcidevs_lock));
1407 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1409 if (!pdev)
1410 return -ENODEV;
1412 drhd = acpi_find_matched_drhd_unit(bus, devfn);
1413 pdev_iommu = drhd->iommu;
1414 domain_context_unmap(source, bus, devfn);
1416 ret = domain_context_mapping(target, bus, devfn);
1417 if ( ret )
1418 return ret;
1420 list_move(&pdev->domain_list, &target->arch.pdev_list);
1421 pdev->domain = target;
1423 for_each_pdev ( source, pdev )
1425 drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
1426 if ( drhd->iommu == pdev_iommu )
1428 found = 1;
1429 break;
1433 if ( !found )
1434 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1436 return ret;
1439 void iommu_domain_teardown(struct domain *d)
1441 struct hvm_iommu *hd = domain_hvm_iommu(d);
1443 if ( list_empty(&acpi_drhd_units) )
1444 return;
1446 spin_lock(&hd->mapping_lock);
1447 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1448 hd->pgd_maddr = 0;
1449 spin_unlock(&hd->mapping_lock);
1451 iommu_domid_release(d);
1454 int intel_iommu_map_page(
1455 struct domain *d, unsigned long gfn, unsigned long mfn)
1457 struct hvm_iommu *hd = domain_hvm_iommu(d);
1458 struct acpi_drhd_unit *drhd;
1459 struct iommu *iommu;
1460 struct dma_pte *page = NULL, *pte = NULL;
1461 u64 pg_maddr;
1462 int pte_present;
1464 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1465 iommu = drhd->iommu;
1467 /* do nothing for dom0 if the iommu supports pass-through */
1468 if ( iommu_passthrough &&
1469 ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
1470 return 0;
1472 spin_lock(&hd->mapping_lock);
1474 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1475 if ( pg_maddr == 0 )
1477 spin_unlock(&hd->mapping_lock);
1478 return -ENOMEM;
1480 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1481 pte = page + (gfn & LEVEL_MASK);
1482 pte_present = dma_pte_present(*pte);
1483 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1484 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1486 /* Set the SNP bit in the leaf page-table entry if Snoop Control is available */
1487 if ( iommu_snoop )
1488 dma_set_pte_snp(*pte);
1490 iommu_flush_cache_entry(pte);
1491 spin_unlock(&hd->mapping_lock);
1492 unmap_vtd_domain_page(page);
1494 /*
1495 * No need for pcidevs_lock here, because we flush
1496 * when assigning/deassigning a device.
1497 */
1498 for_each_drhd_unit ( drhd )
1500 iommu = drhd->iommu;
1502 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1503 continue;
1505 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1506 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1507 !pte_present) )
1508 iommu_flush_write_buffer(iommu);
1511 return 0;
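
/*
 * Note on the leaf index above (assumption: LEVEL_MASK masks the low nine
 * index bits, as it is defined outside this file): "page + (gfn & LEVEL_MASK)"
 * selects one of the 512 PTEs in the leaf table returned by
 * addr_to_dma_page_maddr(), e.g. gfn 0x12345 selects entry 0x145.
 */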
1514 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1516 struct acpi_drhd_unit *drhd;
1517 struct iommu *iommu;
1519 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1520 iommu = drhd->iommu;
1522 /* do nothing for dom0 if the iommu supports pass-through */
1523 if ( iommu_passthrough &&
1524 ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
1525 return 0;
1527 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1529 return 0;
1532 static int iommu_prepare_rmrr_dev(struct domain *d,
1533 struct acpi_rmrr_unit *rmrr,
1534 u8 bus, u8 devfn)
1536 int ret = 0;
1537 u64 base, end;
1538 unsigned long base_pfn, end_pfn;
1540 ASSERT(spin_is_locked(&pcidevs_lock));
1541 ASSERT(rmrr->base_address < rmrr->end_address);
1543 base = rmrr->base_address & PAGE_MASK_4K;
1544 base_pfn = base >> PAGE_SHIFT_4K;
1545 end = PAGE_ALIGN_4K(rmrr->end_address);
1546 end_pfn = end >> PAGE_SHIFT_4K;
1548 while ( base_pfn < end_pfn )
1550 intel_iommu_map_page(d, base_pfn, base_pfn);
1551 base_pfn++;
1554 ret = domain_context_mapping(d, bus, devfn);
1556 return ret;
1559 static int intel_iommu_add_device(struct pci_dev *pdev)
1561 struct acpi_rmrr_unit *rmrr;
1562 u16 bdf;
1563 int ret, i;
1565 ASSERT(spin_is_locked(&pcidevs_lock));
1567 if ( !pdev->domain )
1568 return -EINVAL;
1570 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1571 if ( ret )
1573 gdprintk(XENLOG_ERR VTDPREFIX,
1574 "intel_iommu_add_device: context mapping failed\n");
1575 return ret;
1578 for_each_rmrr_device ( rmrr, bdf, i )
1580 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1582 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1583 pdev->bus, pdev->devfn);
1584 if ( ret )
1585 gdprintk(XENLOG_ERR VTDPREFIX,
1586 "intel_iommu_add_device: RMRR mapping failed\n");
1587 break;
1591 return ret;
1594 static int intel_iommu_remove_device(struct pci_dev *pdev)
1596 struct acpi_rmrr_unit *rmrr;
1597 u16 bdf;
1598 int i;
1600 if ( !pdev->domain )
1601 return -EINVAL;
1603 /* If the device belongs to dom0 and has an RMRR, don't remove it
1604 * from dom0, because the BIOS may use the RMRR at boot time.
1605 */
1606 if ( pdev->domain->domain_id == 0 )
1608 for_each_rmrr_device ( rmrr, bdf, i )
1610 if ( PCI_BUS(bdf) == pdev->bus &&
1611 PCI_DEVFN2(bdf) == pdev->devfn )
1612 return 0;
1616 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1619 static void setup_dom0_devices(struct domain *d)
1621 struct hvm_iommu *hd;
1622 struct pci_dev *pdev;
1623 int bus, dev, func;
1624 u32 l;
1626 hd = domain_hvm_iommu(d);
1628 spin_lock(&pcidevs_lock);
1629 for ( bus = 0; bus < 256; bus++ )
1631 for ( dev = 0; dev < 32; dev++ )
1633 for ( func = 0; func < 8; func++ )
1635 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1636 /* some broken boards return 0 or ~0 if a slot is empty: */
1637 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1638 (l == 0x0000ffff) || (l == 0xffff0000) )
1639 continue;
1641 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1642 pdev->domain = d;
1643 list_add(&pdev->domain_list, &d->arch.pdev_list);
1644 domain_context_mapping(d, pdev->bus, pdev->devfn);
1648 spin_unlock(&pcidevs_lock);
1651 void clear_fault_bits(struct iommu *iommu)
1653 u64 val;
1655 val = dmar_readq(
1656 iommu->reg,
1657 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1658 dmar_writeq(
1659 iommu->reg,
1660 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1661 val);
1662 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1665 static int init_vtd_hw(void)
1667 struct acpi_drhd_unit *drhd;
1668 struct iommu *iommu;
1669 struct iommu_flush *flush = NULL;
1670 int vector;
1671 int ret;
1673 for_each_drhd_unit ( drhd )
1675 iommu = drhd->iommu;
1676 ret = iommu_set_root_entry(iommu);
1677 if ( ret )
1679 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1680 return -EIO;
1683 vector = iommu_set_interrupt(iommu);
1684 if ( vector < 0 )
1686 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1687 return vector;
1689 dma_msi_data_init(iommu, vector);
1690 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1691 iommu->vector = vector;
1692 clear_fault_bits(iommu);
1693 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1695 /* initialize flush functions */
1696 flush = iommu_get_flush(iommu);
1697 flush->context = flush_context_reg;
1698 flush->iotlb = flush_iotlb_reg;
1701 for_each_drhd_unit ( drhd )
1703 iommu = drhd->iommu;
1704 if ( qinval_setup(iommu) != 0 )
1705 dprintk(XENLOG_INFO VTDPREFIX,
1706 "Queued Invalidation hardware not found\n");
1709 for_each_drhd_unit ( drhd )
1711 iommu = drhd->iommu;
1712 if ( intremap_setup(iommu) != 0 )
1713 dprintk(XENLOG_INFO VTDPREFIX,
1714 "Interrupt Remapping hardware not found\n");
1717 return 0;
1720 static void setup_dom0_rmrr(struct domain *d)
1722 struct acpi_rmrr_unit *rmrr;
1723 u16 bdf;
1724 int ret, i;
1726 spin_lock(&pcidevs_lock);
1727 for_each_rmrr_device ( rmrr, bdf, i )
1729 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1730 if ( ret )
1731 gdprintk(XENLOG_ERR VTDPREFIX,
1732 "IOMMU: mapping reserved region failed\n");
1734 spin_unlock(&pcidevs_lock);
1737 int intel_vtd_setup(void)
1739 struct acpi_drhd_unit *drhd;
1740 struct iommu *iommu;
1742 if ( !vtd_enabled )
1743 return -ENODEV;
1745 spin_lock_init(&domid_bitmap_lock);
1746 clflush_size = get_cache_line_size();
1748 for_each_drhd_unit ( drhd )
1749 if ( iommu_alloc(drhd) != 0 )
1750 goto error;
1752 /* Allocate IO page directory page for the domain. */
1753 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1754 iommu = drhd->iommu;
1756 /* Allocate domain id bitmap, and set bit 0 as reserved */
1757 domid_bitmap_size = cap_ndoms(iommu->cap);
1758 domid_bitmap = xmalloc_array(unsigned long,
1759 BITS_TO_LONGS(domid_bitmap_size));
1760 if ( domid_bitmap == NULL )
1761 goto error;
1762 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1763 set_bit(0, domid_bitmap);
1765 if ( init_vtd_hw() )
1766 goto error;
1768 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1770 return 0;
1772 error:
1773 for_each_drhd_unit ( drhd )
1774 iommu_free(drhd);
1775 vtd_enabled = 0;
1776 return -ENOMEM;
1779 /*
1780 * If the device isn't owned by dom0, it has already been
1781 * assigned to another domain, or it does not exist.
1782 */
1783 int device_assigned(u8 bus, u8 devfn)
1785 struct pci_dev *pdev;
1787 spin_lock(&pcidevs_lock);
1788 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1789 if (!pdev)
1791 spin_unlock(&pcidevs_lock);
1792 return -1;
1795 spin_unlock(&pcidevs_lock);
1796 return 0;
1799 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1801 struct acpi_rmrr_unit *rmrr;
1802 int ret = 0, i;
1803 struct pci_dev *pdev;
1804 u16 bdf;
1806 if ( list_empty(&acpi_drhd_units) )
1807 return -ENODEV;
1809 ASSERT(spin_is_locked(&pcidevs_lock));
1810 pdev = pci_get_pdev(bus, devfn);
1811 if (!pdev)
1812 return -ENODEV;
1814 if (pdev->domain != dom0)
1816 gdprintk(XENLOG_ERR VTDPREFIX,
1817 "IOMMU: assign a assigned device\n");
1818 return -EBUSY;
1821 ret = reassign_device_ownership(dom0, d, bus, devfn);
1822 if ( ret )
1823 goto done;
1825 /* Set up RMRR identity mappings */
1826 for_each_rmrr_device( rmrr, bdf, i )
1828 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1830 /* FIXME: Because USB RMRR conflicts with guest bios region,
1831 * ignore USB RMRR temporarily.
1832 */
1833 if ( is_usb_device(bus, devfn) )
1835 ret = 0;
1836 goto done;
1839 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1840 if ( ret )
1841 gdprintk(XENLOG_ERR VTDPREFIX,
1842 "IOMMU: mapping reserved region failed\n");
1843 goto done;
1847 done:
1848 return ret;
1851 static int intel_iommu_group_id(u8 bus, u8 devfn)
1853 u8 secbus;
1854 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1855 return PCI_BDF2(bus, devfn);
1856 else
1857 return -1;
1860 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1861 void iommu_suspend(void)
1863 struct acpi_drhd_unit *drhd;
1864 struct iommu *iommu;
1865 u32 i;
1867 if ( !vtd_enabled )
1868 return;
1870 iommu_flush_all();
1872 for_each_drhd_unit ( drhd )
1874 iommu = drhd->iommu;
1875 i = iommu->index;
1877 iommu_state[i][DMAR_FECTL_REG] =
1878 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1879 iommu_state[i][DMAR_FEDATA_REG] =
1880 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1881 iommu_state[i][DMAR_FEADDR_REG] =
1882 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1883 iommu_state[i][DMAR_FEUADDR_REG] =
1884 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1888 void iommu_resume(void)
1890 struct acpi_drhd_unit *drhd;
1891 struct iommu *iommu;
1892 u32 i;
1894 if ( !vtd_enabled )
1895 return;
1897 iommu_flush_all();
1899 if ( init_vtd_hw() != 0 && force_iommu )
1900 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1902 for_each_drhd_unit ( drhd )
1904 iommu = drhd->iommu;
1905 i = iommu->index;
1907 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1908 (u32) iommu_state[i][DMAR_FECTL_REG]);
1909 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1910 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1911 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1912 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1913 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1914 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1915 iommu_enable_translation(iommu);
1919 struct iommu_ops intel_iommu_ops = {
1920 .init = intel_iommu_domain_init,
1921 .add_device = intel_iommu_add_device,
1922 .remove_device = intel_iommu_remove_device,
1923 .assign_device = intel_iommu_assign_device,
1924 .teardown = iommu_domain_teardown,
1925 .map_page = intel_iommu_map_page,
1926 .unmap_page = intel_iommu_unmap_page,
1927 .reassign_device = reassign_device_ownership,
1928 .get_device_group_id = intel_iommu_group_id,
1929 .update_ire_from_apic = io_apic_write_remap_rte,
1930 .update_ire_from_msi = msi_msg_write_remap_rte,
1931 };
1933 /*
1934 * Local variables:
1935 * mode: C
1936 * c-set-style: "BSD"
1937 * c-basic-offset: 4
1938 * tab-width: 4
1939 * indent-tabs-mode: nil
1940 * End:
1941 */