ia64/xen-unstable

view xen/drivers/passthrough/vtd/iommu.c @ 19810:aa472909b39c

vtd: IO NUMA support

This patch adds VT-d RHSA processing for IO NUMA support. The basic
idea is to parse the ACPI RHSA structures to obtain the mapping from
each VT-d unit to its proximity domain. This mapping is then used when
allocating pages for the VT-d hardware data structures.

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jun 23 11:14:24 2009 +0100 (2009-06-23)
parents 2f1fa2215e60
children 180ae4bca33e
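To make the description above concrete before the listing, here is a minimal sketch of the allocation path this changeset introduces. It condenses the logic of alloc_pgtable_maddr() further down in this file and relies on the same Xen-internal helpers used there (drhd_to_rhsa(), alloc_domheap_pages(), page_to_maddr()); the wrapper name is illustrative only and is not part of the changeset.

    /* Illustrative sketch: allocate one VT-d table page on the node of the
     * IOMMU described by 'drhd', mirroring alloc_pgtable_maddr() below. */
    static u64 alloc_vtd_table_page_near_iommu(struct acpi_drhd_unit *drhd)
    {
        /* RHSA entry parsed from ACPI gives this unit's proximity domain. */
        struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
        struct page_info *pg;

        /* Fall back to node 0 when no RHSA entry matches this DRHD unit. */
        pg = alloc_domheap_pages(NULL, 0, rhsa ? rhsa->domain : 0);
        if ( !pg )
            return 0;

        return page_to_maddr(pg);
    }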
line source
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include "iommu.h"
35 #include "dmar.h"
36 #include "extern.h"
37 #include "vtd.h"
39 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
41 int nr_iommus;
42 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
43 static int domid_bitmap_size; /* domain id bitmap size in bits */
44 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
45 static bool_t rwbf_quirk;
47 static void setup_dom0_devices(struct domain *d);
48 static void setup_dom0_rmrr(struct domain *d);
50 #define DID_FIELD_WIDTH 16
51 #define DID_HIGH_OFFSET 8
52 static void context_set_domain_id(struct context_entry *context,
53 struct domain *d)
54 {
55 domid_t iommu_domid = domain_iommu_domid(d);
57 if ( iommu_domid == 0 )
58 {
59 spin_lock(&domid_bitmap_lock);
60 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
61 set_bit(iommu_domid, domid_bitmap);
62 spin_unlock(&domid_bitmap_lock);
63 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
64 }
66 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
67 context->hi |= iommu_domid << DID_HIGH_OFFSET;
68 }
70 static void iommu_domid_release(struct domain *d)
71 {
72 domid_t iommu_domid = domain_iommu_domid(d);
74 if ( iommu_domid != 0 )
75 {
76 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
77 clear_bit(iommu_domid, domid_bitmap);
78 }
79 }
81 static struct intel_iommu *alloc_intel_iommu(void)
82 {
83 struct intel_iommu *intel;
85 intel = xmalloc(struct intel_iommu);
86 if ( intel == NULL )
87 return NULL;
88 memset(intel, 0, sizeof(struct intel_iommu));
90 spin_lock_init(&intel->qi_ctrl.qinval_lock);
91 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
92 spin_lock_init(&intel->ir_ctrl.iremap_lock);
94 return intel;
95 }
97 static void free_intel_iommu(struct intel_iommu *intel)
98 {
99 xfree(intel);
100 }
102 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
103 {
104 return iommu ? &iommu->intel->qi_ctrl : NULL;
105 }
107 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
108 {
109 return iommu ? &iommu->intel->ir_ctrl : NULL;
110 }
112 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
113 {
114 return iommu ? &iommu->intel->flush : NULL;
115 }
117 static unsigned int clflush_size;
118 static int iommus_incoherent;
119 static void __iommu_flush_cache(void *addr, int size)
120 {
121 int i;
123 if ( !iommus_incoherent )
124 return;
126 for ( i = 0; i < size; i += clflush_size )
127 cacheline_flush((char *)addr + i);
128 }
130 void iommu_flush_cache_entry(void *addr)
131 {
132 __iommu_flush_cache(addr, 8);
133 }
135 void iommu_flush_cache_page(void *addr, unsigned long npages)
136 {
137 __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
138 }
140 /* Allocate page table page(s); prefer the NUMA node given by the IOMMU's RHSA proximity domain and return the machine address */
141 u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages)
142 {
143 struct acpi_rhsa_unit *rhsa;
144 struct page_info *pg;
145 u64 *vaddr;
147 rhsa = drhd_to_rhsa(drhd);
148 if ( !rhsa )
149 dprintk(XENLOG_INFO VTDPREFIX,
150 "IOMMU: RHSA == NULL, IO NUMA memory allocation disabled\n");
152 pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
153 rhsa ? rhsa->domain : 0);
154 if ( !pg )
155 return 0;
156 vaddr = map_domain_page(page_to_mfn(pg));
157 if ( !vaddr )
158 return 0;
159 memset(vaddr, 0, PAGE_SIZE * npages);
161 iommu_flush_cache_page(vaddr, npages);
162 unmap_domain_page(vaddr);
164 return page_to_maddr(pg);
165 }
167 /* context entry handling */
168 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
169 {
170 struct acpi_drhd_unit *drhd;
171 struct root_entry *root, *root_entries;
172 u64 maddr;
174 ASSERT(spin_is_locked(&iommu->lock));
175 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
176 root = &root_entries[bus];
177 if ( !root_present(*root) )
178 {
179 drhd = iommu_to_drhd(iommu);
180 maddr = alloc_pgtable_maddr(drhd, 1);
181 if ( maddr == 0 )
182 {
183 unmap_vtd_domain_page(root_entries);
184 return 0;
185 }
186 set_root_value(*root, maddr);
187 set_root_present(*root);
188 iommu_flush_cache_entry(root);
189 }
190 maddr = (u64) get_context_addr(*root);
191 unmap_vtd_domain_page(root_entries);
192 return maddr;
193 }
195 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
196 {
197 struct acpi_drhd_unit *drhd;
198 struct pci_dev *pdev;
199 struct hvm_iommu *hd = domain_hvm_iommu(domain);
200 int addr_width = agaw_to_width(hd->agaw);
201 struct dma_pte *parent, *pte = NULL;
202 int level = agaw_to_level(hd->agaw);
203 int offset;
204 u64 pte_maddr = 0, maddr;
205 u64 *vaddr = NULL;
207 addr &= (((u64)1) << addr_width) - 1;
208 ASSERT(spin_is_locked(&hd->mapping_lock));
209 if ( hd->pgd_maddr == 0 )
210 {
211 pdev = pci_get_pdev_by_domain(domain, -1, -1);
212 drhd = acpi_find_matched_drhd_unit(pdev);
213 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(drhd, 1)) == 0) )
214 goto out;
215 }
217 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
218 while ( level > 1 )
219 {
220 offset = address_level_offset(addr, level);
221 pte = &parent[offset];
223 if ( dma_pte_addr(*pte) == 0 )
224 {
225 if ( !alloc )
226 break;
228 pdev = pci_get_pdev_by_domain(domain, -1, -1);
229 drhd = acpi_find_matched_drhd_unit(pdev);
230 maddr = alloc_pgtable_maddr(drhd, 1);
231 if ( !maddr )
232 break;
234 dma_set_pte_addr(*pte, maddr);
235 vaddr = map_vtd_domain_page(maddr);
237 /*
238 * Higher-level tables always set r/w; the last-level
239 * page table controls the actual read/write permissions.
240 */
241 dma_set_pte_readable(*pte);
242 dma_set_pte_writable(*pte);
243 iommu_flush_cache_entry(pte);
244 }
245 else
246 {
247 vaddr = map_vtd_domain_page(pte->val);
248 }
250 if ( level == 2 )
251 {
252 pte_maddr = pte->val & PAGE_MASK_4K;
253 unmap_vtd_domain_page(vaddr);
254 break;
255 }
257 unmap_vtd_domain_page(parent);
258 parent = (struct dma_pte *)vaddr;
259 vaddr = NULL;
260 level--;
261 }
263 unmap_vtd_domain_page(parent);
264 out:
265 return pte_maddr;
266 }
268 static void iommu_flush_write_buffer(struct iommu *iommu)
269 {
270 u32 val;
271 unsigned long flags;
273 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
274 return;
276 spin_lock_irqsave(&iommu->register_lock, flags);
277 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
278 dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
280 /* Make sure hardware completes it */
281 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
282 !(val & DMA_GSTS_WBFS), val);
284 spin_unlock_irqrestore(&iommu->register_lock, flags);
285 }
287 /* return value determines whether a write buffer flush is needed */
288 static int flush_context_reg(
289 void *_iommu,
290 u16 did, u16 source_id, u8 function_mask, u64 type,
291 int flush_non_present_entry)
292 {
293 struct iommu *iommu = (struct iommu *) _iommu;
294 u64 val = 0;
295 unsigned long flags;
297 /*
298 * In the non-present entry flush case: if the hardware doesn't cache
299 * non-present entries we do nothing; if it does cache them, we flush
300 * the entries of domain 0 (the domain id used to cache any
301 * non-present entries).
302 */
303 if ( flush_non_present_entry )
304 {
305 if ( !cap_caching_mode(iommu->cap) )
306 return 1;
307 else
308 did = 0;
309 }
311 /* use register invalidation */
312 switch ( type )
313 {
314 case DMA_CCMD_GLOBAL_INVL:
315 val = DMA_CCMD_GLOBAL_INVL;
316 break;
317 case DMA_CCMD_DOMAIN_INVL:
318 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
319 break;
320 case DMA_CCMD_DEVICE_INVL:
321 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
322 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
323 break;
324 default:
325 BUG();
326 }
327 val |= DMA_CCMD_ICC;
329 spin_lock_irqsave(&iommu->register_lock, flags);
330 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
332 /* Make sure hardware completes it */
333 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
334 !(val & DMA_CCMD_ICC), val);
336 spin_unlock_irqrestore(&iommu->register_lock, flags);
337 /* flushing a context entry implicitly flushes the write buffer */
338 return 0;
339 }
341 static int inline iommu_flush_context_global(
342 struct iommu *iommu, int flush_non_present_entry)
343 {
344 struct iommu_flush *flush = iommu_get_flush(iommu);
345 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
346 flush_non_present_entry);
347 }
349 static int inline iommu_flush_context_domain(
350 struct iommu *iommu, u16 did, int flush_non_present_entry)
351 {
352 struct iommu_flush *flush = iommu_get_flush(iommu);
353 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
354 flush_non_present_entry);
355 }
357 static int inline iommu_flush_context_device(
358 struct iommu *iommu, u16 did, u16 source_id,
359 u8 function_mask, int flush_non_present_entry)
360 {
361 struct iommu_flush *flush = iommu_get_flush(iommu);
362 return flush->context(iommu, did, source_id, function_mask,
363 DMA_CCMD_DEVICE_INVL,
364 flush_non_present_entry);
365 }
367 /* return value determines whether a write buffer flush is needed */
368 static int flush_iotlb_reg(void *_iommu, u16 did,
369 u64 addr, unsigned int size_order, u64 type,
370 int flush_non_present_entry, int flush_dev_iotlb)
371 {
372 struct iommu *iommu = (struct iommu *) _iommu;
373 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
374 u64 val = 0, val_iva = 0;
375 unsigned long flags;
377 /*
378 * In the non-present entry flush case: if the hardware doesn't cache
379 * non-present entries we do nothing; if it does cache them, we flush
380 * the entries of domain 0 (the domain id used to cache any
381 * non-present entries).
382 */
383 if ( flush_non_present_entry )
384 {
385 if ( !cap_caching_mode(iommu->cap) )
386 return 1;
387 else
388 did = 0;
389 }
391 /* use register invalidation */
392 switch ( type )
393 {
394 case DMA_TLB_GLOBAL_FLUSH:
395 /* global flush doesn't need to set IVA_REG */
396 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
397 break;
398 case DMA_TLB_DSI_FLUSH:
399 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
400 break;
401 case DMA_TLB_PSI_FLUSH:
402 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
403 /* Note: always flush non-leaf currently */
404 val_iva = size_order | addr;
405 break;
406 default:
407 BUG();
408 }
409 /* Note: set drain read/write */
410 if ( cap_read_drain(iommu->cap) )
411 val |= DMA_TLB_READ_DRAIN;
412 if ( cap_write_drain(iommu->cap) )
413 val |= DMA_TLB_WRITE_DRAIN;
415 spin_lock_irqsave(&iommu->register_lock, flags);
416 /* Note: Only uses first TLB reg currently */
417 if ( val_iva )
418 dmar_writeq(iommu->reg, tlb_offset, val_iva);
419 dmar_writeq(iommu->reg, tlb_offset + 8, val);
421 /* Make sure hardware completes it */
422 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
423 !(val & DMA_TLB_IVT), val);
424 spin_unlock_irqrestore(&iommu->register_lock, flags);
426 /* check IOTLB invalidation granularity */
427 if ( DMA_TLB_IAIG(val) == 0 )
428 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
430 /* flushing an iotlb entry implicitly flushes the write buffer */
431 return 0;
432 }
434 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
435 int flush_non_present_entry, int flush_dev_iotlb)
436 {
437 struct iommu_flush *flush = iommu_get_flush(iommu);
438 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
439 flush_non_present_entry, flush_dev_iotlb);
440 }
442 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
443 int flush_non_present_entry, int flush_dev_iotlb)
444 {
445 struct iommu_flush *flush = iommu_get_flush(iommu);
446 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
447 flush_non_present_entry, flush_dev_iotlb);
448 }
450 static int inline get_alignment(u64 base, unsigned int size)
451 {
452 int t = 0;
453 u64 end;
455 end = base + size - 1;
456 while ( base != end )
457 {
458 t++;
459 base >>= 1;
460 end >>= 1;
461 }
462 return t;
463 }
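/*
 * Note on get_alignment(): it returns the order of the smallest naturally
 * aligned power-of-two block covering [base, base + size), by shifting
 * base and end right until they agree. For example, base = 0x1003 pages
 * and size = 2 pages (end = 0x1004) first agree after three shifts, so the
 * result is 3, i.e. an aligned 8-page region; iommu_flush_iotlb_psi()
 * below uses this value as the address mask order for the PSI flush.
 */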
465 static int inline iommu_flush_iotlb_psi(
466 struct iommu *iommu, u16 did, u64 addr, unsigned int pages,
467 int flush_non_present_entry, int flush_dev_iotlb)
468 {
469 unsigned int align;
470 struct iommu_flush *flush = iommu_get_flush(iommu);
472 ASSERT(!(addr & (~PAGE_MASK_4K)));
473 ASSERT(pages > 0);
475 /* Fallback to domain selective flush if no PSI support */
476 if ( !cap_pgsel_inv(iommu->cap) )
477 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
479 /*
480 * PSI requires the number of pages to be a power of two and the base
481 * address to be naturally aligned to that size.
482 */
483 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
484 /* Fallback to domain selective flush if size is too big */
485 if ( align > cap_max_amask_val(iommu->cap) )
486 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
488 addr >>= PAGE_SHIFT_4K + align;
489 addr <<= PAGE_SHIFT_4K + align;
491 return flush->iotlb(iommu, did, addr, align, DMA_TLB_PSI_FLUSH,
492 flush_non_present_entry, flush_dev_iotlb);
493 }
495 void iommu_flush_all(void)
496 {
497 struct acpi_drhd_unit *drhd;
498 struct iommu *iommu;
499 int flush_dev_iotlb;
501 flush_all_cache();
502 for_each_drhd_unit ( drhd )
503 {
504 iommu = drhd->iommu;
505 iommu_flush_context_global(iommu, 0);
506 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
507 iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
508 }
509 }
511 /* clear the leaf page table entry mapping one page */
512 static void dma_pte_clear_one(struct domain *domain, u64 addr)
513 {
514 struct hvm_iommu *hd = domain_hvm_iommu(domain);
515 struct acpi_drhd_unit *drhd;
516 struct iommu *iommu;
517 struct dma_pte *page = NULL, *pte = NULL;
518 u64 pg_maddr;
519 int flush_dev_iotlb;
521 spin_lock(&hd->mapping_lock);
522 /* get last level pte */
523 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
524 if ( pg_maddr == 0 )
525 {
526 spin_unlock(&hd->mapping_lock);
527 return;
528 }
530 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
531 pte = page + address_level_offset(addr, 1);
533 if ( !dma_pte_present(*pte) )
534 {
535 spin_unlock(&hd->mapping_lock);
536 unmap_vtd_domain_page(page);
537 return;
538 }
540 dma_clear_pte(*pte);
541 spin_unlock(&hd->mapping_lock);
542 iommu_flush_cache_entry(pte);
544 /* No need for pcidevs_lock here since flushing is done on device assign/deassign */
545 for_each_drhd_unit ( drhd )
546 {
547 iommu = drhd->iommu;
548 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
549 {
550 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
551 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
552 addr, 1, 0, flush_dev_iotlb) )
553 iommu_flush_write_buffer(iommu);
554 }
555 }
557 unmap_vtd_domain_page(page);
558 }
560 static void iommu_free_pagetable(u64 pt_maddr, int level)
561 {
562 int i;
563 struct dma_pte *pt_vaddr, *pte;
564 int next_level = level - 1;
566 if ( pt_maddr == 0 )
567 return;
569 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
571 for ( i = 0; i < PTE_NUM; i++ )
572 {
573 pte = &pt_vaddr[i];
574 if ( !dma_pte_present(*pte) )
575 continue;
577 if ( next_level >= 1 )
578 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
580 dma_clear_pte(*pte);
581 iommu_flush_cache_entry(pte);
582 }
584 unmap_vtd_domain_page(pt_vaddr);
585 free_pgtable_maddr(pt_maddr);
586 }
588 static int iommu_set_root_entry(struct iommu *iommu)
589 {
590 struct acpi_drhd_unit *drhd;
591 u32 sts;
592 unsigned long flags;
594 spin_lock(&iommu->lock);
596 if ( iommu->root_maddr == 0 )
597 {
598 drhd = iommu_to_drhd(iommu);
599 iommu->root_maddr = alloc_pgtable_maddr(drhd, 1);
600 }
602 if ( iommu->root_maddr == 0 )
603 {
604 spin_unlock(&iommu->lock);
605 return -ENOMEM;
606 }
608 spin_unlock(&iommu->lock);
609 spin_lock_irqsave(&iommu->register_lock, flags);
610 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
612 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
613 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
615 /* Make sure hardware completes it */
616 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
617 (sts & DMA_GSTS_RTPS), sts);
618 spin_unlock_irqrestore(&iommu->register_lock, flags);
620 return 0;
621 }
623 static void iommu_enable_translation(struct iommu *iommu)
624 {
625 u32 sts;
626 unsigned long flags;
628 dprintk(XENLOG_INFO VTDPREFIX,
629 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
630 spin_lock_irqsave(&iommu->register_lock, flags);
631 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
632 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
634 /* Make sure hardware completes it */
635 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
636 (sts & DMA_GSTS_TES), sts);
637 spin_unlock_irqrestore(&iommu->register_lock, flags);
639 /* Disable PMRs when VT-d engine takes effect per spec definition */
640 disable_pmr(iommu);
641 }
643 static void iommu_disable_translation(struct iommu *iommu)
644 {
645 u32 sts;
646 unsigned long flags;
648 spin_lock_irqsave(&iommu->register_lock, flags);
649 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
650 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
652 /* Make sure hardware completes it */
653 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
654 !(sts & DMA_GSTS_TES), sts);
655 spin_unlock_irqrestore(&iommu->register_lock, flags);
656 }
658 static struct iommu *vector_to_iommu[NR_VECTORS];
659 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
660 u8 fault_reason, u16 source_id, u64 addr)
661 {
662 dprintk(XENLOG_WARNING VTDPREFIX,
663 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
664 "iommu->reg = %p\n",
665 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
666 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
667 fault_reason, iommu->reg);
669 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
670 if ( fault_reason < 0x20 )
671 print_vtd_entries(iommu, (source_id >> 8),
672 (source_id & 0xff), (addr >> PAGE_SHIFT));
673 #endif
675 return 0;
676 }
678 static void iommu_fault_status(u32 fault_status)
679 {
680 if ( fault_status & DMA_FSTS_PFO )
681 dprintk(XENLOG_ERR VTDPREFIX,
682 "iommu_fault_status: Fault Overflow\n");
683 if ( fault_status & DMA_FSTS_PPF )
684 dprintk(XENLOG_ERR VTDPREFIX,
685 "iommu_fault_status: Primary Pending Fault\n");
686 if ( fault_status & DMA_FSTS_AFO )
687 dprintk(XENLOG_ERR VTDPREFIX,
688 "iommu_fault_status: Advanced Fault Overflow\n");
689 if ( fault_status & DMA_FSTS_APF )
690 dprintk(XENLOG_ERR VTDPREFIX,
691 "iommu_fault_status: Advanced Pending Fault\n");
692 if ( fault_status & DMA_FSTS_IQE )
693 dprintk(XENLOG_ERR VTDPREFIX,
694 "iommu_fault_status: Invalidation Queue Error\n");
695 if ( fault_status & DMA_FSTS_ICE )
696 dprintk(XENLOG_ERR VTDPREFIX,
697 "iommu_fault_status: Invalidation Completion Error\n");
698 if ( fault_status & DMA_FSTS_ITE )
699 dprintk(XENLOG_ERR VTDPREFIX,
700 "iommu_fault_status: Invalidation Time-out Error\n");
701 }
703 #define PRIMARY_FAULT_REG_LEN (16)
704 static void iommu_page_fault(int vector, void *dev_id,
705 struct cpu_user_regs *regs)
706 {
707 struct iommu *iommu = dev_id;
708 int reg, fault_index;
709 u32 fault_status;
710 unsigned long flags;
712 dprintk(XENLOG_WARNING VTDPREFIX,
713 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
715 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
717 iommu_fault_status(fault_status);
719 /* FIXME: ignore advanced fault log */
720 if ( !(fault_status & DMA_FSTS_PPF) )
721 goto clear_overflow;
723 fault_index = dma_fsts_fault_record_index(fault_status);
724 reg = cap_fault_reg_offset(iommu->cap);
725 while (1)
726 {
727 u8 fault_reason;
728 u16 source_id;
729 u32 data;
730 u64 guest_addr;
731 int type;
733 /* highest 32 bits */
734 spin_lock_irqsave(&iommu->register_lock, flags);
735 data = dmar_readl(iommu->reg, reg +
736 fault_index * PRIMARY_FAULT_REG_LEN + 12);
737 if ( !(data & DMA_FRCD_F) )
738 {
739 spin_unlock_irqrestore(&iommu->register_lock, flags);
740 break;
741 }
743 fault_reason = dma_frcd_fault_reason(data);
744 type = dma_frcd_type(data);
746 data = dmar_readl(iommu->reg, reg +
747 fault_index * PRIMARY_FAULT_REG_LEN + 8);
748 source_id = dma_frcd_source_id(data);
750 guest_addr = dmar_readq(iommu->reg, reg +
751 fault_index * PRIMARY_FAULT_REG_LEN);
752 guest_addr = dma_frcd_page_addr(guest_addr);
753 /* clear the fault */
754 dmar_writel(iommu->reg, reg +
755 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
756 spin_unlock_irqrestore(&iommu->register_lock, flags);
758 iommu_page_fault_do_one(iommu, type, fault_reason,
759 source_id, guest_addr);
761 fault_index++;
762 if ( fault_index > cap_num_fault_regs(iommu->cap) )
763 fault_index = 0;
764 }
765 clear_overflow:
766 /* clear primary fault overflow */
767 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
768 if ( fault_status & DMA_FSTS_PFO )
769 {
770 spin_lock_irqsave(&iommu->register_lock, flags);
771 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
772 spin_unlock_irqrestore(&iommu->register_lock, flags);
773 }
774 }
776 static void dma_msi_unmask(unsigned int vector)
777 {
778 struct iommu *iommu = vector_to_iommu[vector];
779 unsigned long flags;
781 /* unmask it */
782 spin_lock_irqsave(&iommu->register_lock, flags);
783 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
784 spin_unlock_irqrestore(&iommu->register_lock, flags);
785 }
787 static void dma_msi_mask(unsigned int vector)
788 {
789 unsigned long flags;
790 struct iommu *iommu = vector_to_iommu[vector];
792 /* mask it */
793 spin_lock_irqsave(&iommu->register_lock, flags);
794 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
795 spin_unlock_irqrestore(&iommu->register_lock, flags);
796 }
798 static unsigned int dma_msi_startup(unsigned int vector)
799 {
800 dma_msi_unmask(vector);
801 return 0;
802 }
804 static void dma_msi_end(unsigned int vector)
805 {
806 dma_msi_unmask(vector);
807 ack_APIC_irq();
808 }
810 static void dma_msi_data_init(struct iommu *iommu, int vector)
811 {
812 u32 msi_data = 0;
813 unsigned long flags;
815 /* Fixed, edge, assert mode. Follow MSI setting */
816 msi_data |= vector & 0xff;
817 msi_data |= 1 << 14;
819 spin_lock_irqsave(&iommu->register_lock, flags);
820 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
821 spin_unlock_irqrestore(&iommu->register_lock, flags);
822 }
824 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
825 {
826 u64 msi_address;
827 unsigned long flags;
829 /* Physical, dedicated cpu. Follow MSI setting */
830 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
831 msi_address |= MSI_PHYSICAL_MODE << 2;
832 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
833 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
835 spin_lock_irqsave(&iommu->register_lock, flags);
836 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
837 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
838 spin_unlock_irqrestore(&iommu->register_lock, flags);
839 }
841 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
842 {
843 struct iommu *iommu = vector_to_iommu[vector];
844 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
845 }
847 static struct hw_interrupt_type dma_msi_type = {
848 .typename = "DMA_MSI",
849 .startup = dma_msi_startup,
850 .shutdown = dma_msi_mask,
851 .enable = dma_msi_unmask,
852 .disable = dma_msi_mask,
853 .ack = dma_msi_mask,
854 .end = dma_msi_end,
855 .set_affinity = dma_msi_set_affinity,
856 };
858 static int iommu_set_interrupt(struct iommu *iommu)
859 {
859 {
860 int vector, ret;
862 vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
863 if ( vector <= 0 )
864 {
865 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
866 return -EINVAL;
867 }
869 irq_desc[vector].handler = &dma_msi_type;
870 vector_to_iommu[vector] = iommu;
871 ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
872 if ( ret )
873 {
874 irq_desc[vector].handler = &no_irq_type;
875 vector_to_iommu[vector] = NULL;
876 free_irq_vector(vector);
877 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
878 return ret;
879 }
881 /* Make sure that vector is never re-used. */
882 vector_irq[vector] = NEVER_ASSIGN_IRQ;
884 return vector;
885 }
887 static int iommu_alloc(struct acpi_drhd_unit *drhd)
888 {
888 {
889 struct iommu *iommu;
890 unsigned long sagaw;
891 int agaw;
893 if ( nr_iommus > MAX_IOMMUS )
894 {
895 gdprintk(XENLOG_ERR VTDPREFIX,
896 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
897 return -ENOMEM;
898 }
900 iommu = xmalloc(struct iommu);
901 if ( iommu == NULL )
902 return -ENOMEM;
903 memset(iommu, 0, sizeof(struct iommu));
905 iommu->vector = -1; /* No vector assigned yet. */
907 iommu->intel = alloc_intel_iommu();
908 if ( iommu->intel == NULL )
909 {
910 xfree(iommu);
911 return -ENOMEM;
912 }
914 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
915 iommu->index = nr_iommus++;
917 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
918 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
920 gdprintk(XENLOG_INFO VTDPREFIX,
921 "drhd->address = %"PRIx64"\n", drhd->address);
922 gdprintk(XENLOG_INFO VTDPREFIX, "iommu->reg = %p\n", iommu->reg);
924 /* Calculate number of pagetable levels: between 2 and 4. */
925 sagaw = cap_sagaw(iommu->cap);
926 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
927 if ( test_bit(agaw, &sagaw) )
928 break;
929 if ( agaw < 0 )
930 {
931 gdprintk(XENLOG_ERR VTDPREFIX,
932 "IOMMU: unsupported sagaw %lx\n", sagaw);
933 xfree(iommu);
934 return -ENODEV;
935 }
936 iommu->nr_pt_levels = agaw_to_level(agaw);
938 if ( !ecap_coherent(iommu->ecap) )
939 iommus_incoherent = 1;
941 spin_lock_init(&iommu->lock);
942 spin_lock_init(&iommu->register_lock);
944 drhd->iommu = iommu;
945 return 0;
946 }
948 static void iommu_free(struct acpi_drhd_unit *drhd)
949 {
949 {
950 struct iommu *iommu = drhd->iommu;
952 if ( iommu == NULL )
953 return;
955 if ( iommu->root_maddr != 0 )
956 {
957 free_pgtable_maddr(iommu->root_maddr);
958 iommu->root_maddr = 0;
959 }
961 if ( iommu->reg )
962 iounmap(iommu->reg);
964 free_intel_iommu(iommu->intel);
965 release_irq_vector(iommu->vector);
966 xfree(iommu);
968 drhd->iommu = NULL;
969 }
971 #define guestwidth_to_adjustwidth(gaw) ({ \
972 int agaw, r = (gaw - 12) % 9; \
973 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
974 if ( agaw > 64 ) \
975 agaw = 64; \
976 agaw; })
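/*
 * guestwidth_to_adjustwidth() above rounds a guest address width up to the
 * next adjusted guest address width usable by VT-d page tables, i.e. 12 bits
 * plus a multiple of 9 (one page-table level per 9 bits), capped at 64.
 * For example a 32-bit guest width is rounded up to 39 bits (3 levels).
 */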
978 static int intel_iommu_domain_init(struct domain *d)
979 {
980 struct hvm_iommu *hd = domain_hvm_iommu(d);
981 struct iommu *iommu = NULL;
982 struct acpi_drhd_unit *drhd;
984 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
985 iommu = drhd->iommu;
987 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
989 if ( d->domain_id == 0 )
990 {
991 /* Set up 1:1 page table for dom0 */
992 iommu_set_dom0_mapping(d);
994 setup_dom0_devices(d);
995 setup_dom0_rmrr(d);
997 iommu_flush_all();
999 for_each_drhd_unit ( drhd )
1000 {
1001 iommu = drhd->iommu;
1002 iommu_enable_translation(iommu);
1003 }
1004 }
1006 return 0;
1007 }
1009 static int domain_context_mapping_one(
1010 struct domain *domain,
1011 struct iommu *iommu,
1012 u8 bus, u8 devfn)
1013 {
1014 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1015 struct context_entry *context, *context_entries;
1016 u64 maddr, pgd_maddr;
1017 struct pci_dev *pdev = NULL;
1018 int agaw;
1020 ASSERT(spin_is_locked(&pcidevs_lock));
1021 spin_lock(&iommu->lock);
1022 maddr = bus_to_context_maddr(iommu, bus);
1023 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1024 context = &context_entries[devfn];
1026 if ( context_present(*context) )
1027 {
1028 int res = 0;
1030 pdev = pci_get_pdev(bus, devfn);
1031 if (!pdev)
1032 res = -ENODEV;
1033 else if (pdev->domain != domain)
1034 res = -EINVAL;
1035 unmap_vtd_domain_page(context_entries);
1036 spin_unlock(&iommu->lock);
1037 return res;
1038 }
1040 if ( iommu_passthrough && (domain->domain_id == 0) )
1041 {
1042 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1043 agaw = level_to_agaw(iommu->nr_pt_levels);
1044 }
1045 else
1046 {
1047 spin_lock(&hd->mapping_lock);
1049 /* Ensure we have pagetables allocated down to leaf PTE. */
1050 if ( hd->pgd_maddr == 0 )
1051 {
1052 addr_to_dma_page_maddr(domain, 0, 1);
1053 if ( hd->pgd_maddr == 0 )
1054 {
1055 nomem:
1056 spin_unlock(&hd->mapping_lock);
1057 spin_unlock(&iommu->lock);
1058 unmap_vtd_domain_page(context_entries);
1059 return -ENOMEM;
1060 }
1061 }
1063 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1064 pgd_maddr = hd->pgd_maddr;
1065 for ( agaw = level_to_agaw(4);
1066 agaw != level_to_agaw(iommu->nr_pt_levels);
1067 agaw-- )
1068 {
1069 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1070 pgd_maddr = dma_pte_addr(*p);
1071 unmap_vtd_domain_page(p);
1072 if ( pgd_maddr == 0 )
1073 goto nomem;
1074 }
1076 context_set_address_root(*context, pgd_maddr);
1077 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1078 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1079 else
1080 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1082 spin_unlock(&hd->mapping_lock);
1083 }
1085 /*
1086 * domain_id 0 is not valid on Intel's IOMMU, so context_set_domain_id()
1087 * allocates IDs starting from 1, as required by the hardware.
1088 */
1089 context_set_domain_id(context, domain);
1090 context_set_address_width(*context, agaw);
1091 context_set_fault_enable(*context);
1092 context_set_present(*context);
1093 iommu_flush_cache_entry(context);
1094 spin_unlock(&iommu->lock);
1096 /* Context entry was previously non-present (with domid 0). */
1097 if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1098 DMA_CCMD_MASK_NOBIT, 1) )
1099 iommu_flush_write_buffer(iommu);
1100 else
1101 {
1102 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1103 iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1104 }
1106 set_bit(iommu->index, &hd->iommu_bitmap);
1108 unmap_vtd_domain_page(context_entries);
1110 return 0;
1111 }
1113 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1114 {
1115 struct acpi_drhd_unit *drhd;
1116 int ret = 0;
1117 u32 type;
1118 u8 secbus;
1119 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1121 if ( pdev == NULL )
1122 {
1123 /* We can reach here via setup_dom0_rmrr() -> iommu_prepare_rmrr_dev()
1124 * -> domain_context_mapping().
1125 * If a user enables VT-d but disables USB (which usually needs an
1126 * RMRR) in the BIOS, we can't discover the BDF of the USB controller
1127 * in setup_dom0_devices(), yet the ACPI RMRR structures may still
1128 * contain that BDF, so pci_get_pdev() returns NULL here.
1129 */
1130 gdprintk(XENLOG_WARNING VTDPREFIX,
1131 "domain_context_mapping: can't find bdf = %x:%x.%x\n",
1132 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1133 return 0;
1134 }
1136 drhd = acpi_find_matched_drhd_unit(pdev);
1137 if ( !drhd )
1138 return -ENODEV;
1140 ASSERT(spin_is_locked(&pcidevs_lock));
1142 type = pdev_type(bus, devfn);
1143 switch ( type )
1144 {
1145 case DEV_TYPE_PCIe_BRIDGE:
1146 case DEV_TYPE_PCIe2PCI_BRIDGE:
1147 case DEV_TYPE_LEGACY_PCI_BRIDGE:
1148 break;
1150 case DEV_TYPE_PCIe_ENDPOINT:
1151 gdprintk(XENLOG_INFO VTDPREFIX,
1152 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1153 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1154 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1155 break;
1157 case DEV_TYPE_PCI:
1158 gdprintk(XENLOG_INFO VTDPREFIX,
1159 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1160 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1162 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1163 if ( ret )
1164 break;
1166 if ( find_upstream_bridge(&bus, &devfn, &secbus) < 1 )
1167 break;
1169 /* PCIe to PCI/PCIx bridge */
1170 if ( pdev_type(bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1171 {
1172 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1173 if ( ret )
1174 return ret;
1176 /*
1177 * Devices behind a PCIe-to-PCI/PCIx bridge may generate a
1178 * different requester-id. It may originate from devfn=0
1179 * on the secondary bus behind the bridge. Map that id
1180 * as well.
1181 */
1182 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1183 }
1184 else /* Legacy PCI bridge */
1185 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1187 break;
1189 default:
1190 gdprintk(XENLOG_ERR VTDPREFIX,
1191 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1192 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1193 ret = -EINVAL;
1194 break;
1195 }
1197 return ret;
1198 }
1200 static int domain_context_unmap_one(
1201 struct domain *domain,
1202 struct iommu *iommu,
1203 u8 bus, u8 devfn)
1204 {
1205 struct context_entry *context, *context_entries;
1206 u64 maddr;
1208 ASSERT(spin_is_locked(&pcidevs_lock));
1209 spin_lock(&iommu->lock);
1211 maddr = bus_to_context_maddr(iommu, bus);
1212 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1213 context = &context_entries[devfn];
1215 if ( !context_present(*context) )
1216 {
1217 spin_unlock(&iommu->lock);
1218 unmap_vtd_domain_page(context_entries);
1219 return 0;
1220 }
1222 context_clear_present(*context);
1223 context_clear_entry(*context);
1224 iommu_flush_cache_entry(context);
1226 if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
1227 (((u16)bus) << 8) | devfn,
1228 DMA_CCMD_MASK_NOBIT, 0) )
1229 iommu_flush_write_buffer(iommu);
1230 else
1231 {
1232 int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1233 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0, flush_dev_iotlb);
1234 }
1236 spin_unlock(&iommu->lock);
1237 unmap_vtd_domain_page(context_entries);
1239 return 0;
1240 }
1242 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1243 {
1244 struct acpi_drhd_unit *drhd;
1245 int ret = 0;
1246 u32 type;
1247 u8 secbus;
1248 struct pci_dev *pdev = pci_get_pdev(bus, devfn);
1250 BUG_ON(!pdev);
1252 drhd = acpi_find_matched_drhd_unit(pdev);
1253 if ( !drhd )
1254 return -ENODEV;
1256 type = pdev_type(bus, devfn);
1257 switch ( type )
1258 {
1259 case DEV_TYPE_PCIe_BRIDGE:
1260 case DEV_TYPE_PCIe2PCI_BRIDGE:
1261 case DEV_TYPE_LEGACY_PCI_BRIDGE:
1262 break;
1264 case DEV_TYPE_PCIe_ENDPOINT:
1265 gdprintk(XENLOG_INFO VTDPREFIX,
1266 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
1267 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1268 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1269 break;
1271 case DEV_TYPE_PCI:
1272 gdprintk(XENLOG_INFO VTDPREFIX,
1273 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
1274 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1275 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1276 if ( ret )
1277 break;
1279 if ( find_upstream_bridge(&bus, &devfn, &secbus) < 1 )
1280 break;
1282 /* PCIe to PCI/PCIx bridge */
1283 if ( pdev_type(bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1284 {
1285 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1286 if ( ret )
1287 return ret;
1289 ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1290 }
1291 else /* Legacy PCI bridge */
1292 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1294 break;
1296 default:
1297 gdprintk(XENLOG_ERR VTDPREFIX,
1298 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1299 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1300 ret = -EINVAL;
1301 break;
1302 }
1304 return ret;
1305 }
1307 static int reassign_device_ownership(
1308 struct domain *source,
1309 struct domain *target,
1310 u8 bus, u8 devfn)
1311 {
1312 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1313 struct pci_dev *pdev;
1314 struct acpi_drhd_unit *drhd;
1315 struct iommu *pdev_iommu;
1316 int ret, found = 0;
1318 ASSERT(spin_is_locked(&pcidevs_lock));
1319 pdev = pci_get_pdev_by_domain(source, bus, devfn);
1321 if (!pdev)
1322 return -ENODEV;
1324 drhd = acpi_find_matched_drhd_unit(pdev);
1325 pdev_iommu = drhd->iommu;
1326 domain_context_unmap(source, bus, devfn);
1328 ret = domain_context_mapping(target, bus, devfn);
1329 if ( ret )
1330 return ret;
1332 list_move(&pdev->domain_list, &target->arch.pdev_list);
1333 pdev->domain = target;
1335 for_each_pdev ( source, pdev )
1336 {
1337 drhd = acpi_find_matched_drhd_unit(pdev);
1338 if ( drhd->iommu == pdev_iommu )
1339 {
1340 found = 1;
1341 break;
1342 }
1343 }
1345 if ( !found )
1346 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1348 return ret;
1349 }
1351 void iommu_domain_teardown(struct domain *d)
1352 {
1353 struct hvm_iommu *hd = domain_hvm_iommu(d);
1355 if ( list_empty(&acpi_drhd_units) )
1356 return;
1358 spin_lock(&hd->mapping_lock);
1359 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1360 hd->pgd_maddr = 0;
1361 spin_unlock(&hd->mapping_lock);
1363 iommu_domid_release(d);
1364 }
1366 int intel_iommu_map_page(
1367 struct domain *d, unsigned long gfn, unsigned long mfn)
1368 {
1369 struct hvm_iommu *hd = domain_hvm_iommu(d);
1370 struct acpi_drhd_unit *drhd;
1371 struct iommu *iommu;
1372 struct dma_pte *page = NULL, *pte = NULL;
1373 u64 pg_maddr;
1374 int pte_present;
1375 int flush_dev_iotlb;
1377 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1378 iommu = drhd->iommu;
1380 /* Do nothing for dom0 if the IOMMU supports passthrough. */
1381 if ( iommu_passthrough && (d->domain_id == 0) )
1382 return 0;
1384 spin_lock(&hd->mapping_lock);
1386 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1387 if ( pg_maddr == 0 )
1388 {
1389 spin_unlock(&hd->mapping_lock);
1390 return -ENOMEM;
1391 }
1392 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1393 pte = page + (gfn & LEVEL_MASK);
1394 pte_present = dma_pte_present(*pte);
1395 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1396 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1398 /* Set the SNP bit in the leaf PTE if Snoop Control is available. */
1399 if ( iommu_snoop )
1400 dma_set_pte_snp(*pte);
1402 iommu_flush_cache_entry(pte);
1403 spin_unlock(&hd->mapping_lock);
1404 unmap_vtd_domain_page(page);
1406 /*
1407 * No need for pcidevs_lock here because flushing is done
1408 * when a device is assigned/deassigned.
1409 */
1410 for_each_drhd_unit ( drhd )
1411 {
1412 iommu = drhd->iommu;
1414 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1415 continue;
1417 flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
1418 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1419 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1420 !pte_present, flush_dev_iotlb) )
1421 iommu_flush_write_buffer(iommu);
1422 }
1424 return 0;
1425 }
1427 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1428 {
1429 struct acpi_drhd_unit *drhd;
1430 struct iommu *iommu;
1432 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1433 iommu = drhd->iommu;
1435 /* Do nothing for dom0 if the IOMMU supports passthrough. */
1436 if ( iommu_passthrough && (d->domain_id == 0) )
1437 return 0;
1439 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1441 return 0;
1442 }
1444 static int iommu_prepare_rmrr_dev(struct domain *d,
1445 struct acpi_rmrr_unit *rmrr,
1446 u8 bus, u8 devfn)
1447 {
1448 int ret = 0;
1449 u64 base, end;
1450 unsigned long base_pfn, end_pfn;
1452 ASSERT(spin_is_locked(&pcidevs_lock));
1453 ASSERT(rmrr->base_address < rmrr->end_address);
1455 base = rmrr->base_address & PAGE_MASK_4K;
1456 base_pfn = base >> PAGE_SHIFT_4K;
1457 end = PAGE_ALIGN_4K(rmrr->end_address);
1458 end_pfn = end >> PAGE_SHIFT_4K;
1460 while ( base_pfn < end_pfn )
1461 {
1462 intel_iommu_map_page(d, base_pfn, base_pfn);
1463 base_pfn++;
1464 }
1466 ret = domain_context_mapping(d, bus, devfn);
1468 return ret;
1469 }
1471 static int intel_iommu_add_device(struct pci_dev *pdev)
1472 {
1473 struct acpi_rmrr_unit *rmrr;
1474 u16 bdf;
1475 int ret, i;
1477 ASSERT(spin_is_locked(&pcidevs_lock));
1479 if ( !pdev->domain )
1480 return -EINVAL;
1482 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1483 if ( ret )
1484 {
1485 gdprintk(XENLOG_ERR VTDPREFIX,
1486 "intel_iommu_add_device: context mapping failed\n");
1487 return ret;
1488 }
1490 for_each_rmrr_device ( rmrr, bdf, i )
1491 {
1492 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1493 {
1494 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1495 pdev->bus, pdev->devfn);
1496 if ( ret )
1497 gdprintk(XENLOG_ERR VTDPREFIX,
1498 "intel_iommu_add_device: RMRR mapping failed\n");
1499 break;
1500 }
1501 }
1503 return ret;
1504 }
1506 static int intel_iommu_remove_device(struct pci_dev *pdev)
1507 {
1508 struct acpi_rmrr_unit *rmrr;
1509 u16 bdf;
1510 int i;
1512 if ( !pdev->domain )
1513 return -EINVAL;
1515 /* If the device belongs to dom0 and has an RMRR, don't remove it
1516 * from dom0, because the BIOS may use the RMRR at boot time.
1517 */
1518 if ( pdev->domain->domain_id == 0 )
1519 {
1520 for_each_rmrr_device ( rmrr, bdf, i )
1521 {
1522 if ( PCI_BUS(bdf) == pdev->bus &&
1523 PCI_DEVFN2(bdf) == pdev->devfn )
1524 return 0;
1525 }
1526 }
1528 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1529 }
1531 static void setup_dom0_devices(struct domain *d)
1532 {
1533 struct hvm_iommu *hd;
1534 struct pci_dev *pdev;
1535 int bus, devfn;
1537 hd = domain_hvm_iommu(d);
1539 spin_lock(&pcidevs_lock);
1540 for ( bus = 0; bus < 256; bus++ )
1541 {
1542 for ( devfn = 0; devfn < 256; devfn++ )
1543 {
1544 pdev = pci_get_pdev(bus, devfn);
1545 if ( !pdev )
1546 continue;
1548 pdev->domain = d;
1549 list_add(&pdev->domain_list, &d->arch.pdev_list);
1550 domain_context_mapping(d, pdev->bus, pdev->devfn);
1551 if ( ats_device(0, pdev->bus, pdev->devfn) )
1552 enable_ats_device(0, pdev->bus, pdev->devfn);
1553 }
1554 }
1555 spin_unlock(&pcidevs_lock);
1556 }
1558 void clear_fault_bits(struct iommu *iommu)
1559 {
1560 u64 val;
1561 unsigned long flags;
1563 spin_lock_irqsave(&iommu->register_lock, flags);
1564 val = dmar_readq(
1565 iommu->reg,
1566 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1567 dmar_writeq(
1568 iommu->reg,
1569 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1570 val);
1571 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
1572 spin_unlock_irqrestore(&iommu->register_lock, flags);
1573 }
1575 static int init_vtd_hw(void)
1576 {
1577 struct acpi_drhd_unit *drhd;
1578 struct iommu *iommu;
1579 struct iommu_flush *flush = NULL;
1580 int vector;
1581 int ret;
1582 unsigned long flags;
1584 for_each_drhd_unit ( drhd )
1585 {
1586 iommu = drhd->iommu;
1587 if ( iommu->vector < 0 )
1588 {
1589 vector = iommu_set_interrupt(iommu);
1590 if ( vector < 0 )
1591 {
1592 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
1593 return vector;
1594 }
1595 iommu->vector = vector;
1596 }
1597 dma_msi_data_init(iommu, iommu->vector);
1598 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1599 clear_fault_bits(iommu);
1601 spin_lock_irqsave(&iommu->register_lock, flags);
1602 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1603 spin_unlock_irqrestore(&iommu->register_lock, flags);
1605 /* initialize flush functions */
1606 flush = iommu_get_flush(iommu);
1607 flush->context = flush_context_reg;
1608 flush->iotlb = flush_iotlb_reg;
1609 }
1611 if ( iommu_qinval )
1612 {
1613 for_each_drhd_unit ( drhd )
1614 {
1615 iommu = drhd->iommu;
1616 if ( enable_qinval(iommu) != 0 )
1617 {
1618 dprintk(XENLOG_INFO VTDPREFIX,
1619 "Failed to enable Queued Invalidation!\n");
1620 break;
1621 }
1622 }
1623 }
1625 if ( iommu_intremap )
1626 {
1627 for_each_drhd_unit ( drhd )
1628 {
1629 iommu = drhd->iommu;
1630 if ( enable_intremap(iommu) != 0 )
1631 {
1632 dprintk(XENLOG_INFO VTDPREFIX,
1633 "Failed to enable Interrupt Remapping!\n");
1634 break;
1635 }
1636 }
1637 }
1639 for_each_drhd_unit ( drhd )
1640 {
1641 iommu = drhd->iommu;
1642 ret = iommu_set_root_entry(iommu);
1643 if ( ret )
1644 {
1645 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1646 return -EIO;
1647 }
1648 }
1650 /*
1651 * After setting the root entry, we must globally invalidate the
1652 * context cache and then the IOTLB.
1653 */
1654 iommu_flush_all();
1656 return 0;
1657 }
1659 static void setup_dom0_rmrr(struct domain *d)
1660 {
1661 struct acpi_rmrr_unit *rmrr;
1662 u16 bdf;
1663 int ret, i;
1665 spin_lock(&pcidevs_lock);
1666 for_each_rmrr_device ( rmrr, bdf, i )
1667 {
1668 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1669 if ( ret )
1670 gdprintk(XENLOG_ERR VTDPREFIX,
1671 "IOMMU: mapping reserved region failed\n");
1672 }
1673 spin_unlock(&pcidevs_lock);
1674 }
1676 static void platform_quirks(void)
1677 {
1678 u32 id;
1680 /* Mobile 4 Series Chipset neglects to set RWBF capability. */
1681 id = pci_conf_read32(0, 0, 0, 0);
1682 if ( id == 0x2a408086 )
1683 {
1684 dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
1685 rwbf_quirk = 1;
1686 }
1687 }
1689 int intel_vtd_setup(void)
1690 {
1691 struct acpi_drhd_unit *drhd;
1692 struct iommu *iommu;
1694 if ( !iommu_enabled )
1695 return -ENODEV;
1697 platform_quirks();
1699 spin_lock_init(&domid_bitmap_lock);
1700 clflush_size = get_cache_line_size();
1702 /* We enable the following features only if they are supported by all VT-d
1703 * engines: Snoop Control, DMA passthrough, Queued Invalidation and
1704 * Interrupt Remapping.
1705 */
1706 for_each_drhd_unit ( drhd )
1707 {
1708 if ( iommu_alloc(drhd) != 0 )
1709 goto error;
1711 iommu = drhd->iommu;
1713 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
1714 iommu_snoop = 0;
1716 if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
1717 iommu_passthrough = 0;
1719 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
1720 iommu_qinval = 0;
1722 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
1723 iommu_intremap = 0;
1724 }
1726 if ( !iommu_qinval && iommu_intremap )
1727 {
1728 iommu_intremap = 0;
1729 gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
1730 "since Queued Invalidation isn't supported or enabled.\n");
1731 }
1733 #define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
1734 P(iommu_snoop, "Snoop Control");
1735 P(iommu_passthrough, "DMA Passthrough");
1736 P(iommu_qinval, "Queued Invalidation");
1737 P(iommu_intremap, "Interrupt Remapping");
1738 #undef P
1740 /* Allocate IO page directory page for the domain. */
1741 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1742 iommu = drhd->iommu;
1744 /* Allocate domain id bitmap, and set bit 0 as reserved */
1745 domid_bitmap_size = cap_ndoms(iommu->cap);
1746 domid_bitmap = xmalloc_array(unsigned long,
1747 BITS_TO_LONGS(domid_bitmap_size));
1748 if ( domid_bitmap == NULL )
1749 goto error;
1750 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1751 set_bit(0, domid_bitmap);
1753 scan_pci_devices();
1755 if ( init_vtd_hw() )
1756 goto error;
1758 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1760 return 0;
1762 error:
1763 for_each_drhd_unit ( drhd )
1764 iommu_free(drhd);
1765 iommu_enabled = 0;
1766 iommu_snoop = 0;
1767 iommu_passthrough = 0;
1768 iommu_qinval = 0;
1769 iommu_intremap = 0;
1770 return -ENOMEM;
1771 }
1773 /*
1774 * If the device isn't owned by dom0, it has already been
1775 * assigned to another domain, or it does not exist.
1776 */
1777 int device_assigned(u8 bus, u8 devfn)
1778 {
1779 struct pci_dev *pdev;
1781 spin_lock(&pcidevs_lock);
1782 pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
1783 if (!pdev)
1784 {
1785 spin_unlock(&pcidevs_lock);
1786 return -1;
1787 }
1789 spin_unlock(&pcidevs_lock);
1790 return 0;
1791 }
1793 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1794 {
1795 struct acpi_rmrr_unit *rmrr;
1796 int ret = 0, i;
1797 struct pci_dev *pdev;
1798 u16 bdf;
1800 if ( list_empty(&acpi_drhd_units) )
1801 return -ENODEV;
1803 ASSERT(spin_is_locked(&pcidevs_lock));
1804 pdev = pci_get_pdev(bus, devfn);
1805 if (!pdev)
1806 return -ENODEV;
1808 if (pdev->domain != dom0)
1809 {
1810 gdprintk(XENLOG_ERR VTDPREFIX,
1811 "IOMMU: assign a assigned device\n");
1812 return -EBUSY;
1813 }
1815 ret = reassign_device_ownership(dom0, d, bus, devfn);
1816 if ( ret )
1817 goto done;
1819 /* Setup rmrr identity mapping */
1820 for_each_rmrr_device( rmrr, bdf, i )
1821 {
1822 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1823 {
1824 /* FIXME: Because USB RMRR conflicts with guest bios region,
1825 * ignore USB RMRR temporarily.
1826 */
1827 if ( is_usb_device(bus, devfn) )
1828 {
1829 ret = 0;
1830 goto done;
1831 }
1833 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1834 if ( ret )
1835 gdprintk(XENLOG_ERR VTDPREFIX,
1836 "IOMMU: mapping reserved region failed\n");
1837 goto done;
1838 }
1839 }
1841 done:
1842 return ret;
1843 }
1845 static int intel_iommu_group_id(u8 bus, u8 devfn)
1846 {
1847 u8 secbus;
1848 if ( find_upstream_bridge(&bus, &devfn, &secbus) < 0 )
1849 return -1;
1850 else
1851 return PCI_BDF2(bus, devfn);
1852 }
1854 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1855 void iommu_suspend(void)
1856 {
1857 struct acpi_drhd_unit *drhd;
1858 struct iommu *iommu;
1859 u32 i;
1861 if ( !iommu_enabled )
1862 return;
1864 iommu_flush_all();
1866 for_each_drhd_unit ( drhd )
1867 {
1868 iommu = drhd->iommu;
1869 i = iommu->index;
1871 iommu_state[i][DMAR_FECTL_REG] =
1872 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1873 iommu_state[i][DMAR_FEDATA_REG] =
1874 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1875 iommu_state[i][DMAR_FEADDR_REG] =
1876 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1877 iommu_state[i][DMAR_FEUADDR_REG] =
1878 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1880 iommu_disable_translation(iommu);
1882 if ( iommu_intremap )
1883 disable_intremap(iommu);
1885 if ( iommu_qinval )
1886 disable_qinval(iommu);
1887 }
1888 }
1890 void iommu_resume(void)
1891 {
1892 struct acpi_drhd_unit *drhd;
1893 struct iommu *iommu;
1894 u32 i;
1895 unsigned long flags;
1897 if ( !iommu_enabled )
1898 return;
1900 if ( init_vtd_hw() != 0 && force_iommu )
1901 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1903 for_each_drhd_unit ( drhd )
1904 {
1905 iommu = drhd->iommu;
1906 i = iommu->index;
1908 spin_lock_irqsave(&iommu->register_lock, flags);
1909 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1910 (u32) iommu_state[i][DMAR_FECTL_REG]);
1911 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1912 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1913 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1914 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1915 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1916 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1917 spin_unlock_irqrestore(&iommu->register_lock, flags);
1919 iommu_enable_translation(iommu);
1920 }
1921 }
1923 struct iommu_ops intel_iommu_ops = {
1924 .init = intel_iommu_domain_init,
1925 .add_device = intel_iommu_add_device,
1926 .remove_device = intel_iommu_remove_device,
1927 .assign_device = intel_iommu_assign_device,
1928 .teardown = iommu_domain_teardown,
1929 .map_page = intel_iommu_map_page,
1930 .unmap_page = intel_iommu_unmap_page,
1931 .reassign_device = reassign_device_ownership,
1932 .get_device_group_id = intel_iommu_group_id,
1933 .update_ire_from_apic = io_apic_write_remap_rte,
1934 .update_ire_from_msi = msi_msg_write_remap_rte,
1935 .read_apic_from_ire = io_apic_read_remap_rte,
1936 .read_msi_from_ire = msi_msg_read_remap_rte,
1937 };
1939 /*
1940 * Local variables:
1941 * mode: C
1942 * c-set-style: "BSD"
1943 * c-basic-offset: 4
1944 * tab-width: 4
1945 * indent-tabs-mode: nil
1946 * End:
1947 */