ia64/xen-unstable

xen/drivers/passthrough/vtd/iommu.c @ 18803:2604400f75e3

vtd: fix memory allocation from NUMA node for VT-d.

Signed-off-by: Yuji Shimada <shimada-yxb@necst.nec.co.jp>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Nov 18 10:52:42 2008 +0000 (2008-11-18)
parents 36bda0bb805f
children 1975e33b79f1
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
20 */
22 #include <xen/irq.h>
23 #include <xen/sched.h>
24 #include <xen/xmalloc.h>
25 #include <xen/domain_page.h>
26 #include <xen/iommu.h>
27 #include <asm/hvm/iommu.h>
28 #include <xen/numa.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include "iommu.h"
34 #include "dmar.h"
35 #include "extern.h"
36 #include "vtd.h"
38 #define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
40 static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */
41 static int domid_bitmap_size; /* domain id bitmap size in bits */
42 static unsigned long *domid_bitmap; /* iommu domain id bitmap */
44 static void setup_dom0_devices(struct domain *d);
45 static void setup_dom0_rmrr(struct domain *d);
47 #define DID_FIELD_WIDTH 16
48 #define DID_HIGH_OFFSET 8
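/*
 * context_set_domain_id(): on first use, allocate a hardware domain id for
 * the domain from domid_bitmap (id 0 is reserved), then program it into the
 * DID field of the context entry (bits 8-23 of the high word).
 */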
49 static void context_set_domain_id(struct context_entry *context,
50 struct domain *d)
51 {
52 unsigned long flags;
53 domid_t iommu_domid = domain_iommu_domid(d);
55 if ( iommu_domid == 0 )
56 {
57 spin_lock_irqsave(&domid_bitmap_lock, flags);
58 iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
59 set_bit(iommu_domid, domid_bitmap);
60 spin_unlock_irqrestore(&domid_bitmap_lock, flags);
61 d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
62 }
64 context->hi &= (1 << DID_HIGH_OFFSET) - 1;
65 context->hi |= iommu_domid << DID_HIGH_OFFSET;
66 }
68 static void iommu_domid_release(struct domain *d)
69 {
70 domid_t iommu_domid = domain_iommu_domid(d);
72 if ( iommu_domid != 0 )
73 {
74 d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
75 clear_bit(iommu_domid, domid_bitmap);
76 }
77 }
79 static struct intel_iommu *alloc_intel_iommu(void)
80 {
81 struct intel_iommu *intel;
83 intel = xmalloc(struct intel_iommu);
84 if ( intel == NULL )
85 return NULL;
86 memset(intel, 0, sizeof(struct intel_iommu));
88 spin_lock_init(&intel->qi_ctrl.qinval_lock);
89 spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
90 spin_lock_init(&intel->ir_ctrl.iremap_lock);
92 return intel;
93 }
95 static void free_intel_iommu(struct intel_iommu *intel)
96 {
97 xfree(intel);
98 }
100 struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
101 {
102 return iommu ? &iommu->intel->qi_ctrl : NULL;
103 }
105 struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
106 {
107 return iommu ? &iommu->intel->ir_ctrl : NULL;
108 }
110 struct iommu_flush *iommu_get_flush(struct iommu *iommu)
111 {
112 return iommu ? &iommu->intel->flush : NULL;
113 }
115 static unsigned int clflush_size;
116 static int iommus_incoherent;
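/*
 * If any IOMMU reports a non-coherent page walk (ECAP.C clear), CPU updates
 * to root/context/page-table entries must be written back with clflush so
 * the remapping hardware observes them.
 */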
117 static void __iommu_flush_cache(void *addr, int size)
118 {
119 int i;
121 if ( !iommus_incoherent )
122 return;
124 for ( i = 0; i < size; i += clflush_size )
125 cacheline_flush((char *)addr + i);
126 }
128 void iommu_flush_cache_entry(void *addr)
129 {
130 __iommu_flush_cache(addr, 8);
131 }
133 void iommu_flush_cache_page(void *addr)
134 {
135 __iommu_flush_cache(addr, PAGE_SIZE_4K);
136 }
138 int nr_iommus;
139 /* context entry handling */
140 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
141 {
142 struct root_entry *root, *root_entries;
143 unsigned long flags;
144 u64 maddr;
146 spin_lock_irqsave(&iommu->lock, flags);
147 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
148 root = &root_entries[bus];
149 if ( !root_present(*root) )
150 {
151 maddr = alloc_pgtable_maddr(NULL);
152 if ( maddr == 0 )
153 {
154 unmap_vtd_domain_page(root_entries);
155 spin_unlock_irqrestore(&iommu->lock, flags);
156 return 0;
157 }
158 set_root_value(*root, maddr);
159 set_root_present(*root);
160 iommu_flush_cache_entry(root);
161 }
162 maddr = (u64) get_context_addr(*root);
163 unmap_vtd_domain_page(root_entries);
164 spin_unlock_irqrestore(&iommu->lock, flags);
165 return maddr;
166 }
168 static int device_context_mapped(struct iommu *iommu, u8 bus, u8 devfn)
169 {
170 struct root_entry *root, *root_entries;
171 struct context_entry *context;
172 u64 context_maddr;
173 int ret;
174 unsigned long flags;
176 spin_lock_irqsave(&iommu->lock, flags);
177 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
178 root = &root_entries[bus];
179 if ( !root_present(*root) )
180 {
181 ret = 0;
182 goto out;
183 }
184 context_maddr = get_context_addr(*root);
185 context = (struct context_entry *)map_vtd_domain_page(context_maddr);
186 ret = context_present(context[devfn]);
187 unmap_vtd_domain_page(context);
188 out:
189 unmap_vtd_domain_page(root_entries);
190 spin_unlock_irqrestore(&iommu->lock, flags);
191 return ret;
192 }
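/*
 * Walk the VT-d page-table hierarchy for a DMA address and return the
 * machine address of the level-1 (4KB leaf) page table covering it. With
 * alloc != 0, missing intermediate tables are allocated on the way down;
 * otherwise 0 is returned if the path is incomplete.
 */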
194 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
195 {
196 struct hvm_iommu *hd = domain_hvm_iommu(domain);
197 int addr_width = agaw_to_width(hd->agaw);
198 struct dma_pte *parent, *pte = NULL;
199 int level = agaw_to_level(hd->agaw);
200 int offset;
201 unsigned long flags;
202 u64 pte_maddr = 0, maddr;
203 u64 *vaddr = NULL;
205 addr &= (((u64)1) << addr_width) - 1;
206 spin_lock_irqsave(&hd->mapping_lock, flags);
207 if ( hd->pgd_maddr == 0 )
208 if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) )
209 goto out;
211 parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
212 while ( level > 1 )
213 {
214 offset = address_level_offset(addr, level);
215 pte = &parent[offset];
217 if ( dma_pte_addr(*pte) == 0 )
218 {
219 if ( !alloc )
220 break;
221 maddr = alloc_pgtable_maddr(domain);
222 if ( !maddr )
223 break;
224 dma_set_pte_addr(*pte, maddr);
225 vaddr = map_vtd_domain_page(maddr);
227 /*
228 * high level table always sets r/w, last level
229 * page table control read/write
230 */
231 dma_set_pte_readable(*pte);
232 dma_set_pte_writable(*pte);
233 iommu_flush_cache_entry(pte);
234 }
235 else
236 {
237 vaddr = map_vtd_domain_page(pte->val);
238 }
240 if ( level == 2 )
241 {
242 pte_maddr = pte->val & PAGE_MASK_4K;
243 unmap_vtd_domain_page(vaddr);
244 break;
245 }
247 unmap_vtd_domain_page(parent);
248 parent = (struct dma_pte *)vaddr;
249 vaddr = NULL;
250 level--;
251 }
253 unmap_vtd_domain_page(parent);
254 out:
255 spin_unlock_irqrestore(&hd->mapping_lock, flags);
256 return pte_maddr;
257 }
259 static void iommu_flush_write_buffer(struct iommu *iommu)
260 {
261 u32 val;
262 unsigned long flag;
263 s_time_t start_time;
265 if ( !cap_rwbf(iommu->cap) )
266 return;
267 val = iommu->gcmd | DMA_GCMD_WBF;
269 spin_lock_irqsave(&iommu->register_lock, flag);
270 dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
272 /* Make sure hardware complete it */
273 start_time = NOW();
274 for ( ; ; )
275 {
276 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
277 if ( !(val & DMA_GSTS_WBFS) )
278 break;
279 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
280 panic("%s: DMAR hardware is malfunctioning,"
281 " please disable IOMMU\n", __func__);
282 cpu_relax();
283 }
284 spin_unlock_irqrestore(&iommu->register_lock, flag);
285 }
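/*
 * Register-based context-cache invalidation: write the request to
 * DMAR_CCMD_REG and spin until hardware clears the ICC bit, giving up
 * (panic) after DMAR_OPERATION_TIMEOUT.
 */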
287 /* return value determines whether we need a write buffer flush */
288 static int flush_context_reg(
289 void *_iommu,
290 u16 did, u16 source_id, u8 function_mask, u64 type,
291 int non_present_entry_flush)
292 {
293 struct iommu *iommu = (struct iommu *) _iommu;
294 u64 val = 0;
295 unsigned long flag;
296 s_time_t start_time;
298 /*
299 * In the non-present entry flush case: if the hardware does not cache
300 * non-present entries there is nothing to do; if it does (caching
301 * mode), flush the entries of domain 0, since that domain id is used
302 * to tag any cached non-present entries.
303 */
304 if ( non_present_entry_flush )
305 {
306 if ( !cap_caching_mode(iommu->cap) )
307 return 1;
308 else
309 did = 0;
310 }
312 /* use register invalidation */
313 switch ( type )
314 {
315 case DMA_CCMD_GLOBAL_INVL:
316 val = DMA_CCMD_GLOBAL_INVL;
317 break;
318 case DMA_CCMD_DOMAIN_INVL:
319 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
320 break;
321 case DMA_CCMD_DEVICE_INVL:
322 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
323 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
324 break;
325 default:
326 BUG();
327 }
328 val |= DMA_CCMD_ICC;
330 spin_lock_irqsave(&iommu->register_lock, flag);
331 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
333 /* Make sure hardware complete it */
334 start_time = NOW();
335 for ( ; ; )
336 {
337 val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
338 if ( !(val & DMA_CCMD_ICC) )
339 break;
340 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
341 panic("%s: DMAR hardware is malfunctioning,"
342 " please disable IOMMU\n", __func__);
343 cpu_relax();
344 }
345 spin_unlock_irqrestore(&iommu->register_lock, flag);
346 /* flush context entry will implicitly flush write buffer */
347 return 0;
348 }
350 static int inline iommu_flush_context_global(
351 struct iommu *iommu, int non_present_entry_flush)
352 {
353 struct iommu_flush *flush = iommu_get_flush(iommu);
354 return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
355 non_present_entry_flush);
356 }
358 static int inline iommu_flush_context_domain(
359 struct iommu *iommu, u16 did, int non_present_entry_flush)
360 {
361 struct iommu_flush *flush = iommu_get_flush(iommu);
362 return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
363 non_present_entry_flush);
364 }
366 static int inline iommu_flush_context_device(
367 struct iommu *iommu, u16 did, u16 source_id,
368 u8 function_mask, int non_present_entry_flush)
369 {
370 struct iommu_flush *flush = iommu_get_flush(iommu);
371 return flush->context(iommu, did, source_id, function_mask,
372 DMA_CCMD_DEVICE_INVL,
373 non_present_entry_flush);
374 }
376 /* return value determines whether we need a write buffer flush */
377 static int flush_iotlb_reg(void *_iommu, u16 did,
378 u64 addr, unsigned int size_order, u64 type,
379 int non_present_entry_flush)
380 {
381 struct iommu *iommu = (struct iommu *) _iommu;
382 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
383 u64 val = 0, val_iva = 0;
384 unsigned long flag;
385 s_time_t start_time;
387 /*
388 * In the non-present entry flush case: if the hardware does not cache
389 * non-present entries there is nothing to do; if it does (caching
390 * mode), flush the entries of domain 0, since that domain id is used
391 * to tag any cached non-present entries.
392 */
393 if ( non_present_entry_flush )
394 {
395 if ( !cap_caching_mode(iommu->cap) )
396 return 1;
397 else
398 did = 0;
399 }
401 /* use register invalidation */
402 switch ( type )
403 {
404 case DMA_TLB_GLOBAL_FLUSH:
405 /* a global flush does not need IVA_REG set */
406 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
407 break;
408 case DMA_TLB_DSI_FLUSH:
409 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
410 break;
411 case DMA_TLB_PSI_FLUSH:
412 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
413 /* Note: always flush non-leaf currently */
414 val_iva = size_order | addr;
415 break;
416 default:
417 BUG();
418 }
419 /* Note: set drain read/write */
420 if ( cap_read_drain(iommu->cap) )
421 val |= DMA_TLB_READ_DRAIN;
422 if ( cap_write_drain(iommu->cap) )
423 val |= DMA_TLB_WRITE_DRAIN;
425 spin_lock_irqsave(&iommu->register_lock, flag);
426 /* Note: Only uses first TLB reg currently */
427 if ( val_iva )
428 dmar_writeq(iommu->reg, tlb_offset, val_iva);
429 dmar_writeq(iommu->reg, tlb_offset + 8, val);
431 /* Make sure hardware complete it */
432 start_time = NOW();
433 for ( ; ; )
434 {
435 val = dmar_readq(iommu->reg, tlb_offset + 8);
436 if ( !(val & DMA_TLB_IVT) )
437 break;
438 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
439 panic("%s: DMAR hardware is malfunctioning,"
440 " please disable IOMMU\n", __func__);
441 cpu_relax();
442 }
443 spin_unlock_irqrestore(&iommu->register_lock, flag);
445 /* check IOTLB invalidation granularity */
446 if ( DMA_TLB_IAIG(val) == 0 )
447 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
449 if ( DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type) )
450 dprintk(XENLOG_INFO VTDPREFIX,
451 "IOMMU: tlb flush request %x, actual %x\n",
452 (u32)DMA_TLB_IIRG(type), (u32)DMA_TLB_IAIG(val));
453 /* flush iotlb entry will implicitly flush write buffer */
454 return 0;
455 }
457 static int inline iommu_flush_iotlb_global(struct iommu *iommu,
458 int non_present_entry_flush)
459 {
460 struct iommu_flush *flush = iommu_get_flush(iommu);
461 return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
462 non_present_entry_flush);
463 }
465 static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
466 int non_present_entry_flush)
467 {
468 struct iommu_flush *flush = iommu_get_flush(iommu);
469 return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
470 non_present_entry_flush);
471 }
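/*
 * Return the number of low-order bits in which base and base+size-1 differ,
 * i.e. the smallest address-mask order that covers the whole range for a
 * page-selective IOTLB invalidation.
 */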
473 static int inline get_alignment(u64 base, unsigned int size)
474 {
475 int t = 0;
476 u64 end;
478 end = base + size - 1;
479 while ( base != end )
480 {
481 t++;
482 base >>= 1;
483 end >>= 1;
484 }
485 return t;
486 }
488 static int inline iommu_flush_iotlb_psi(
489 struct iommu *iommu, u16 did,
490 u64 addr, unsigned int pages, int non_present_entry_flush)
491 {
492 unsigned int align;
493 struct iommu_flush *flush = iommu_get_flush(iommu);
495 ASSERT(!(addr & (~PAGE_MASK_4K)));
496 ASSERT(pages > 0);
498 /* Fallback to domain selective flush if no PSI support */
499 if ( !cap_pgsel_inv(iommu->cap) )
500 return iommu_flush_iotlb_dsi(iommu, did,
501 non_present_entry_flush);
503 /*
504 * PSI requires page size is 2 ^ x, and the base address is naturally
505 * aligned to the size
506 */
507 align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
508 /* Fallback to domain selective flush if size is too big */
509 if ( align > cap_max_amask_val(iommu->cap) )
510 return iommu_flush_iotlb_dsi(iommu, did,
511 non_present_entry_flush);
513 addr >>= PAGE_SHIFT_4K + align;
514 addr <<= PAGE_SHIFT_4K + align;
516 return flush->iotlb(iommu, did, addr, align,
517 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
518 }
520 void iommu_flush_all(void)
521 {
522 struct acpi_drhd_unit *drhd;
523 struct iommu *iommu;
525 flush_all_cache();
526 for_each_drhd_unit ( drhd )
527 {
528 iommu = drhd->iommu;
529 iommu_flush_context_global(iommu, 0);
530 iommu_flush_iotlb_global(iommu, 0);
531 }
532 }
534 /* clear one page's page table */
535 static void dma_pte_clear_one(struct domain *domain, u64 addr)
536 {
537 struct hvm_iommu *hd = domain_hvm_iommu(domain);
538 struct acpi_drhd_unit *drhd;
539 struct iommu *iommu;
540 struct dma_pte *page = NULL, *pte = NULL;
541 u64 pg_maddr;
543 /* get last level pte */
544 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
545 if ( pg_maddr == 0 )
546 return;
547 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
548 pte = page + address_level_offset(addr, 1);
550 if ( !dma_pte_present(*pte) )
551 {
552 unmap_vtd_domain_page(page);
553 return;
554 }
556 dma_clear_pte(*pte);
557 iommu_flush_cache_entry(pte);
559 for_each_drhd_unit ( drhd )
560 {
561 iommu = drhd->iommu;
562 if ( test_bit(iommu->index, &hd->iommu_bitmap) )
563 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
564 addr, 1, 0))
565 iommu_flush_write_buffer(iommu);
566 }
568 unmap_vtd_domain_page(page);
569 }
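/*
 * Recursively free a page-table subtree: clear and cache-flush every present
 * entry, descend into lower-level tables, then release the page itself.
 */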
571 static void iommu_free_pagetable(u64 pt_maddr, int level)
572 {
573 int i;
574 struct dma_pte *pt_vaddr, *pte;
575 int next_level = level - 1;
577 if ( pt_maddr == 0 )
578 return;
580 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
582 for ( i = 0; i < PTE_NUM; i++ )
583 {
584 pte = &pt_vaddr[i];
585 if ( !dma_pte_present(*pte) )
586 continue;
588 if ( next_level >= 1 )
589 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
591 dma_clear_pte(*pte);
592 iommu_flush_cache_entry(pte);
593 }
595 unmap_vtd_domain_page(pt_vaddr);
596 free_pgtable_maddr(pt_maddr);
597 }
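/*
 * Program the root-entry table: allocate it if necessary, write its machine
 * address to DMAR_RTADDR_REG and issue a Set Root Table Pointer command,
 * polling the status register until RTPS is reported.
 */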
599 static int iommu_set_root_entry(struct iommu *iommu)
600 {
601 u32 cmd, sts;
602 unsigned long flags;
603 s_time_t start_time;
605 spin_lock_irqsave(&iommu->register_lock, flags);
607 if ( iommu->root_maddr == 0 )
608 iommu->root_maddr = alloc_pgtable_maddr(NULL);
609 if ( iommu->root_maddr == 0 )
610 {
611 spin_unlock_irqrestore(&iommu->register_lock, flags);
612 return -ENOMEM;
613 }
615 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
616 cmd = iommu->gcmd | DMA_GCMD_SRTP;
617 dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
619 /* Make sure hardware complete it */
620 start_time = NOW();
621 for ( ; ; )
622 {
623 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
624 if ( sts & DMA_GSTS_RTPS )
625 break;
626 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
627 panic("%s: DMAR hardware is malfunctioning,"
628 " please disable IOMMU\n", __func__);
629 cpu_relax();
630 }
632 spin_unlock_irqrestore(&iommu->register_lock, flags);
634 return 0;
635 }
637 static void iommu_enable_translation(struct iommu *iommu)
638 {
639 u32 sts;
640 unsigned long flags;
641 s_time_t start_time;
643 dprintk(XENLOG_INFO VTDPREFIX,
644 "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
645 spin_lock_irqsave(&iommu->register_lock, flags);
646 iommu->gcmd |= DMA_GCMD_TE;
647 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
648 /* Make sure hardware complete it */
649 start_time = NOW();
650 for ( ; ; )
651 {
652 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
653 if ( sts & DMA_GSTS_TES )
654 break;
655 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
656 panic("%s: DMAR hardware is malfunctioning,"
657 " please disable IOMMU\n", __func__);
658 cpu_relax();
659 }
661 /* Disable PMRs when VT-d engine takes effect per spec definition */
662 disable_pmr(iommu);
663 spin_unlock_irqrestore(&iommu->register_lock, flags);
664 }
666 int iommu_disable_translation(struct iommu *iommu)
667 {
668 u32 sts;
669 unsigned long flags;
670 s_time_t start_time;
672 spin_lock_irqsave(&iommu->register_lock, flags);
673 iommu->gcmd &= ~ DMA_GCMD_TE;
674 dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
676 /* Make sure hardware complete it */
677 start_time = NOW();
678 for ( ; ; )
679 {
680 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
681 if ( !(sts & DMA_GSTS_TES) )
682 break;
683 if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
684 panic("%s: DMAR hardware is malfunctioning,"
685 " please disable IOMMU\n", __func__);
686 cpu_relax();
687 }
688 spin_unlock_irqrestore(&iommu->register_lock, flags);
689 return 0;
690 }
692 static struct iommu *vector_to_iommu[NR_VECTORS];
693 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
694 u8 fault_reason, u16 source_id, u64 addr)
695 {
696 dprintk(XENLOG_WARNING VTDPREFIX,
697 "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
698 "iommu->reg = %p\n",
699 (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
700 PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
701 fault_reason, iommu->reg);
703 #ifndef __i386__ /* map_domain_page() cannot be used in this context */
704 if ( fault_reason < 0x20 )
705 print_vtd_entries(iommu, (source_id >> 8),
706 (source_id & 0xff), (addr >> PAGE_SHIFT));
707 #endif
709 return 0;
710 }
712 static void iommu_fault_status(u32 fault_status)
713 {
714 if ( fault_status & DMA_FSTS_PFO )
715 dprintk(XENLOG_ERR VTDPREFIX,
716 "iommu_fault_status: Fault Overflow\n");
717 else if ( fault_status & DMA_FSTS_PPF )
718 dprintk(XENLOG_ERR VTDPREFIX,
719 "iommu_fault_status: Primary Pending Fault\n");
720 else if ( fault_status & DMA_FSTS_AFO )
721 dprintk(XENLOG_ERR VTDPREFIX,
722 "iommu_fault_status: Advanced Fault Overflow\n");
723 else if ( fault_status & DMA_FSTS_APF )
724 dprintk(XENLOG_ERR VTDPREFIX,
725 "iommu_fault_status: Advanced Pending Fault\n");
726 else if ( fault_status & DMA_FSTS_IQE )
727 dprintk(XENLOG_ERR VTDPREFIX,
728 "iommu_fault_status: Invalidation Queue Error\n");
729 else if ( fault_status & DMA_FSTS_ICE )
730 dprintk(XENLOG_ERR VTDPREFIX,
731 "iommu_fault_status: Invalidation Completion Error\n");
732 else if ( fault_status & DMA_FSTS_ITE )
733 dprintk(XENLOG_ERR VTDPREFIX,
734 "iommu_fault_status: Invalidation Time-out Error\n");
735 }
737 #define PRIMARY_FAULT_REG_LEN (16)
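/*
 * Fault-event interrupt handler: read the fault status, then walk the
 * primary fault recording registers, logging each pending record and
 * clearing its F bit, and finally clear any primary fault overflow.
 */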
738 static void iommu_page_fault(int vector, void *dev_id,
739 struct cpu_user_regs *regs)
740 {
741 struct iommu *iommu = dev_id;
742 int reg, fault_index;
743 u32 fault_status;
744 unsigned long flags;
746 dprintk(XENLOG_WARNING VTDPREFIX,
747 "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
749 spin_lock_irqsave(&iommu->register_lock, flags);
750 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
751 spin_unlock_irqrestore(&iommu->register_lock, flags);
753 iommu_fault_status(fault_status);
755 /* FIXME: ignore advanced fault log */
756 if ( !(fault_status & DMA_FSTS_PPF) )
757 return;
758 fault_index = dma_fsts_fault_record_index(fault_status);
759 reg = cap_fault_reg_offset(iommu->cap);
760 for ( ; ; )
761 {
762 u8 fault_reason;
763 u16 source_id;
764 u32 data;
765 u64 guest_addr;
766 int type;
768 /* highest 32 bits */
769 spin_lock_irqsave(&iommu->register_lock, flags);
770 data = dmar_readl(iommu->reg, reg +
771 fault_index * PRIMARY_FAULT_REG_LEN + 12);
772 if ( !(data & DMA_FRCD_F) )
773 {
774 spin_unlock_irqrestore(&iommu->register_lock, flags);
775 break;
776 }
778 fault_reason = dma_frcd_fault_reason(data);
779 type = dma_frcd_type(data);
781 data = dmar_readl(iommu->reg, reg +
782 fault_index * PRIMARY_FAULT_REG_LEN + 8);
783 source_id = dma_frcd_source_id(data);
785 guest_addr = dmar_readq(iommu->reg, reg +
786 fault_index * PRIMARY_FAULT_REG_LEN);
787 guest_addr = dma_frcd_page_addr(guest_addr);
788 /* clear the fault */
789 dmar_writel(iommu->reg, reg +
790 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
791 spin_unlock_irqrestore(&iommu->register_lock, flags);
793 iommu_page_fault_do_one(iommu, type, fault_reason,
794 source_id, guest_addr);
796 fault_index++;
797 if ( fault_index > cap_num_fault_regs(iommu->cap) )
798 fault_index = 0;
799 }
801 /* clear primary fault overflow */
802 if ( fault_status & DMA_FSTS_PFO )
803 {
804 spin_lock_irqsave(&iommu->register_lock, flags);
805 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
806 spin_unlock_irqrestore(&iommu->register_lock, flags);
807 }
808 }
810 static void dma_msi_unmask(unsigned int vector)
811 {
812 struct iommu *iommu = vector_to_iommu[vector];
813 unsigned long flags;
815 /* unmask it */
816 spin_lock_irqsave(&iommu->register_lock, flags);
817 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
818 spin_unlock_irqrestore(&iommu->register_lock, flags);
819 }
821 static void dma_msi_mask(unsigned int vector)
822 {
823 unsigned long flags;
824 struct iommu *iommu = vector_to_iommu[vector];
826 /* mask it */
827 spin_lock_irqsave(&iommu->register_lock, flags);
828 dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
829 spin_unlock_irqrestore(&iommu->register_lock, flags);
830 }
832 static unsigned int dma_msi_startup(unsigned int vector)
833 {
834 dma_msi_unmask(vector);
835 return 0;
836 }
838 static void dma_msi_end(unsigned int vector)
839 {
840 dma_msi_unmask(vector);
841 ack_APIC_irq();
842 }
844 static void dma_msi_data_init(struct iommu *iommu, int vector)
845 {
846 u32 msi_data = 0;
847 unsigned long flags;
849 /* Fixed, edge, assert mode. Follow MSI setting */
850 msi_data |= vector & 0xff;
851 msi_data |= 1 << 14;
853 spin_lock_irqsave(&iommu->register_lock, flags);
854 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
855 spin_unlock_irqrestore(&iommu->register_lock, flags);
856 }
858 #ifdef SUPPORT_MSI_REMAPPING
859 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
860 {
861 u64 msi_address;
862 unsigned long flags;
864 /* Physical, dedicated cpu. Follow MSI setting */
865 msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
866 msi_address |= MSI_PHYSICAL_MODE << 2;
867 msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
868 msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
870 spin_lock_irqsave(&iommu->register_lock, flags);
871 dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
872 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
873 spin_unlock_irqrestore(&iommu->register_lock, flags);
874 }
875 #else
876 static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
877 {
878 /* ia64: TODO */
879 }
880 #endif
882 static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
883 {
884 struct iommu *iommu = vector_to_iommu[vector];
885 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
886 }
888 static struct hw_interrupt_type dma_msi_type = {
889 .typename = "DMA_MSI",
890 .startup = dma_msi_startup,
891 .shutdown = dma_msi_mask,
892 .enable = dma_msi_unmask,
893 .disable = dma_msi_mask,
894 .ack = dma_msi_mask,
895 .end = dma_msi_end,
896 .set_affinity = dma_msi_set_affinity,
897 };
899 int iommu_set_interrupt(struct iommu *iommu)
900 {
901 int vector, ret;
903 vector = assign_irq_vector(AUTO_ASSIGN);
904 vector_to_iommu[vector] = iommu;
906 /* VT-d fault is a MSI, make irq == vector */
907 irq_vector[vector] = vector;
908 vector_irq[vector] = vector;
910 if ( !vector )
911 {
912 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
913 return -EINVAL;
914 }
916 irq_desc[vector].handler = &dma_msi_type;
917 ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
918 if ( ret )
919 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
920 return vector;
921 }
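/*
 * Per-DRHD initialisation: map the register block, read CAP/ECAP, pick the
 * largest supported AGAW to fix the number of page-table levels, and note
 * whether page walks are cache coherent.
 */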
923 static int iommu_alloc(struct acpi_drhd_unit *drhd)
924 {
925 struct iommu *iommu;
926 unsigned long sagaw;
927 int agaw;
929 if ( nr_iommus > MAX_IOMMUS )
930 {
931 gdprintk(XENLOG_ERR VTDPREFIX,
932 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
933 return -ENOMEM;
934 }
936 iommu = xmalloc(struct iommu);
937 if ( iommu == NULL )
938 return -ENOMEM;
939 memset(iommu, 0, sizeof(struct iommu));
941 iommu->intel = alloc_intel_iommu();
942 if ( iommu->intel == NULL )
943 {
944 xfree(iommu);
945 return -ENOMEM;
946 }
948 iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
949 iommu->index = nr_iommus++;
951 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
952 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
954 /* Calculate number of pagetable levels: between 2 and 4. */
955 sagaw = cap_sagaw(iommu->cap);
956 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
957 if ( test_bit(agaw, &sagaw) )
958 break;
959 if ( agaw < 0 )
960 {
961 gdprintk(XENLOG_ERR VTDPREFIX,
962 "IOMMU: unsupported sagaw %lx\n", sagaw);
963 xfree(iommu);
964 return -ENODEV;
965 }
966 iommu->nr_pt_levels = agaw_to_level(agaw);
968 if ( !ecap_coherent(iommu->ecap) )
969 iommus_incoherent = 1;
971 spin_lock_init(&iommu->lock);
972 spin_lock_init(&iommu->register_lock);
974 drhd->iommu = iommu;
975 return 0;
976 }
978 static void iommu_free(struct acpi_drhd_unit *drhd)
979 {
980 struct iommu *iommu = drhd->iommu;
982 if ( iommu == NULL )
983 return;
985 if ( iommu->root_maddr != 0 )
986 {
987 free_pgtable_maddr(iommu->root_maddr);
988 iommu->root_maddr = 0;
989 }
991 if ( iommu->reg )
992 iounmap(iommu->reg);
994 free_intel_iommu(iommu->intel);
995 free_irq(iommu->vector);
996 xfree(iommu);
998 drhd->iommu = NULL;
999 }
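/*
 * Round a guest address width up to the next value of the form 12 + 9*n
 * (one 9-bit index per page-table level), capped at 64 bits.
 */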
1001 #define guestwidth_to_adjustwidth(gaw) ({ \
1002 int agaw, r = (gaw - 12) % 9; \
1003 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
1004 if ( agaw > 64 ) \
1005 agaw = 64; \
1006 agaw; })
1008 static int intel_iommu_domain_init(struct domain *d)
1009 {
1010 struct hvm_iommu *hd = domain_hvm_iommu(d);
1011 struct iommu *iommu = NULL;
1012 u64 i, j, tmp;
1013 struct acpi_drhd_unit *drhd;
1015 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1016 iommu = drhd->iommu;
1018 hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1020 if ( d->domain_id == 0 )
1021 {
1022 extern int xen_in_range(paddr_t start, paddr_t end);
1023 extern int tboot_in_range(paddr_t start, paddr_t end);
1025 /*
1026 * Set up 1:1 page table for dom0 except the critical segments
1027 * like Xen and tboot.
1028 */
1029 for ( i = 0; i < max_page; i++ )
1030 {
1031 if ( xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) ||
1032 tboot_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) )
1033 continue;
1035 tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K);
1036 for ( j = 0; j < tmp; j++ )
1037 iommu_map_page(d, (i*tmp+j), (i*tmp+j));
1038 }
1040 setup_dom0_devices(d);
1041 setup_dom0_rmrr(d);
1043 iommu_flush_all();
1045 for_each_drhd_unit ( drhd )
1046 {
1047 iommu = drhd->iommu;
1048 iommu_enable_translation(iommu);
1049 }
1050 }
1052 return 0;
1053 }
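/*
 * Install the context entry for (bus, devfn) on this IOMMU: dom0 may be put
 * in pass-through mode if supported; otherwise the entry is pointed at the
 * domain's page tables, skipping top levels when the IOMMU implements fewer
 * than four. The context cache and IOTLB are then flushed for the
 * previously non-present entry.
 */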
1055 static int domain_context_mapping_one(
1056 struct domain *domain,
1057 struct iommu *iommu,
1058 u8 bus, u8 devfn)
1060 struct hvm_iommu *hd = domain_hvm_iommu(domain);
1061 struct context_entry *context, *context_entries;
1062 unsigned long flags;
1063 u64 maddr, pgd_maddr;
1064 int agaw;
1066 maddr = bus_to_context_maddr(iommu, bus);
1067 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1068 context = &context_entries[devfn];
1070 if ( context_present(*context) )
1072 unmap_vtd_domain_page(context_entries);
1073 return 0;
1076 spin_lock_irqsave(&iommu->lock, flags);
1077 if ( iommu_passthrough &&
1078 ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
1080 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1081 agaw = level_to_agaw(iommu->nr_pt_levels);
1083 else
1085 /* Ensure we have pagetables allocated down to leaf PTE. */
1086 if ( hd->pgd_maddr == 0 )
1088 addr_to_dma_page_maddr(domain, 0, 1);
1089 if ( hd->pgd_maddr == 0 )
1091 nomem:
1092 unmap_vtd_domain_page(context_entries);
1093 spin_unlock_irqrestore(&iommu->lock, flags);
1094 return -ENOMEM;
1098 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1099 pgd_maddr = hd->pgd_maddr;
1100 for ( agaw = level_to_agaw(4);
1101 agaw != level_to_agaw(iommu->nr_pt_levels);
1102 agaw-- )
1104 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1105 pgd_maddr = dma_pte_addr(*p);
1106 unmap_vtd_domain_page(p);
1107 if ( pgd_maddr == 0 )
1108 goto nomem;
1111 context_set_address_root(*context, pgd_maddr);
1112 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1115 /*
1116 * domain_id 0 is not valid on Intel's IOMMU, force domain_id to
1117 * be 1 based as required by intel's iommu hw.
1118 */
1119 context_set_domain_id(context, domain);
1120 context_set_address_width(*context, agaw);
1121 context_set_fault_enable(*context);
1122 context_set_present(*context);
1123 iommu_flush_cache_entry(context);
1125 unmap_vtd_domain_page(context_entries);
1127 /* Context entry was previously non-present (with domid 0). */
1128 iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
1129 DMA_CCMD_MASK_NOBIT, 1);
1130 if ( iommu_flush_iotlb_dsi(iommu, 0, 1) )
1131 iommu_flush_write_buffer(iommu);
1133 set_bit(iommu->index, &hd->iommu_bitmap);
1134 spin_unlock_irqrestore(&iommu->lock, flags);
1136 return 0;
1139 #define PCI_BASE_CLASS_BRIDGE 0x06
1140 #define PCI_CLASS_BRIDGE_PCI 0x0604
1142 enum {
1143 DEV_TYPE_PCIe_ENDPOINT,
1144 DEV_TYPE_PCIe_BRIDGE,
1145 DEV_TYPE_PCI_BRIDGE,
1146 DEV_TYPE_PCI,
1147 };
1149 int pdev_type(u8 bus, u8 devfn)
1151 u16 class_device;
1152 u16 status, creg;
1153 int pos;
1154 u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
1156 class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
1157 if ( class_device == PCI_CLASS_BRIDGE_PCI )
1159 pos = pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
1160 if ( !pos )
1161 return DEV_TYPE_PCI_BRIDGE;
1162 creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
1163 return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
1164 DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
1167 status = pci_conf_read16(bus, d, f, PCI_STATUS);
1168 if ( !(status & PCI_STATUS_CAP_LIST) )
1169 return DEV_TYPE_PCI;
1171 if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
1172 return DEV_TYPE_PCIe_ENDPOINT;
1174 return DEV_TYPE_PCI;
1177 #define MAX_BUSES 256
1178 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
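/*
 * For a device behind one or more legacy PCI bridges, follow bus2bridge
 * upwards to the bridge whose (bus, devfn) the IOMMU sees as the requester
 * id; *secbus returns the secondary bus immediately behind that bridge.
 */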
1180 static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
1182 int cnt = 0;
1183 *secbus = *bus;
1185 if ( *bus == 0 )
1186 /* assume integrated PCI devices in RC have valid requester-id */
1187 return 1;
1189 if ( !bus2bridge[*bus].map )
1190 return 0;
1192 while ( bus2bridge[*bus].map )
1194 *secbus = *bus;
1195 *devfn = bus2bridge[*bus].devfn;
1196 *bus = bus2bridge[*bus].bus;
1197 if ( cnt++ >= MAX_BUSES )
1198 return 0;
1201 return 1;
1204 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
1206 struct acpi_drhd_unit *drhd;
1207 int ret = 0;
1208 u16 sec_bus, sub_bus, ob, odf;
1209 u32 type;
1210 u8 secbus;
1212 drhd = acpi_find_matched_drhd_unit(bus, devfn);
1213 if ( !drhd )
1214 return -ENODEV;
1216 type = pdev_type(bus, devfn);
1217 switch ( type )
1219 case DEV_TYPE_PCIe_BRIDGE:
1220 case DEV_TYPE_PCI_BRIDGE:
1221 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1222 PCI_SECONDARY_BUS);
1223 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1224 PCI_SUBORDINATE_BUS);
1225 /*dmar_scope_add_buses(&drhd->scope, sec_bus, sub_bus);*/
1227 if ( type == DEV_TYPE_PCIe_BRIDGE )
1228 break;
1230 for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
1232 bus2bridge[sec_bus].map = 1;
1233 bus2bridge[sec_bus].bus = bus;
1234 bus2bridge[sec_bus].devfn = devfn;
1236 break;
1238 case DEV_TYPE_PCIe_ENDPOINT:
1239 gdprintk(XENLOG_INFO VTDPREFIX,
1240 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
1241 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1242 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1243 break;
1245 case DEV_TYPE_PCI:
1246 gdprintk(XENLOG_INFO VTDPREFIX,
1247 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
1248 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1250 ob = bus; odf = devfn;
1251 if ( !find_pcie_endpoint(&bus, &devfn, &secbus) )
1253 gdprintk(XENLOG_WARNING VTDPREFIX,
1254 "domain_context_mapping:invalid\n");
1255 break;
1258 if ( ob != bus || odf != devfn )
1259 gdprintk(XENLOG_INFO VTDPREFIX,
1260 "domain_context_mapping:map: "
1261 "bdf = %x:%x.%x -> %x:%x.%x\n",
1262 ob, PCI_SLOT(odf), PCI_FUNC(odf),
1263 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1265 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
1266 if ( secbus != bus )
1267 /*
1268 * The source-id for transactions on non-PCIe buses seems
1269 * to originate from devfn=0 on the secondary bus behind
1270 * the bridge. Map that id as well. The id to use in
1271 * these scenarios is not particularly well documented
1272 * anywhere.
1273 */
1274 domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
1275 break;
1277 default:
1278 gdprintk(XENLOG_ERR VTDPREFIX,
1279 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
1280 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1281 ret = -EINVAL;
1282 break;
1285 return ret;
1288 static int domain_context_unmap_one(
1289 struct domain *domain,
1290 struct iommu *iommu,
1291 u8 bus, u8 devfn)
1293 struct context_entry *context, *context_entries;
1294 unsigned long flags;
1295 u64 maddr;
1297 maddr = bus_to_context_maddr(iommu, bus);
1298 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1299 context = &context_entries[devfn];
1301 if ( !context_present(*context) )
1303 unmap_vtd_domain_page(context_entries);
1304 return 0;
1307 spin_lock_irqsave(&iommu->lock, flags);
1308 context_clear_present(*context);
1309 context_clear_entry(*context);
1310 iommu_flush_cache_entry(context);
1311 iommu_flush_context_domain(iommu, domain_iommu_domid(domain), 0);
1312 iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
1313 unmap_vtd_domain_page(context_entries);
1314 spin_unlock_irqrestore(&iommu->lock, flags);
1316 return 0;
1319 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
1321 struct acpi_drhd_unit *drhd;
1322 u16 sec_bus, sub_bus;
1323 int ret = 0;
1324 u32 type;
1325 u8 secbus;
1327 drhd = acpi_find_matched_drhd_unit(bus, devfn);
1328 if ( !drhd )
1329 return -ENODEV;
1331 type = pdev_type(bus, devfn);
1332 switch ( type )
1334 case DEV_TYPE_PCIe_BRIDGE:
1335 case DEV_TYPE_PCI_BRIDGE:
1336 sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1337 PCI_SECONDARY_BUS);
1338 sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1339 PCI_SUBORDINATE_BUS);
1340 /*dmar_scope_remove_buses(&drhd->scope, sec_bus, sub_bus);*/
1341 if ( type == DEV_TYPE_PCI_BRIDGE )
1342 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1343 break;
1345 case DEV_TYPE_PCIe_ENDPOINT:
1346 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1347 break;
1349 case DEV_TYPE_PCI:
1350 if ( find_pcie_endpoint(&bus, &devfn, &secbus) )
1351 ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
1352 if ( bus != secbus )
1353 domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
1354 break;
1356 default:
1357 gdprintk(XENLOG_ERR VTDPREFIX,
1358 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
1359 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1360 ret = -EINVAL;
1361 break;
1364 return ret;
1367 static int reassign_device_ownership(
1368 struct domain *source,
1369 struct domain *target,
1370 u8 bus, u8 devfn)
1372 struct hvm_iommu *source_hd = domain_hvm_iommu(source);
1373 struct pci_dev *pdev;
1374 struct acpi_drhd_unit *drhd;
1375 struct iommu *pdev_iommu;
1376 int ret, found = 0;
1378 if ( !(pdev = pci_lock_domain_pdev(source, bus, devfn)) )
1379 return -ENODEV;
1381 drhd = acpi_find_matched_drhd_unit(bus, devfn);
1382 pdev_iommu = drhd->iommu;
1383 domain_context_unmap(source, bus, devfn);
1385 ret = domain_context_mapping(target, bus, devfn);
1386 if ( ret )
1387 return ret;
1389 write_lock(&pcidevs_lock);
1390 list_move(&pdev->domain_list, &target->arch.pdev_list);
1391 write_unlock(&pcidevs_lock);
1392 pdev->domain = target;
1394 spin_unlock(&pdev->lock);
1396 read_lock(&pcidevs_lock);
1397 for_each_pdev ( source, pdev )
1399 drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
1400 if ( drhd->iommu == pdev_iommu )
1402 found = 1;
1403 break;
1406 read_unlock(&pcidevs_lock);
1408 if ( !found )
1409 clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
1411 return ret;
1414 void iommu_domain_teardown(struct domain *d)
1416 struct hvm_iommu *hd = domain_hvm_iommu(d);
1418 if ( list_empty(&acpi_drhd_units) )
1419 return;
1421 iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
1422 hd->pgd_maddr = 0;
1423 iommu_domid_release(d);
1426 static int domain_context_mapped(u8 bus, u8 devfn)
1428 struct acpi_drhd_unit *drhd;
1430 for_each_drhd_unit ( drhd )
1431 if ( device_context_mapped(drhd->iommu, bus, devfn) )
1432 return 1;
1434 return 0;
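/*
 * Map one 4KB page gfn -> mfn in the domain's VT-d page tables and issue a
 * page-selective IOTLB flush on every IOMMU that serves this domain;
 * nothing is done for dom0 when pass-through is in effect.
 */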
1437 int intel_iommu_map_page(
1438 struct domain *d, unsigned long gfn, unsigned long mfn)
1440 struct hvm_iommu *hd = domain_hvm_iommu(d);
1441 struct acpi_drhd_unit *drhd;
1442 struct iommu *iommu;
1443 struct dma_pte *page = NULL, *pte = NULL;
1444 u64 pg_maddr;
1445 int pte_present;
1447 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1448 iommu = drhd->iommu;
1450 /* do nothing if dom0 and iommu supports pass thru */
1451 if ( iommu_passthrough &&
1452 ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
1453 return 0;
1455 pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1456 if ( pg_maddr == 0 )
1457 return -ENOMEM;
1458 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1459 pte = page + (gfn & LEVEL_MASK);
1460 pte_present = dma_pte_present(*pte);
1461 dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
1462 dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
1463 iommu_flush_cache_entry(pte);
1464 unmap_vtd_domain_page(page);
1466 for_each_drhd_unit ( drhd )
1468 iommu = drhd->iommu;
1470 if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
1471 continue;
1473 if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
1474 (paddr_t)gfn << PAGE_SHIFT_4K, 1,
1475 !pte_present) )
1476 iommu_flush_write_buffer(iommu);
1479 return 0;
1482 int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
1484 struct acpi_drhd_unit *drhd;
1485 struct iommu *iommu;
1487 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1488 iommu = drhd->iommu;
1490 /* do nothing if dom0 and iommu supports pass thru */
1491 if ( iommu_passthrough &&
1492 ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
1493 return 0;
1495 dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1497 return 0;
1500 static int iommu_prepare_rmrr_dev(struct domain *d,
1501 struct acpi_rmrr_unit *rmrr,
1502 u8 bus, u8 devfn)
1504 int ret = 0;
1505 u64 base, end;
1506 unsigned long base_pfn, end_pfn;
1508 ASSERT(rmrr->base_address < rmrr->end_address);
1510 base = rmrr->base_address & PAGE_MASK_4K;
1511 base_pfn = base >> PAGE_SHIFT_4K;
1512 end = PAGE_ALIGN_4K(rmrr->end_address);
1513 end_pfn = end >> PAGE_SHIFT_4K;
1515 while ( base_pfn < end_pfn )
1517 intel_iommu_map_page(d, base_pfn, base_pfn);
1518 base_pfn++;
1521 if ( domain_context_mapped(bus, devfn) == 0 )
1522 ret = domain_context_mapping(d, bus, devfn);
1524 return ret;
1527 static int intel_iommu_add_device(struct pci_dev *pdev)
1529 struct acpi_rmrr_unit *rmrr;
1530 u16 bdf;
1531 int ret, i;
1533 if ( !pdev->domain )
1534 return -EINVAL;
1536 ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
1537 if ( ret )
1539 gdprintk(XENLOG_ERR VTDPREFIX,
1540 "intel_iommu_add_device: context mapping failed\n");
1541 return ret;
1544 for_each_rmrr_device ( rmrr, bdf, i )
1546 if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
1548 ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
1549 pdev->bus, pdev->devfn);
1550 if ( ret )
1551 gdprintk(XENLOG_ERR VTDPREFIX,
1552 "intel_iommu_add_device: RMRR mapping failed\n");
1553 break;
1557 return ret;
1560 static int intel_iommu_remove_device(struct pci_dev *pdev)
1562 struct acpi_rmrr_unit *rmrr;
1563 u16 bdf;
1564 int i;
1566 if ( !pdev->domain )
1567 return -EINVAL;
1569 /* If the device belongs to dom0, and it has RMRR, don't remove it
1570 * from dom0, because BIOS may use RMRR at booting time.
1571 */
1572 if ( pdev->domain->domain_id == 0 )
1574 for_each_rmrr_device ( rmrr, bdf, i )
1576 if ( PCI_BUS(bdf) == pdev->bus &&
1577 PCI_DEVFN2(bdf) == pdev->devfn )
1578 return 0;
1582 return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
1585 static void setup_dom0_devices(struct domain *d)
1587 struct hvm_iommu *hd;
1588 struct pci_dev *pdev;
1589 int bus, dev, func;
1590 u32 l;
1592 hd = domain_hvm_iommu(d);
1594 write_lock(&pcidevs_lock);
1595 for ( bus = 0; bus < 256; bus++ )
1597 for ( dev = 0; dev < 32; dev++ )
1599 for ( func = 0; func < 8; func++ )
1601 l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
1602 /* some broken boards return 0 or ~0 if a slot is empty: */
1603 if ( (l == 0xffffffff) || (l == 0x00000000) ||
1604 (l == 0x0000ffff) || (l == 0xffff0000) )
1605 continue;
1607 pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
1608 pdev->domain = d;
1609 list_add(&pdev->domain_list, &d->arch.pdev_list);
1610 domain_context_mapping(d, pdev->bus, pdev->devfn);
1614 write_unlock(&pcidevs_lock);
1617 void clear_fault_bits(struct iommu *iommu)
1619 u64 val;
1621 val = dmar_readq(
1622 iommu->reg,
1623 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
1624 dmar_writeq(
1625 iommu->reg,
1626 cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
1627 val);
1628 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
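/*
 * Hardware bring-up for every DRHD: set the root entry, wire up the
 * fault-event MSI, clear stale fault bits, install the register-based flush
 * callbacks, then probe for queued invalidation and interrupt remapping.
 */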
1631 static int init_vtd_hw(void)
1633 struct acpi_drhd_unit *drhd;
1634 struct iommu *iommu;
1635 struct iommu_flush *flush = NULL;
1636 int vector;
1637 int ret;
1639 for_each_drhd_unit ( drhd )
1641 iommu = drhd->iommu;
1642 ret = iommu_set_root_entry(iommu);
1643 if ( ret )
1645 gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
1646 return -EIO;
1649 vector = iommu_set_interrupt(iommu);
1650 dma_msi_data_init(iommu, vector);
1651 dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
1652 iommu->vector = vector;
1653 clear_fault_bits(iommu);
1654 dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
1656 /* initialize flush functions */
1657 flush = iommu_get_flush(iommu);
1658 flush->context = flush_context_reg;
1659 flush->iotlb = flush_iotlb_reg;
1662 for_each_drhd_unit ( drhd )
1664 iommu = drhd->iommu;
1665 if ( qinval_setup(iommu) != 0 )
1666 dprintk(XENLOG_INFO VTDPREFIX,
1667 "Queued Invalidation hardware not found\n");
1670 for_each_drhd_unit ( drhd )
1672 iommu = drhd->iommu;
1673 if ( intremap_setup(iommu) != 0 )
1674 dprintk(XENLOG_INFO VTDPREFIX,
1675 "Interrupt Remapping hardware not found\n");
1678 return 0;
1681 static void setup_dom0_rmrr(struct domain *d)
1683 struct acpi_rmrr_unit *rmrr;
1684 u16 bdf;
1685 int ret, i;
1687 for_each_rmrr_device ( rmrr, bdf, i )
1689 ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
1690 if ( ret )
1691 gdprintk(XENLOG_ERR VTDPREFIX,
1692 "IOMMU: mapping reserved region failed\n");
1696 int intel_vtd_setup(void)
1698 struct acpi_drhd_unit *drhd;
1699 struct iommu *iommu;
1701 if ( !vtd_enabled )
1702 return -ENODEV;
1704 spin_lock_init(&domid_bitmap_lock);
1705 clflush_size = get_cache_line_size();
1707 for_each_drhd_unit ( drhd )
1708 if ( iommu_alloc(drhd) != 0 )
1709 goto error;
1711 /* Allocate IO page directory page for the domain. */
1712 drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
1713 iommu = drhd->iommu;
1715 /* Allocate domain id bitmap, and set bit 0 as reserved */
1716 domid_bitmap_size = cap_ndoms(iommu->cap);
1717 domid_bitmap = xmalloc_array(unsigned long,
1718 BITS_TO_LONGS(domid_bitmap_size));
1719 if ( domid_bitmap == NULL )
1720 goto error;
1721 memset(domid_bitmap, 0, domid_bitmap_size / 8);
1722 set_bit(0, domid_bitmap);
1724 if ( init_vtd_hw() )
1725 goto error;
1727 register_keyhandler('V', dump_iommu_info, "dump iommu info");
1729 return 0;
1731 error:
1732 for_each_drhd_unit ( drhd )
1733 iommu_free(drhd);
1734 vtd_enabled = 0;
1735 return -ENOMEM;
1738 /*
1739 * If the device isn't owned by dom0, it has already been
1740 * assigned to another domain, or it does not exist.
1741 */
1742 int device_assigned(u8 bus, u8 devfn)
1744 struct pci_dev *pdev;
1746 if ( (pdev = pci_lock_domain_pdev(dom0, bus, devfn)) )
1748 spin_unlock(&pdev->lock);
1749 return 0;
1752 return 1;
1755 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
1757 struct acpi_rmrr_unit *rmrr;
1758 int ret = 0, i;
1759 u16 bdf;
1761 if ( list_empty(&acpi_drhd_units) )
1762 return -ENODEV;
1764 ret = reassign_device_ownership(dom0, d, bus, devfn);
1765 if ( ret )
1766 return ret;
1768 /* Setup rmrr identity mapping */
1769 for_each_rmrr_device( rmrr, bdf, i )
1771 if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
1773 /* FIXME: Because USB RMRR conflicts with guest bios region,
1774 * ignore USB RMRR temporarily.
1775 */
1776 if ( is_usb_device(bus, devfn) )
1777 return 0;
1779 ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
1780 if ( ret )
1781 gdprintk(XENLOG_ERR VTDPREFIX,
1782 "IOMMU: mapping reserved region failed\n");
1783 return ret;
1787 return ret;
1790 static int intel_iommu_group_id(u8 bus, u8 devfn)
1792 u8 secbus;
1793 if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
1794 return PCI_BDF2(bus, devfn);
1795 else
1796 return -1;
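/*
 * Suspend/resume: only the fault-event registers (FECTL/FEDATA/FEADDR/
 * FEUADDR) are saved per IOMMU; on resume init_vtd_hw() is re-run, the
 * registers are restored and DMA translation is re-enabled.
 */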
1799 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
1800 void iommu_suspend(void)
1802 struct acpi_drhd_unit *drhd;
1803 struct iommu *iommu;
1804 u32 i;
1806 if ( !vtd_enabled )
1807 return;
1809 iommu_flush_all();
1811 for_each_drhd_unit ( drhd )
1813 iommu = drhd->iommu;
1814 i = iommu->index;
1816 iommu_state[i][DMAR_FECTL_REG] =
1817 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
1818 iommu_state[i][DMAR_FEDATA_REG] =
1819 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
1820 iommu_state[i][DMAR_FEADDR_REG] =
1821 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
1822 iommu_state[i][DMAR_FEUADDR_REG] =
1823 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
1827 void iommu_resume(void)
1829 struct acpi_drhd_unit *drhd;
1830 struct iommu *iommu;
1831 u32 i;
1833 if ( !vtd_enabled )
1834 return;
1836 iommu_flush_all();
1838 if ( init_vtd_hw() != 0 && force_iommu )
1839 panic("IOMMU setup failed, crash Xen for security purpose!\n");
1841 for_each_drhd_unit ( drhd )
1843 iommu = drhd->iommu;
1844 i = iommu->index;
1846 dmar_writel(iommu->reg, DMAR_FECTL_REG,
1847 (u32) iommu_state[i][DMAR_FECTL_REG]);
1848 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
1849 (u32) iommu_state[i][DMAR_FEDATA_REG]);
1850 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
1851 (u32) iommu_state[i][DMAR_FEADDR_REG]);
1852 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
1853 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
1854 iommu_enable_translation(iommu);
1858 struct iommu_ops intel_iommu_ops = {
1859 .init = intel_iommu_domain_init,
1860 .add_device = intel_iommu_add_device,
1861 .remove_device = intel_iommu_remove_device,
1862 .assign_device = intel_iommu_assign_device,
1863 .teardown = iommu_domain_teardown,
1864 .map_page = intel_iommu_map_page,
1865 .unmap_page = intel_iommu_unmap_page,
1866 .reassign_device = reassign_device_ownership,
1867 .get_device_group_id = intel_iommu_group_id,
1868 .update_ire_from_apic = io_apic_write_remap_rte,
1869 .update_ire_from_msi = msi_msg_write_remap_rte,
1870 };
1872 /*
1873 * Local variables:
1874 * mode: C
1875 * c-set-style: "BSD"
1876 * c-basic-offset: 4
1877 * tab-width: 4
1878 * indent-tabs-mode: nil
1879 * End:
1880 */