2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
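/*
 * Illustrative aside (editorial, not in the original source): with
 * VTD_PAGE_SHIFT == 12 and gaw == 48,
 *
 *   __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFFULL
 *
 * On a 64-bit kernel DOMAIN_MAX_PFN(48) keeps that value; on a 32-bit
 * kernel it is clamped to (unsigned long)-1 == 0xFFFFFFFF, so PFNs
 * always fit in an unsigned long, as the comment above explains.
 */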
74 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
78 /* page table handling */
79 #define LEVEL_STRIDE (9)
80 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
83 * This bitmap is used to advertise the page sizes our hardware supports
84 * to the IOMMU core, which will then use this information to split
85 * physically contiguous memory regions it is mapping into page sizes
88 * Traditionally the IOMMU core just handed us the mappings directly,
89 * after making sure the size is an order of a 4KiB page and that the
90 * mapping has natural alignment.
92 * To retain this behavior, we currently advertise that we support
93 * all page sizes that are an order of 4KiB.
95 * If at some point we'd like to utilize the IOMMU core's new behavior,
96 * we could change this to advertise the real page sizes we support.
98 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
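/*
 * Illustrative aside (editorial, not in the original source): ~0xFFFUL
 * has every bit from 12 upwards set, so this advertises 4KiB (bit 12),
 * 8KiB (bit 13), ... 2MiB (bit 21), 1GiB (bit 30) and so on -- every
 * power-of-two size that is a multiple of 4KiB, matching the "order of
 * 4KiB" behaviour described above.
 */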
100 static inline int agaw_to_level(int agaw)
101 {
102 return agaw + 2;
103 }
105 static inline int agaw_to_width(int agaw)
107 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
110 static inline int width_to_agaw(int width)
112 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
115 static inline unsigned int level_to_offset_bits(int level)
117 return (level - 1) * LEVEL_STRIDE;
120 static inline int pfn_level_offset(unsigned long pfn, int level)
122 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
125 static inline unsigned long level_mask(int level)
127 return -1UL << level_to_offset_bits(level);
130 static inline unsigned long level_size(int level)
132 return 1UL << level_to_offset_bits(level);
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
137 return (pfn + level_size(level) - 1) & level_mask(level);
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
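/*
 * Illustrative aside (editorial, not in the original source): with
 * LEVEL_STRIDE == 9, each page-table level decodes 9 bits of the PFN.
 * For example, for pfn 0x12345:
 *
 *   pfn_level_offset(0x12345, 1) == 0x12345 & 0x1FF        == 0x145
 *   pfn_level_offset(0x12345, 2) == (0x12345 >> 9) & 0x1FF == 0x091
 *
 * and lvl_to_nr_pages() gives the coverage of one PTE at each level:
 * 1 page (4KiB) at level 1, 512 pages (2MiB) at level 2, 512^2 pages
 * (1GiB) at level 3.
 */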
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146 are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
158 return mm_to_dma_pfn(page_to_pfn(pg));
160 static inline unsigned long virt_to_dma_pfn(void *p)
162 return page_to_dma_pfn(virt_to_page(p));
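/*
 * Illustrative aside (editorial, not in the original source): on a
 * kernel with 4KiB pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12 and the
 * mm/dma PFN conversions above are identity operations. They only
 * become real shifts when the MM page is larger than the 4KiB VT-d
 * page; e.g. a 64KiB-page kernel would have 16 VT-d PFNs per MM PFN.
 */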
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
171 /*
172 * set to 1 to panic the kernel if VT-d can't be enabled successfully
173 * (used when the kernel is launched w/ TXT)
174 */
175 static int force_on = 0;
177 /*
178 * 0: Present
179 * 1-11: Reserved
180 * 12-63: Context Ptr (12 - (haw-1))
181 * 64-127: Reserved
182 */
183 struct root_entry {
184 u64 val;
185 u64 rsvd1;
186 };
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
190 return (root->val & 1);
192 static inline void set_root_present(struct root_entry *root)
193 {
194 root->val |= 1;
195 }
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 root->val |= value & VTD_PAGE_MASK;
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
204 return (struct context_entry *)
205 (root_present(root)?phys_to_virt(
206 root->val & VTD_PAGE_MASK) :
210 /*
211 * low 64 bits:
212 * 0: present
213 * 1: fault processing disable
214 * 2-3: translation type
215 * 12-63: address space root
216 * high 64 bits:
217 * 0-2: address width
218 * 3-6: aval
219 * 8-23: domain id
220 */
221 struct context_entry {
222 u64 lo;
223 u64 hi;
224 };
226 static inline bool context_present(struct context_entry *context)
228 return (context->lo & 1);
230 static inline void context_set_present(struct context_entry *context)
231 {
232 context->lo |= 1;
233 }
235 static inline void context_set_fault_enable(struct context_entry *context)
237 context->lo &= (((u64)-1) << 2) | 1;
240 static inline void context_set_translation_type(struct context_entry *context,
243 context->lo &= (((u64)-1) << 4) | 3;
244 context->lo |= (value & 3) << 2;
247 static inline void context_set_address_root(struct context_entry *context,
250 context->lo |= value & VTD_PAGE_MASK;
253 static inline void context_set_address_width(struct context_entry *context,
256 context->hi |= value & 7;
259 static inline void context_set_domain_id(struct context_entry *context,
262 context->hi |= (value & ((1 << 16) - 1)) << 8;
265 static inline void context_clear_entry(struct context_entry *context)
266 {
267 context->lo = 0;
268 context->hi = 0;
269 }
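/*
 * Illustrative aside (editorial, not in the original source): a context
 * entry for an ordinary device is composed with the helpers above
 * roughly as domain_context_mapping_one() does further down:
 *
 *   context_set_domain_id(context, id);
 *   context_set_address_root(context, virt_to_phys(pgd));
 *   context_set_address_width(context, iommu->agaw);
 *   context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *   context_set_fault_enable(context);
 *   context_set_present(context);
 */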
271 /*
272 * 0: readable
273 * 1: writable
274 * 2-6: reserved
275 * 7: super page
276 * 8-10: available
277 * 11: snoop behavior
278 * 12-63: Host physical address
279 */
280 struct dma_pte {
281 u64 val;
282 };
284 static inline void dma_clear_pte(struct dma_pte *pte)
285 {
286 pte->val = 0;
287 }
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
290 {
291 #ifdef CONFIG_64BIT
292 return pte->val & VTD_PAGE_MASK;
293 #else
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
297 }
299 static inline bool dma_pte_present(struct dma_pte *pte)
301 return (pte->val & 3) != 0;
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
306 return (pte->val & DMA_PTE_LARGE_PAGE);
309 static inline int first_pte_in_page(struct dma_pte *pte)
311 return !((unsigned long)pte & ~VTD_PAGE_MASK);
314 /*
315 * This domain is a static identity mapping domain.
316 * 1. This domain creates a static 1:1 mapping to all usable memory.
317 * 2. It maps to each iommu if successful.
318 * 3. Each iommu maps to this domain if successful.
319 */
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
323 /* domain represents a virtual machine; more than one device
324 * across iommus may be owned by one domain, e.g. a kvm guest.
325 */
326 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
328 /* si_domain contains multiple devices */
329 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
331 /* define the limit of IOMMUs supported in each domain */
333 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
335 # define IOMMU_UNITS_SUPPORTED 64
339 int id; /* domain id */
340 int nid; /* node id */
341 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
342 /* bitmap of iommus this domain uses */
344 struct list_head devices; /* all devices' list */
345 struct iova_domain iovad; /* iova's that belong to this domain */
347 struct dma_pte *pgd; /* virtual address */
348 int gaw; /* max guest address width */
350 /* adjusted guest address width, 0 is level 2 30-bit */
353 int flags; /* flags to find out type of domain */
355 int iommu_coherency;/* indicate coherency of iommu access */
356 int iommu_snooping; /* indicate snooping control feature*/
357 int iommu_count; /* reference count of iommu */
358 int iommu_superpage;/* Level of superpages supported:
359 0 == 4KiB (no superpages), 1 == 2MiB,
360 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
361 spinlock_t iommu_lock; /* protect iommu set in domain */
362 u64 max_addr; /* maximum mapped address */
365 /* PCI domain-device relationship */
366 struct device_domain_info {
367 struct list_head link; /* link to domain siblings */
368 struct list_head global; /* link to global list */
369 u8 bus; /* PCI bus number */
370 u8 devfn; /* PCI devfn number */
371 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
372 struct intel_iommu *iommu; /* IOMMU used by this device */
373 struct dmar_domain *domain; /* pointer to domain */
376 struct dmar_rmrr_unit {
377 struct list_head list; /* list of rmrr units */
378 struct acpi_dmar_header *hdr; /* ACPI header */
379 u64 base_address; /* reserved base address*/
380 u64 end_address; /* reserved end address */
381 struct dmar_dev_scope *devices; /* target devices */
382 int devices_cnt; /* target device count */
385 struct dmar_atsr_unit {
386 struct list_head list; /* list of ATSR units */
387 struct acpi_dmar_header *hdr; /* ACPI header */
388 struct dmar_dev_scope *devices; /* target devices */
389 int devices_cnt; /* target device count */
390 u8 include_all:1; /* include all ports */
393 static LIST_HEAD(dmar_atsr_units);
394 static LIST_HEAD(dmar_rmrr_units);
396 #define for_each_rmrr_units(rmrr) \
397 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
399 static void flush_unmaps_timeout(unsigned long data);
401 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
403 #define HIGH_WATER_MARK 250
404 struct deferred_flush_tables {
406 struct iova *iova[HIGH_WATER_MARK];
407 struct dmar_domain *domain[HIGH_WATER_MARK];
408 struct page *freelist[HIGH_WATER_MARK];
411 static struct deferred_flush_tables *deferred_flush;
413 /* bitmap for indexing intel_iommus */
414 static int g_num_of_iommus;
416 static DEFINE_SPINLOCK(async_umap_flush_lock);
417 static LIST_HEAD(unmaps_to_do);
420 static long list_size;
422 static void domain_exit(struct dmar_domain *domain);
423 static void domain_remove_dev_info(struct dmar_domain *domain);
424 static void domain_remove_one_dev_info(struct dmar_domain *domain,
426 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
428 static int domain_detach_iommu(struct dmar_domain *domain,
429 struct intel_iommu *iommu);
431 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
432 int dmar_disabled = 0;
434 int dmar_disabled = 1;
435 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
437 int intel_iommu_enabled = 0;
438 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
440 static int dmar_map_gfx = 1;
441 static int dmar_forcedac;
442 static int intel_iommu_strict;
443 static int intel_iommu_superpage = 1;
445 int intel_iommu_gfx_mapped;
446 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
448 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
449 static DEFINE_SPINLOCK(device_domain_lock);
450 static LIST_HEAD(device_domain_list);
452 static const struct iommu_ops intel_iommu_ops;
454 static int __init intel_iommu_setup(char *str)
459 if (!strncmp(str, "on", 2)) {
461 printk(KERN_INFO "Intel-IOMMU: enabled\n");
462 } else if (!strncmp(str, "off", 3)) {
464 printk(KERN_INFO "Intel-IOMMU: disabled\n");
465 } else if (!strncmp(str, "igfx_off", 8)) {
468 "Intel-IOMMU: disable GFX device mapping\n");
469 } else if (!strncmp(str, "forcedac", 8)) {
471 "Intel-IOMMU: Forcing DAC for PCI devices\n");
473 } else if (!strncmp(str, "strict", 6)) {
475 "Intel-IOMMU: disable batched IOTLB flush\n");
476 intel_iommu_strict = 1;
477 } else if (!strncmp(str, "sp_off", 6)) {
479 "Intel-IOMMU: disable supported super page\n");
480 intel_iommu_superpage = 0;
483 str += strcspn(str, ",");
489 __setup("intel_iommu=", intel_iommu_setup);
491 static struct kmem_cache *iommu_domain_cache;
492 static struct kmem_cache *iommu_devinfo_cache;
493 static struct kmem_cache *iommu_iova_cache;
495 static inline void *alloc_pgtable_page(int node)
500 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
502 vaddr = page_address(page);
506 static inline void free_pgtable_page(void *vaddr)
508 free_page((unsigned long)vaddr);
511 static inline void *alloc_domain_mem(void)
513 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
516 static void free_domain_mem(void *vaddr)
518 kmem_cache_free(iommu_domain_cache, vaddr);
521 static inline void *alloc_devinfo_mem(void)
523 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
526 static inline void free_devinfo_mem(void *vaddr)
528 kmem_cache_free(iommu_devinfo_cache, vaddr);
531 struct iova *alloc_iova_mem(void)
533 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
536 void free_iova_mem(struct iova *iova)
538 kmem_cache_free(iommu_iova_cache, iova);
541 static inline int domain_type_is_vm(struct dmar_domain *domain)
543 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
546 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
548 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
549 DOMAIN_FLAG_STATIC_IDENTITY);
552 static inline int domain_pfn_supported(struct dmar_domain *domain,
555 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
557 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
560 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 sagaw = cap_sagaw(iommu->cap);
566 for (agaw = width_to_agaw(max_gaw);
568 if (test_bit(agaw, &sagaw))
576 * Calculate max SAGAW for each iommu.
578 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
580 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
583 /*
584 * calculate agaw for each iommu.
585 * "SAGAW" may be different across iommus; use a default agaw, and
586 * fall back to a smaller supported agaw for iommus that don't support it.
587 */
588 int iommu_calculate_agaw(struct intel_iommu *iommu)
590 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
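/*
 * Illustrative aside (editorial, not in the original source): for
 * DEFAULT_DOMAIN_ADDRESS_WIDTH == 48, width_to_agaw(48) ==
 * DIV_ROUND_UP(48 - 30, 9) == 2, which corresponds to a 4-level page
 * table (agaw_to_level(2) == 4). If SAGAW bit 2 is clear,
 * __iommu_calculate_agaw() keeps scanning downward for the next
 * supported (smaller) agaw.
 */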
593 /* This function only returns a single iommu in a domain */
594 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 /* si_domain and vm domain should not get here. */
599 BUG_ON(domain_type_is_vm_or_si(domain));
600 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
601 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
604 return g_iommus[iommu_id];
607 static void domain_update_iommu_coherency(struct dmar_domain *domain)
609 struct dmar_drhd_unit *drhd;
610 struct intel_iommu *iommu;
613 domain->iommu_coherency = 1;
615 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
617 if (!ecap_coherent(g_iommus[i]->ecap)) {
618 domain->iommu_coherency = 0;
625 /* No hardware attached; use lowest common denominator */
627 for_each_active_iommu(iommu, drhd) {
628 if (!ecap_coherent(iommu->ecap)) {
629 domain->iommu_coherency = 0;
636 static void domain_update_iommu_snooping(struct dmar_domain *domain)
640 domain->iommu_snooping = 1;
642 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
643 if (!ecap_sc_support(g_iommus[i]->ecap)) {
644 domain->iommu_snooping = 0;
650 static void domain_update_iommu_superpage(struct dmar_domain *domain)
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu = NULL;
656 if (!intel_iommu_superpage) {
657 domain->iommu_superpage = 0;
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
664 mask &= cap_super_page_val(iommu->cap);
671 domain->iommu_superpage = fls(mask);
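/*
 * Illustrative aside (editorial, not in the original source): in the
 * capability register's super-page field, bit 0 means 2MiB pages and
 * bit 1 means 1GiB pages (per the VT-d spec). If one active iommu
 * reports 0x3 (2MiB + 1GiB) and another only 0x1 (2MiB), the loop
 * above leaves mask == 0x1 and fls(0x1) == 1, limiting the domain to
 * 2MiB superpages -- the smallest common denominator.
 */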
674 /* Some capabilities may be different across iommus */
675 static void domain_update_iommu_cap(struct dmar_domain *domain)
677 domain_update_iommu_coherency(domain);
678 domain_update_iommu_snooping(domain);
679 domain_update_iommu_superpage(domain);
682 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
684 struct dmar_drhd_unit *drhd = NULL;
685 struct intel_iommu *iommu;
687 struct pci_dev *ptmp, *pdev = NULL;
691 if (dev_is_pci(dev)) {
692 pdev = to_pci_dev(dev);
693 segment = pci_domain_nr(pdev->bus);
694 } else if (ACPI_COMPANION(dev))
695 dev = &ACPI_COMPANION(dev)->dev;
698 for_each_active_iommu(iommu, drhd) {
699 if (pdev && segment != drhd->segment)
702 for_each_active_dev_scope(drhd->devices,
703 drhd->devices_cnt, i, tmp) {
705 *bus = drhd->devices[i].bus;
706 *devfn = drhd->devices[i].devfn;
710 if (!pdev || !dev_is_pci(tmp))
713 ptmp = to_pci_dev(tmp);
714 if (ptmp->subordinate &&
715 ptmp->subordinate->number <= pdev->bus->number &&
716 ptmp->subordinate->busn_res.end >= pdev->bus->number)
720 if (pdev && drhd->include_all) {
722 *bus = pdev->bus->number;
723 *devfn = pdev->devfn;
734 static void domain_flush_cache(struct dmar_domain *domain,
735 void *addr, int size)
737 if (!domain->iommu_coherency)
738 clflush_cache_range(addr, size);
741 /* Gets context entry for a given bus and devfn */
742 static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
745 struct root_entry *root;
746 struct context_entry *context;
747 unsigned long phy_addr;
750 spin_lock_irqsave(&iommu->lock, flags);
751 root = &iommu->root_entry[bus];
752 context = get_context_addr_from_root(root);
754 context = (struct context_entry *)
755 alloc_pgtable_page(iommu->node);
757 spin_unlock_irqrestore(&iommu->lock, flags);
760 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
761 phy_addr = virt_to_phys((void *)context);
762 set_root_value(root, phy_addr);
763 set_root_present(root);
764 __iommu_flush_cache(iommu, root, sizeof(*root));
766 spin_unlock_irqrestore(&iommu->lock, flags);
767 return &context[devfn];
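/*
 * Illustrative aside (editorial, not in the original source): the
 * lookup above is a two-level affair. For bus 0x1a, devfn 0x10
 * (device 2, function 0), root_entry[0x1a] points at one 4KiB page
 * (CONTEXT_SIZE == VTD_PAGE_SIZE) holding 256 context entries, and
 * &context[0x10] is the entry for that device/function. The context
 * page is allocated lazily on first use, as the code above shows.
 */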
770 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
772 struct root_entry *root;
773 struct context_entry *context;
777 spin_lock_irqsave(&iommu->lock, flags);
778 root = &iommu->root_entry[bus];
779 context = get_context_addr_from_root(root);
784 ret = context_present(&context[devfn]);
786 spin_unlock_irqrestore(&iommu->lock, flags);
790 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
792 struct root_entry *root;
793 struct context_entry *context;
796 spin_lock_irqsave(&iommu->lock, flags);
797 root = &iommu->root_entry[bus];
798 context = get_context_addr_from_root(root);
800 context_clear_entry(&context[devfn]);
801 __iommu_flush_cache(iommu, &context[devfn],
804 spin_unlock_irqrestore(&iommu->lock, flags);
807 static void free_context_table(struct intel_iommu *iommu)
809 struct root_entry *root;
812 struct context_entry *context;
814 spin_lock_irqsave(&iommu->lock, flags);
815 if (!iommu->root_entry) {
818 for (i = 0; i < ROOT_ENTRY_NR; i++) {
819 root = &iommu->root_entry[i];
820 context = get_context_addr_from_root(root);
822 free_pgtable_page(context);
824 free_pgtable_page(iommu->root_entry);
825 iommu->root_entry = NULL;
827 spin_unlock_irqrestore(&iommu->lock, flags);
830 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
831 unsigned long pfn, int *target_level)
833 struct dma_pte *parent, *pte = NULL;
834 int level = agaw_to_level(domain->agaw);
837 BUG_ON(!domain->pgd);
839 if (!domain_pfn_supported(domain, pfn))
840 /* Address beyond IOMMU's addressing capabilities. */
843 parent = domain->pgd;
848 offset = pfn_level_offset(pfn, level);
849 pte = &parent[offset];
850 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
852 if (level == *target_level)
855 if (!dma_pte_present(pte)) {
858 tmp_page = alloc_pgtable_page(domain->nid);
863 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
864 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
865 if (cmpxchg64(&pte->val, 0ULL, pteval))
866 /* Someone else set it while we were thinking; use theirs. */
867 free_pgtable_page(tmp_page);
869 domain_flush_cache(domain, pte, sizeof(*pte));
874 parent = phys_to_virt(dma_pte_addr(pte));
879 *target_level = level;
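/*
 * Illustrative aside (editorial, not in the original source): for a
 * 4-level (48-bit) domain, looking up pfn 0x12345 starts at level 4
 * with parent[pfn_level_offset(0x12345, 4)], i.e. index 0, then walks
 * down through levels 3 and 2, allocating missing tables with
 * alloc_pgtable_page(), until *target_level is reached -- typically
 * level 1 for a 4KiB mapping.
 */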
885 /* return the pte for an address at a specific level */
886 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
888 int level, int *large_page)
890 struct dma_pte *parent, *pte = NULL;
891 int total = agaw_to_level(domain->agaw);
894 parent = domain->pgd;
895 while (level <= total) {
896 offset = pfn_level_offset(pfn, total);
897 pte = &parent[offset];
901 if (!dma_pte_present(pte)) {
906 if (dma_pte_superpage(pte)) {
911 parent = phys_to_virt(dma_pte_addr(pte));
917 /* clear last-level ptes; a tlb flush should follow */
918 static void dma_pte_clear_range(struct dmar_domain *domain,
919 unsigned long start_pfn,
920 unsigned long last_pfn)
922 unsigned int large_page = 1;
923 struct dma_pte *first_pte, *pte;
925 BUG_ON(!domain_pfn_supported(domain, start_pfn));
926 BUG_ON(!domain_pfn_supported(domain, last_pfn));
927 BUG_ON(start_pfn > last_pfn);
929 /* we don't need lock here; nobody else touches the iova range */
932 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
934 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
939 start_pfn += lvl_to_nr_pages(large_page);
941 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
943 domain_flush_cache(domain, first_pte,
944 (void *)pte - (void *)first_pte);
946 } while (start_pfn && start_pfn <= last_pfn);
949 static void dma_pte_free_level(struct dmar_domain *domain, int level,
950 struct dma_pte *pte, unsigned long pfn,
951 unsigned long start_pfn, unsigned long last_pfn)
953 pfn = max(start_pfn, pfn);
954 pte = &pte[pfn_level_offset(pfn, level)];
957 unsigned long level_pfn;
958 struct dma_pte *level_pte;
960 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
963 level_pfn = pfn & level_mask(level - 1);
964 level_pte = phys_to_virt(dma_pte_addr(pte));
967 dma_pte_free_level(domain, level - 1, level_pte,
968 level_pfn, start_pfn, last_pfn);
970 /* If range covers entire pagetable, free it */
971 if (!(start_pfn > level_pfn ||
972 last_pfn < level_pfn + level_size(level) - 1)) {
974 domain_flush_cache(domain, pte, sizeof(*pte));
975 free_pgtable_page(level_pte);
978 pfn += level_size(level);
979 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
982 /* free page table pages. last level pte should already be cleared */
983 static void dma_pte_free_pagetable(struct dmar_domain *domain,
984 unsigned long start_pfn,
985 unsigned long last_pfn)
987 BUG_ON(!domain_pfn_supported(domain, start_pfn));
988 BUG_ON(!domain_pfn_supported(domain, last_pfn));
989 BUG_ON(start_pfn > last_pfn);
991 dma_pte_clear_range(domain, start_pfn, last_pfn);
993 /* We don't need lock here; nobody else touches the iova range */
994 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
995 domain->pgd, 0, start_pfn, last_pfn);
998 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
999 free_pgtable_page(domain->pgd);
1004 /* When a page at a given level is being unlinked from its parent, we don't
1005 need to *modify* it at all. All we need to do is make a list of all the
1006 pages which can be freed just as soon as we've flushed the IOTLB and we
1007 know the hardware page-walk will no longer touch them.
1008 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1010 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1011 int level, struct dma_pte *pte,
1012 struct page *freelist)
1016 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1017 pg->freelist = freelist;
1023 pte = page_address(pg);
1025 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1026 freelist = dma_pte_list_pagetables(domain, level - 1,
1029 } while (!first_pte_in_page(pte));
1034 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1035 struct dma_pte *pte, unsigned long pfn,
1036 unsigned long start_pfn,
1037 unsigned long last_pfn,
1038 struct page *freelist)
1040 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1042 pfn = max(start_pfn, pfn);
1043 pte = &pte[pfn_level_offset(pfn, level)];
1046 unsigned long level_pfn;
1048 if (!dma_pte_present(pte))
1051 level_pfn = pfn & level_mask(level);
1053 /* If range covers entire pagetable, free it */
1054 if (start_pfn <= level_pfn &&
1055 last_pfn >= level_pfn + level_size(level) - 1) {
1056 /* These subordinate page tables are going away entirely. Don't
1057 bother to clear them; we're just going to *free* them. */
1058 if (level > 1 && !dma_pte_superpage(pte))
1059 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1065 } else if (level > 1) {
1066 /* Recurse down into a level that isn't *entirely* obsolete */
1067 freelist = dma_pte_clear_level(domain, level - 1,
1068 phys_to_virt(dma_pte_addr(pte)),
1069 level_pfn, start_pfn, last_pfn,
1073 pfn += level_size(level);
1074 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1077 domain_flush_cache(domain, first_pte,
1078 (void *)++last_pte - (void *)first_pte);
1083 /* We can't just free the pages because the IOMMU may still be walking
1084 the page tables, and may have cached the intermediate levels. The
1085 pages can only be freed after the IOTLB flush has been done. */
1086 struct page *domain_unmap(struct dmar_domain *domain,
1087 unsigned long start_pfn,
1088 unsigned long last_pfn)
1090 struct page *freelist = NULL;
1092 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1093 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1094 BUG_ON(start_pfn > last_pfn);
1096 /* we don't need lock here; nobody else touches the iova range */
1097 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1098 domain->pgd, 0, start_pfn, last_pfn, NULL);
1101 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1102 struct page *pgd_page = virt_to_page(domain->pgd);
1103 pgd_page->freelist = freelist;
1104 freelist = pgd_page;
1112 void dma_free_pagelist(struct page *freelist)
1116 while ((pg = freelist)) {
1117 freelist = pg->freelist;
1118 free_pgtable_page(page_address(pg));
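/*
 * Illustrative aside (editorial, not in the original source): the
 * freelist built by domain_unmap() is a singly linked chain threaded
 * through page->freelist, e.g. pgd page -> level-3 table -> level-2
 * table -> NULL. Callers flush the IOTLB first and only then call
 * dma_free_pagelist(), so the hardware page-walk can never reach a
 * freed table.
 */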
1122 /* iommu handling */
1123 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1125 struct root_entry *root;
1126 unsigned long flags;
1128 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1132 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1134 spin_lock_irqsave(&iommu->lock, flags);
1135 iommu->root_entry = root;
1136 spin_unlock_irqrestore(&iommu->lock, flags);
1141 static void iommu_set_root_entry(struct intel_iommu *iommu)
1147 addr = iommu->root_entry;
1149 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1150 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1152 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1154 /* Make sure the hardware completes it */
1155 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1156 readl, (sts & DMA_GSTS_RTPS), sts);
1158 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1161 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1166 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1169 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1170 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1172 /* Make sure the hardware completes it */
1173 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1174 readl, (!(val & DMA_GSTS_WBFS)), val);
1176 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1179 /* the return value determines whether we need a write-buffer flush */
1180 static void __iommu_flush_context(struct intel_iommu *iommu,
1181 u16 did, u16 source_id, u8 function_mask,
1188 case DMA_CCMD_GLOBAL_INVL:
1189 val = DMA_CCMD_GLOBAL_INVL;
1191 case DMA_CCMD_DOMAIN_INVL:
1192 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1194 case DMA_CCMD_DEVICE_INVL:
1195 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1196 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1201 val |= DMA_CCMD_ICC;
1203 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1204 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1206 /* Make sure the hardware completes it */
1207 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1208 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1210 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1213 /* the return value determines whether we need a write-buffer flush */
1214 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1215 u64 addr, unsigned int size_order, u64 type)
1217 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1218 u64 val = 0, val_iva = 0;
1222 case DMA_TLB_GLOBAL_FLUSH:
1223 /* a global flush doesn't need to set IVA_REG */
1224 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1226 case DMA_TLB_DSI_FLUSH:
1227 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1229 case DMA_TLB_PSI_FLUSH:
1230 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1231 /* IH bit is passed in as part of address */
1232 val_iva = size_order | addr;
1237 /* Note: set drain read/write */
1240 * This is probably meant to be extra secure; it looks like we can
1241 * ignore it without any impact.
1242 */
1243 if (cap_read_drain(iommu->cap))
1244 val |= DMA_TLB_READ_DRAIN;
1246 if (cap_write_drain(iommu->cap))
1247 val |= DMA_TLB_WRITE_DRAIN;
1249 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1250 /* Note: Only uses first TLB reg currently */
1252 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1253 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1255 /* Make sure the hardware completes it */
1256 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1257 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1259 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1261 /* check IOTLB invalidation granularity */
1262 if (DMA_TLB_IAIG(val) == 0)
1263 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1264 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1265 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1266 (unsigned long long)DMA_TLB_IIRG(type),
1267 (unsigned long long)DMA_TLB_IAIG(val));
1270 static struct device_domain_info *
1271 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1275 unsigned long flags;
1276 struct device_domain_info *info;
1277 struct pci_dev *pdev;
1279 if (!ecap_dev_iotlb_support(iommu->ecap))
1285 spin_lock_irqsave(&device_domain_lock, flags);
1286 list_for_each_entry(info, &domain->devices, link)
1287 if (info->iommu == iommu && info->bus == bus &&
1288 info->devfn == devfn) {
1292 spin_unlock_irqrestore(&device_domain_lock, flags);
1294 if (!found || !info->dev || !dev_is_pci(info->dev))
1297 pdev = to_pci_dev(info->dev);
1299 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1302 if (!dmar_find_matched_atsr_unit(pdev))
1308 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1310 if (!info || !dev_is_pci(info->dev))
1313 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1316 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1318 if (!info->dev || !dev_is_pci(info->dev) ||
1319 !pci_ats_enabled(to_pci_dev(info->dev)))
1322 pci_disable_ats(to_pci_dev(info->dev));
1325 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1326 u64 addr, unsigned mask)
1329 unsigned long flags;
1330 struct device_domain_info *info;
1332 spin_lock_irqsave(&device_domain_lock, flags);
1333 list_for_each_entry(info, &domain->devices, link) {
1334 struct pci_dev *pdev;
1335 if (!info->dev || !dev_is_pci(info->dev))
1338 pdev = to_pci_dev(info->dev);
1339 if (!pci_ats_enabled(pdev))
1342 sid = info->bus << 8 | info->devfn;
1343 qdep = pci_ats_queue_depth(pdev);
1344 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1346 spin_unlock_irqrestore(&device_domain_lock, flags);
1349 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1350 unsigned long pfn, unsigned int pages, int ih, int map)
1352 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1353 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1360 * Fall back to domain-selective flush if there is no PSI support or the
1361 * size is too big.
1362 * PSI requires the page size to be 2 ^ x, and the base address to be
1363 * naturally aligned to the size.
1364 */
1365 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1366 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1369 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1372 /*
1373 * In caching mode, changes of pages from non-present to present require
1374 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1375 */
1376 if (!cap_caching_mode(iommu->cap) || !map)
1377 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
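/*
 * Illustrative aside (editorial, not in the original source): for a
 * 9-page flush request, mask == ilog2(__roundup_pow_of_two(9)) ==
 * ilog2(16) == 4, so the page-selective invalidation covers
 * 2^4 == 16 pages at addr -- slightly more than requested, since PSI
 * only supports naturally aligned power-of-two ranges.
 */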
1380 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1383 unsigned long flags;
1385 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1386 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1387 pmen &= ~DMA_PMEN_EPM;
1388 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1390 /* wait for the protected region status bit to clear */
1391 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1392 readl, !(pmen & DMA_PMEN_PRS), pmen);
1394 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1397 static void iommu_enable_translation(struct intel_iommu *iommu)
1400 unsigned long flags;
1402 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1403 iommu->gcmd |= DMA_GCMD_TE;
1404 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1406 /* Make sure the hardware completes it */
1407 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1408 readl, (sts & DMA_GSTS_TES), sts);
1410 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1413 static void iommu_disable_translation(struct intel_iommu *iommu)
1418 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1419 iommu->gcmd &= ~DMA_GCMD_TE;
1420 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1422 /* Make sure the hardware completes it */
1423 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1424 readl, (!(sts & DMA_GSTS_TES)), sts);
1426 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430 static int iommu_init_domains(struct intel_iommu *iommu)
1432 unsigned long ndomains;
1433 unsigned long nlongs;
1435 ndomains = cap_ndoms(iommu->cap);
1436 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1437 iommu->seq_id, ndomains);
1438 nlongs = BITS_TO_LONGS(ndomains);
1440 spin_lock_init(&iommu->lock);
1442 /* TBD: there might be 64K domains,
1443 * consider other allocation for future chips
1444 */
1445 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1446 if (!iommu->domain_ids) {
1447 pr_err("IOMMU%d: allocating domain id array failed\n",
1451 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1453 if (!iommu->domains) {
1454 pr_err("IOMMU%d: allocating domain array failed\n",
1456 kfree(iommu->domain_ids);
1457 iommu->domain_ids = NULL;
1462 * If caching mode is set, then invalid translations are tagged
1463 * with domain id 0, hence we need to pre-allocate it.
1464 */
1465 if (cap_caching_mode(iommu->cap))
1466 set_bit(0, iommu->domain_ids);
1470 static void free_dmar_iommu(struct intel_iommu *iommu)
1472 struct dmar_domain *domain;
1475 if ((iommu->domains) && (iommu->domain_ids)) {
1476 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1478 * Domain id 0 is reserved for invalid translation
1479 * if hardware supports caching mode.
1481 if (cap_caching_mode(iommu->cap) && i == 0)
1484 domain = iommu->domains[i];
1485 clear_bit(i, iommu->domain_ids);
1486 if (domain_detach_iommu(domain, iommu) == 0 &&
1487 !domain_type_is_vm(domain))
1488 domain_exit(domain);
1492 if (iommu->gcmd & DMA_GCMD_TE)
1493 iommu_disable_translation(iommu);
1495 kfree(iommu->domains);
1496 kfree(iommu->domain_ids);
1497 iommu->domains = NULL;
1498 iommu->domain_ids = NULL;
1500 g_iommus[iommu->seq_id] = NULL;
1502 /* free context mapping */
1503 free_context_table(iommu);
1506 static struct dmar_domain *alloc_domain(int flags)
1508 /* domain id for a virtual machine; it won't be set in a context entry */
1509 static atomic_t vm_domid = ATOMIC_INIT(0);
1510 struct dmar_domain *domain;
1512 domain = alloc_domain_mem();
1516 memset(domain, 0, sizeof(*domain));
1518 domain->flags = flags;
1519 spin_lock_init(&domain->iommu_lock);
1520 INIT_LIST_HEAD(&domain->devices);
1521 if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1522 domain->id = atomic_inc_return(&vm_domid);
1527 static int __iommu_attach_domain(struct dmar_domain *domain,
1528 struct intel_iommu *iommu)
1531 unsigned long ndomains;
1533 ndomains = cap_ndoms(iommu->cap);
1534 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1535 if (num < ndomains) {
1536 set_bit(num, iommu->domain_ids);
1537 iommu->domains[num] = domain;
1545 static int iommu_attach_domain(struct dmar_domain *domain,
1546 struct intel_iommu *iommu)
1549 unsigned long flags;
1551 spin_lock_irqsave(&iommu->lock, flags);
1552 num = __iommu_attach_domain(domain, iommu);
1553 spin_unlock_irqrestore(&iommu->lock, flags);
1555 pr_err("IOMMU: no free domain ids\n");
1560 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1561 struct intel_iommu *iommu)
1564 unsigned long ndomains;
1566 ndomains = cap_ndoms(iommu->cap);
1567 for_each_set_bit(num, iommu->domain_ids, ndomains)
1568 if (iommu->domains[num] == domain)
1571 return __iommu_attach_domain(domain, iommu);
1574 static void iommu_detach_domain(struct dmar_domain *domain,
1575 struct intel_iommu *iommu)
1577 unsigned long flags;
1580 spin_lock_irqsave(&iommu->lock, flags);
1581 if (domain_type_is_vm_or_si(domain)) {
1582 ndomains = cap_ndoms(iommu->cap);
1583 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1584 if (iommu->domains[num] == domain) {
1585 clear_bit(num, iommu->domain_ids);
1586 iommu->domains[num] = NULL;
1591 clear_bit(domain->id, iommu->domain_ids);
1592 iommu->domains[domain->id] = NULL;
1594 spin_unlock_irqrestore(&iommu->lock, flags);
1597 static void domain_attach_iommu(struct dmar_domain *domain,
1598 struct intel_iommu *iommu)
1600 unsigned long flags;
1602 spin_lock_irqsave(&domain->iommu_lock, flags);
1603 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1604 domain->iommu_count++;
1605 if (domain->iommu_count == 1)
1606 domain->nid = iommu->node;
1607 domain_update_iommu_cap(domain);
1609 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1612 static int domain_detach_iommu(struct dmar_domain *domain,
1613 struct intel_iommu *iommu)
1615 unsigned long flags;
1616 int count = INT_MAX;
1618 spin_lock_irqsave(&domain->iommu_lock, flags);
1619 if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1620 count = --domain->iommu_count;
1621 domain_update_iommu_cap(domain);
1623 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1628 static struct iova_domain reserved_iova_list;
1629 static struct lock_class_key reserved_rbtree_key;
1631 static int dmar_init_reserved_ranges(void)
1633 struct pci_dev *pdev = NULL;
1637 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1639 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1640 &reserved_rbtree_key);
1642 /* IOAPIC ranges shouldn't be accessed by DMA */
1643 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1644 IOVA_PFN(IOAPIC_RANGE_END));
1646 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1650 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1651 for_each_pci_dev(pdev) {
1654 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1655 r = &pdev->resource[i];
1656 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1658 iova = reserve_iova(&reserved_iova_list,
1662 printk(KERN_ERR "Reserve iova failed\n");
1670 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1672 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1675 static inline int guestwidth_to_adjustwidth(int gaw)
1678 int r = (gaw - 12) % 9;
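/*
 * Illustrative aside (editorial, not in the original source): the
 * adjusted width rounds the guest width up to the next 9-bit level
 * boundary above 12 bits. E.g. for gaw == 36, r == (36 - 12) % 9 == 6,
 * so the adjusted width is 36 + 9 - 6 == 39 bits -- a whole number of
 * levels, since (39 - 12) / 9 == 3.
 */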
1689 static int domain_init(struct dmar_domain *domain, int guest_width)
1691 struct intel_iommu *iommu;
1692 int adjust_width, agaw;
1693 unsigned long sagaw;
1695 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1696 domain_reserve_special_ranges(domain);
1698 /* calculate AGAW */
1699 iommu = domain_get_iommu(domain);
1700 if (guest_width > cap_mgaw(iommu->cap))
1701 guest_width = cap_mgaw(iommu->cap);
1702 domain->gaw = guest_width;
1703 adjust_width = guestwidth_to_adjustwidth(guest_width);
1704 agaw = width_to_agaw(adjust_width);
1705 sagaw = cap_sagaw(iommu->cap);
1706 if (!test_bit(agaw, &sagaw)) {
1707 /* hardware doesn't support it, choose a bigger one */
1708 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1709 agaw = find_next_bit(&sagaw, 5, agaw);
1713 domain->agaw = agaw;
1715 if (ecap_coherent(iommu->ecap))
1716 domain->iommu_coherency = 1;
1718 domain->iommu_coherency = 0;
1720 if (ecap_sc_support(iommu->ecap))
1721 domain->iommu_snooping = 1;
1723 domain->iommu_snooping = 0;
1725 if (intel_iommu_superpage)
1726 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1728 domain->iommu_superpage = 0;
1730 domain->nid = iommu->node;
1732 /* always allocate the top pgd */
1733 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1736 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1740 static void domain_exit(struct dmar_domain *domain)
1742 struct dmar_drhd_unit *drhd;
1743 struct intel_iommu *iommu;
1744 struct page *freelist = NULL;
1746 /* Domain 0 is reserved, so don't process it */
1750 /* Flush any lazy unmaps that may reference this domain */
1751 if (!intel_iommu_strict)
1752 flush_unmaps_timeout(0);
1754 /* remove associated devices */
1755 domain_remove_dev_info(domain);
1758 put_iova_domain(&domain->iovad);
1760 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1762 /* clear attached or cached domains */
1764 for_each_active_iommu(iommu, drhd)
1765 iommu_detach_domain(domain, iommu);
1768 dma_free_pagelist(freelist);
1770 free_domain_mem(domain);
1773 static int domain_context_mapping_one(struct dmar_domain *domain,
1774 struct intel_iommu *iommu,
1775 u8 bus, u8 devfn, int translation)
1777 struct context_entry *context;
1778 unsigned long flags;
1779 struct dma_pte *pgd;
1782 struct device_domain_info *info = NULL;
1784 pr_debug("Set context mapping for %02x:%02x.%d\n",
1785 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1787 BUG_ON(!domain->pgd);
1788 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1789 translation != CONTEXT_TT_MULTI_LEVEL);
1791 context = device_to_context_entry(iommu, bus, devfn);
1794 spin_lock_irqsave(&iommu->lock, flags);
1795 if (context_present(context)) {
1796 spin_unlock_irqrestore(&iommu->lock, flags);
1803 if (domain_type_is_vm_or_si(domain)) {
1804 if (domain_type_is_vm(domain)) {
1805 id = iommu_attach_vm_domain(domain, iommu);
1807 spin_unlock_irqrestore(&iommu->lock, flags);
1808 pr_err("IOMMU: no free domain ids\n");
1813 /* Skip top levels of page tables for
1814 * iommus which have a smaller agaw than the default.
1815 * Unnecessary for PT mode.
1816 */
1817 if (translation != CONTEXT_TT_PASS_THROUGH) {
1818 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1819 pgd = phys_to_virt(dma_pte_addr(pgd));
1820 if (!dma_pte_present(pgd)) {
1821 spin_unlock_irqrestore(&iommu->lock, flags);
1828 context_set_domain_id(context, id);
1830 if (translation != CONTEXT_TT_PASS_THROUGH) {
1831 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1832 translation = info ? CONTEXT_TT_DEV_IOTLB :
1833 CONTEXT_TT_MULTI_LEVEL;
1836 * In pass-through mode, AW must be programmed to indicate the largest
1837 * AGAW value supported by the hardware, and ASR is ignored by the hardware.
1838 */
1839 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1840 context_set_address_width(context, iommu->msagaw);
1842 context_set_address_root(context, virt_to_phys(pgd));
1843 context_set_address_width(context, iommu->agaw);
1846 context_set_translation_type(context, translation);
1847 context_set_fault_enable(context);
1848 context_set_present(context);
1849 domain_flush_cache(domain, context, sizeof(*context));
1852 * It's a non-present to present mapping. If the hardware doesn't cache
1853 * non-present entries we only need to flush the write-buffer. If it
1854 * _does_ cache non-present entries, then it does so in the special
1855 * domain #0, which we have to flush:
1856 */
1857 if (cap_caching_mode(iommu->cap)) {
1858 iommu->flush.flush_context(iommu, 0,
1859 (((u16)bus) << 8) | devfn,
1860 DMA_CCMD_MASK_NOBIT,
1861 DMA_CCMD_DEVICE_INVL);
1862 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1864 iommu_flush_write_buffer(iommu);
1866 iommu_enable_dev_iotlb(info);
1867 spin_unlock_irqrestore(&iommu->lock, flags);
1869 domain_attach_iommu(domain, iommu);
1874 struct domain_context_mapping_data {
1875 struct dmar_domain *domain;
1876 struct intel_iommu *iommu;
1880 static int domain_context_mapping_cb(struct pci_dev *pdev,
1881 u16 alias, void *opaque)
1883 struct domain_context_mapping_data *data = opaque;
1885 return domain_context_mapping_one(data->domain, data->iommu,
1886 PCI_BUS_NUM(alias), alias & 0xff,
1891 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1894 struct intel_iommu *iommu;
1896 struct domain_context_mapping_data data;
1898 iommu = device_to_iommu(dev, &bus, &devfn);
1902 if (!dev_is_pci(dev))
1903 return domain_context_mapping_one(domain, iommu, bus, devfn,
1906 data.domain = domain;
1908 data.translation = translation;
1910 return pci_for_each_dma_alias(to_pci_dev(dev),
1911 &domain_context_mapping_cb, &data);
1914 static int domain_context_mapped_cb(struct pci_dev *pdev,
1915 u16 alias, void *opaque)
1917 struct intel_iommu *iommu = opaque;
1919 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1922 static int domain_context_mapped(struct device *dev)
1924 struct intel_iommu *iommu;
1927 iommu = device_to_iommu(dev, &bus, &devfn);
1931 if (!dev_is_pci(dev))
1932 return device_context_mapped(iommu, bus, devfn);
1934 return !pci_for_each_dma_alias(to_pci_dev(dev),
1935 domain_context_mapped_cb, iommu);
1938 /* Returns a number of VTD pages, but aligned to MM page size */
1939 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1942 host_addr &= ~PAGE_MASK;
1943 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
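/*
 * Illustrative aside (editorial, not in the original source): for
 * host_addr == 0x1800 and size == 0x1000, the sub-page offset is
 * 0x800, so PAGE_ALIGN(0x800 + 0x1000) == 0x2000 and the result is
 * 2 VTD pages -- the buffer straddles a page boundary even though it
 * is only one page long.
 */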
1946 /* Return largest possible superpage level for a given mapping */
1947 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1948 unsigned long iov_pfn,
1949 unsigned long phy_pfn,
1950 unsigned long pages)
1952 int support, level = 1;
1953 unsigned long pfnmerge;
1955 support = domain->iommu_superpage;
1957 /* To use a large page, the virtual *and* physical addresses
1958 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1959 of them will mean we have to use smaller pages. So just
1960 merge them and check both at once. */
1961 pfnmerge = iov_pfn | phy_pfn;
1963 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1964 pages >>= VTD_STRIDE_SHIFT;
1967 pfnmerge >>= VTD_STRIDE_SHIFT;
1974 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1975 struct scatterlist *sg, unsigned long phys_pfn,
1976 unsigned long nr_pages, int prot)
1978 struct dma_pte *first_pte = NULL, *pte = NULL;
1979 phys_addr_t uninitialized_var(pteval);
1980 unsigned long sg_res;
1981 unsigned int largepage_lvl = 0;
1982 unsigned long lvl_pages = 0;
1984 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
1986 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1989 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1994 sg_res = nr_pages + 1;
1995 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1998 while (nr_pages > 0) {
2002 sg_res = aligned_nrpages(sg->offset, sg->length);
2003 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2004 sg->dma_length = sg->length;
2005 pteval = page_to_phys(sg_page(sg)) | prot;
2006 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2010 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2012 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2015 /* It is a large page */
2016 if (largepage_lvl > 1) {
2017 pteval |= DMA_PTE_LARGE_PAGE;
2018 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2020 * Ensure that old small page tables are
2021 * removed to make room for superpage,
2024 dma_pte_free_pagetable(domain, iov_pfn,
2025 iov_pfn + lvl_pages - 1);
2027 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2031 /* We don't need lock here, nobody else
2032 * touches the iova range
2034 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2036 static int dumps = 5;
2037 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2038 iov_pfn, tmp, (unsigned long long)pteval);
2041 debug_dma_dump_mappings(NULL);
2046 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2048 BUG_ON(nr_pages < lvl_pages);
2049 BUG_ON(sg_res < lvl_pages);
2051 nr_pages -= lvl_pages;
2052 iov_pfn += lvl_pages;
2053 phys_pfn += lvl_pages;
2054 pteval += lvl_pages * VTD_PAGE_SIZE;
2055 sg_res -= lvl_pages;
2057 /* If the next PTE would be the first in a new page, then we
2058 need to flush the cache on the entries we've just written.
2059 And then we'll need to recalculate 'pte', so clear it and
2060 let it get set again in the if (!pte) block above.
2062 If we're done (!nr_pages) we need to flush the cache too.
2064 Also if we've been setting superpages, we may need to
2065 recalculate 'pte' and switch back to smaller pages for the
2066 end of the mapping, if the trailing size is not enough to
2067 use another superpage (i.e. sg_res < lvl_pages). */
2069 if (!nr_pages || first_pte_in_page(pte) ||
2070 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2071 domain_flush_cache(domain, first_pte,
2072 (void *)pte - (void *)first_pte);
2076 if (!sg_res && nr_pages)
2082 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2083 struct scatterlist *sg, unsigned long nr_pages,
2086 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2089 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2090 unsigned long phys_pfn, unsigned long nr_pages,
2093 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2096 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2101 clear_context_table(iommu, bus, devfn);
2102 iommu->flush.flush_context(iommu, 0, 0, 0,
2103 DMA_CCMD_GLOBAL_INVL);
2104 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2107 static inline void unlink_domain_info(struct device_domain_info *info)
2109 assert_spin_locked(&device_domain_lock);
2110 list_del(&info->link);
2111 list_del(&info->global);
2113 info->dev->archdata.iommu = NULL;
2116 static void domain_remove_dev_info(struct dmar_domain *domain)
2118 struct device_domain_info *info, *tmp;
2119 unsigned long flags;
2121 spin_lock_irqsave(&device_domain_lock, flags);
2122 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2123 unlink_domain_info(info);
2124 spin_unlock_irqrestore(&device_domain_lock, flags);
2126 iommu_disable_dev_iotlb(info);
2127 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2129 if (domain_type_is_vm(domain)) {
2130 iommu_detach_dependent_devices(info->iommu, info->dev);
2131 domain_detach_iommu(domain, info->iommu);
2134 free_devinfo_mem(info);
2135 spin_lock_irqsave(&device_domain_lock, flags);
2137 spin_unlock_irqrestore(&device_domain_lock, flags);
2142 * Note: we use struct device->archdata.iommu to store the info
2144 static struct dmar_domain *find_domain(struct device *dev)
2146 struct device_domain_info *info;
2148 /* No lock here, assumes no domain exit in normal case */
2149 info = dev->archdata.iommu;
2151 return info->domain;
2155 static inline struct device_domain_info *
2156 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2158 struct device_domain_info *info;
2160 list_for_each_entry(info, &device_domain_list, global)
2161 if (info->iommu->segment == segment && info->bus == bus &&
2162 info->devfn == devfn)
2168 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2171 struct dmar_domain *domain)
2173 struct dmar_domain *found = NULL;
2174 struct device_domain_info *info;
2175 unsigned long flags;
2177 info = alloc_devinfo_mem();
2182 info->devfn = devfn;
2184 info->domain = domain;
2185 info->iommu = iommu;
2187 spin_lock_irqsave(&device_domain_lock, flags);
2189 found = find_domain(dev);
2191 struct device_domain_info *info2;
2192 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2194 found = info2->domain;
2197 spin_unlock_irqrestore(&device_domain_lock, flags);
2198 free_devinfo_mem(info);
2199 /* Caller must free the original domain */
2203 list_add(&info->link, &domain->devices);
2204 list_add(&info->global, &device_domain_list);
2206 dev->archdata.iommu = info;
2207 spin_unlock_irqrestore(&device_domain_lock, flags);
2212 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2214 *(u16 *)opaque = alias;
2218 /* domain is initialized */
2219 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2221 struct dmar_domain *domain, *tmp;
2222 struct intel_iommu *iommu;
2223 struct device_domain_info *info;
2225 unsigned long flags;
2228 domain = find_domain(dev);
2232 iommu = device_to_iommu(dev, &bus, &devfn);
2236 if (dev_is_pci(dev)) {
2237 struct pci_dev *pdev = to_pci_dev(dev);
2239 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2241 spin_lock_irqsave(&device_domain_lock, flags);
2242 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2243 PCI_BUS_NUM(dma_alias),
2246 iommu = info->iommu;
2247 domain = info->domain;
2249 spin_unlock_irqrestore(&device_domain_lock, flags);
2251 /* DMA alias already has a domain, use it */
2256 /* Allocate and initialize new domain for the device */
2257 domain = alloc_domain(0);
2260 domain->id = iommu_attach_domain(domain, iommu);
2261 if (domain->id < 0) {
2262 free_domain_mem(domain);
2265 domain_attach_iommu(domain, iommu);
2266 if (domain_init(domain, gaw)) {
2267 domain_exit(domain);
2271 /* register PCI DMA alias device */
2272 if (dev_is_pci(dev)) {
2273 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2274 dma_alias & 0xff, NULL, domain);
2276 if (!tmp || tmp != domain) {
2277 domain_exit(domain);
2286 tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2288 if (!tmp || tmp != domain) {
2289 domain_exit(domain);
2296 static int iommu_identity_mapping;
2297 #define IDENTMAP_ALL 1
2298 #define IDENTMAP_GFX 2
2299 #define IDENTMAP_AZALIA 4
2301 static int iommu_domain_identity_map(struct dmar_domain *domain,
2302 unsigned long long start,
2303 unsigned long long end)
2305 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2306 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2308 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2309 dma_to_mm_pfn(last_vpfn))) {
2310 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2314 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2315 start, end, domain->id);
2316 /*
2317 * RMRR range might overlap with the physical memory range,
2318 * so clear it first
2319 */
2320 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2322 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2323 last_vpfn - first_vpfn + 1,
2324 DMA_PTE_READ|DMA_PTE_WRITE);
2327 static int iommu_prepare_identity_map(struct device *dev,
2328 unsigned long long start,
2329 unsigned long long end)
2331 struct dmar_domain *domain;
2334 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2338 /* For _hardware_ passthrough, don't bother. But for software
2339 passthrough, we do it anyway -- it may indicate a memory
2340 range which is reserved in E820, and so didn't get set
2341 up to start with in si_domain */
2342 if (domain == si_domain && hw_pass_through) {
2343 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2344 dev_name(dev), start, end);
2349 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2350 dev_name(dev), start, end);
2353 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2354 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2355 dmi_get_system_info(DMI_BIOS_VENDOR),
2356 dmi_get_system_info(DMI_BIOS_VERSION),
2357 dmi_get_system_info(DMI_PRODUCT_VERSION));
2362 if (end >> agaw_to_width(domain->agaw)) {
2363 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2364 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2365 agaw_to_width(domain->agaw),
2366 dmi_get_system_info(DMI_BIOS_VENDOR),
2367 dmi_get_system_info(DMI_BIOS_VERSION),
2368 dmi_get_system_info(DMI_PRODUCT_VERSION));
2373 ret = iommu_domain_identity_map(domain, start, end);
2377 /* context entry init */
2378 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2385 domain_exit(domain);
2389 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2392 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2394 return iommu_prepare_identity_map(dev, rmrr->base_address,
2398 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2399 static inline void iommu_prepare_isa(void)
2401 struct pci_dev *pdev;
2404 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2408 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2409 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2412 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2413 "floppy might not work\n");
2418 static inline void iommu_prepare_isa(void)
2422 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2424 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2426 static int __init si_domain_init(int hw)
2428 struct dmar_drhd_unit *drhd;
2429 struct intel_iommu *iommu;
2433 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2437 for_each_active_iommu(iommu, drhd) {
2438 ret = iommu_attach_domain(si_domain, iommu);
2440 domain_exit(si_domain);
2443 si_domain->id = ret;
2445 } else if (si_domain->id != ret) {
2446 domain_exit(si_domain);
2449 domain_attach_iommu(si_domain, iommu);
2452 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2453 domain_exit(si_domain);
2457 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2463 for_each_online_node(nid) {
2464 unsigned long start_pfn, end_pfn;
2467 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2468 ret = iommu_domain_identity_map(si_domain,
2469 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
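/*
 * Illustrative sketch, not part of the driver: si_domain_init() above
 * walks every usable memory range and maps it onto itself. The range
 * array here is a hypothetical stand-in for for_each_mem_pfn_range();
 * PFN_PHYS() is modeled as pfn << 12.
 */
struct example_pfn_range { unsigned long start_pfn, end_pfn; };

static unsigned long long example_identity_mapped_bytes(const struct example_pfn_range *r,
							int nranges)
{
	unsigned long long bytes = 0;
	int i;

	for (i = 0; i < nranges; i++)
		/* the driver calls iommu_domain_identity_map() per range */
		bytes += (unsigned long long)(r[i].end_pfn - r[i].start_pfn) << 12;
	return bytes;
}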
2478 static int identity_mapping(struct device *dev)
2480 struct device_domain_info *info;
2482 if (likely(!iommu_identity_mapping))
2485 info = dev->archdata.iommu;
2486 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2487 return (info->domain == si_domain);
2492 static int domain_add_dev_info(struct dmar_domain *domain,
2493 struct device *dev, int translation)
2495 struct dmar_domain *ndomain;
2496 struct intel_iommu *iommu;
2500 iommu = device_to_iommu(dev, &bus, &devfn);
2504 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2505 if (ndomain != domain)
2508 ret = domain_context_mapping(domain, dev, translation);
2510 domain_remove_one_dev_info(domain, dev);
2517 static bool device_has_rmrr(struct device *dev)
2519 struct dmar_rmrr_unit *rmrr;
2524 for_each_rmrr_units(rmrr) {
2526 * Return TRUE if this RMRR contains the device that is passed in.
2529 for_each_active_dev_scope(rmrr->devices,
2530 rmrr->devices_cnt, i, tmp)
2540 static int iommu_should_identity_map(struct device *dev, int startup)
2543 if (dev_is_pci(dev)) {
2544 struct pci_dev *pdev = to_pci_dev(dev);
2547 * We want to prevent any device associated with an RMRR from
2548 * getting placed into the SI Domain. This is done because
2549 * problems exist when devices are moved in and out of domains
2550 * and their respective RMRR info is lost. We exempt USB devices
2551 * from this process due to their usage of RMRRs that are known
2552 * to not be needed after BIOS hand-off to OS.
2554 if (device_has_rmrr(dev) &&
2555 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2558 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2561 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2564 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2568 * We want to start off with all devices in the 1:1 domain, and
2569 * take them out later if we find they can't access all of memory.
2571 * However, we can't do this for PCI devices behind bridges,
2572 * because all PCI devices behind the same bridge will end up
2573 * with the same source-id on their transactions.
2575 * Practically speaking, we can't change things around for these
2576 * devices at run-time, because we can't be sure there'll be no
2577 * DMA transactions in flight for any of their siblings.
2579 * So PCI devices (unless they're on the root bus) as well as
2580 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2581 * the 1:1 domain, just in _case_ one of their siblings turns out
2582 * not to be able to map all of memory.
2584 if (!pci_is_pcie(pdev)) {
2585 if (!pci_is_root_bus(pdev->bus))
2587 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2589 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2592 if (device_has_rmrr(dev))
2597 * At boot time, we don't yet know if devices will be 64-bit capable.
2598 * Assume that they will -- if they turn out not to be, then we can
2599 * take them out of the 1:1 domain later.
2603 * If the device's dma_mask is less than the system's memory
2604 * size then this is not a candidate for identity mapping.
2606 u64 dma_mask = *dev->dma_mask;
2608 if (dev->coherent_dma_mask &&
2609 dev->coherent_dma_mask < dma_mask)
2610 dma_mask = dev->coherent_dma_mask;
2612 return dma_mask >= dma_get_required_mask(dev);
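/*
 * Illustrative sketch, not part of the driver: the 64-bit check above
 * keeps a device in the 1:1 domain only if the narrower of its
 * streaming and coherent DMA masks still covers everything
 * dma_get_required_mask() says the platform needs.
 */
static int example_can_stay_identity_mapped(unsigned long long dma_mask,
					    unsigned long long coherent_dma_mask,
					    unsigned long long required_mask)
{
	if (coherent_dma_mask && coherent_dma_mask < dma_mask)
		dma_mask = coherent_dma_mask;

	/* e.g. a 32-bit mask (0xffffffff) fails on a box with RAM above 4GiB */
	return dma_mask >= required_mask;
}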
2618 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2622 if (!iommu_should_identity_map(dev, 1))
2625 ret = domain_add_dev_info(si_domain, dev,
2626 hw ? CONTEXT_TT_PASS_THROUGH :
2627 CONTEXT_TT_MULTI_LEVEL);
2629 pr_info("IOMMU: %s identity mapping for device %s\n",
2630 hw ? "hardware" : "software", dev_name(dev));
2631 else if (ret == -ENODEV)
2632 /* device not associated with an iommu */
2639 static int __init iommu_prepare_static_identity_mapping(int hw)
2641 struct pci_dev *pdev = NULL;
2642 struct dmar_drhd_unit *drhd;
2643 struct intel_iommu *iommu;
2648 ret = si_domain_init(hw);
2652 for_each_pci_dev(pdev) {
2653 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2658 for_each_active_iommu(iommu, drhd)
2659 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2660 struct acpi_device_physical_node *pn;
2661 struct acpi_device *adev;
2663 if (dev->bus != &acpi_bus_type)
2666 adev = to_acpi_device(dev);
2667 mutex_lock(&adev->physical_node_lock);
2668 list_for_each_entry(pn, &adev->physical_node_list, node) {
2669 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2673 mutex_unlock(&adev->physical_node_lock);
2681 static int __init init_dmars(void)
2683 struct dmar_drhd_unit *drhd;
2684 struct dmar_rmrr_unit *rmrr;
2686 struct intel_iommu *iommu;
2692 * initialize and program root entry to not present
2695 for_each_drhd_unit(drhd) {
2697 * lock not needed as this is only incremented in the single
2698 * threaded kernel __init code path; all other access is read-only
2701 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2705 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2706 IOMMU_UNITS_SUPPORTED);
2709 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2712 printk(KERN_ERR "Allocating global iommu array failed\n");
2717 deferred_flush = kzalloc(g_num_of_iommus *
2718 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2719 if (!deferred_flush) {
2724 for_each_active_iommu(iommu, drhd) {
2725 g_iommus[iommu->seq_id] = iommu;
2727 ret = iommu_init_domains(iommu);
2733 * we could share the same root & context tables
2734 * among all IOMMUs; need to split them later.
2736 ret = iommu_alloc_root_entry(iommu);
2738 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2741 if (!ecap_pass_through(iommu->ecap))
2742 hw_pass_through = 0;
2746 * Start from a sane IOMMU hardware state.
2748 for_each_active_iommu(iommu, drhd) {
2750 * If the queued invalidation is already initialized by us
2751 * (for example, while enabling interrupt-remapping) then
2752 * we already have things rolling from a sane state.
2758 * Clear any previous faults.
2760 dmar_fault(-1, iommu);
2762 * Disable queued invalidation if supported and already enabled
2763 * before OS handover.
2765 dmar_disable_qi(iommu);
2768 for_each_active_iommu(iommu, drhd) {
2769 if (dmar_enable_qi(iommu)) {
2771 * Queued Invalidate not enabled, use Register Based
2774 iommu->flush.flush_context = __iommu_flush_context;
2775 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2776 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2779 (unsigned long long)drhd->reg_base_addr);
2781 iommu->flush.flush_context = qi_flush_context;
2782 iommu->flush.flush_iotlb = qi_flush_iotlb;
2783 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2786 (unsigned long long)drhd->reg_base_addr);
2790 if (iommu_pass_through)
2791 iommu_identity_mapping |= IDENTMAP_ALL;
2793 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2794 iommu_identity_mapping |= IDENTMAP_GFX;
2797 check_tylersburg_isoch();
2800 * If pass-through is not set or not enabled, set up context entries for
2801 * identity mappings for RMRR, GFX and ISA, and possibly fall back to static
2802 * identity mapping if iommu_identity_mapping is set.
2804 if (iommu_identity_mapping) {
2805 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2807 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2813 * for each dev attached to rmrr
2815 * locate drhd for dev, alloc domain for dev
2816 * allocate free domain
2817 * allocate page table entries for rmrr
2818 * if context not allocated for bus
2819 * allocate and init context
2820 * set present in root table for this bus
2821 * init context with domain, translation etc
2825 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2826 for_each_rmrr_units(rmrr) {
2827 /* some BIOSes list non-existent devices in the DMAR table. */
2828 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2830 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2833 "IOMMU: mapping reserved region failed\n");
2837 iommu_prepare_isa();
2842 * global invalidate context cache
2843 * global invalidate iotlb
2844 * enable translation
2846 for_each_iommu(iommu, drhd) {
2847 if (drhd->ignored) {
2849 * we always have to disable PMRs or DMA may fail on this device
2853 iommu_disable_protect_mem_regions(iommu);
2857 iommu_flush_write_buffer(iommu);
2859 ret = dmar_set_interrupt(iommu);
2863 iommu_set_root_entry(iommu);
2865 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2866 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2867 iommu_enable_translation(iommu);
2868 iommu_disable_protect_mem_regions(iommu);
2874 for_each_active_iommu(iommu, drhd)
2875 free_dmar_iommu(iommu);
2876 kfree(deferred_flush);
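/*
 * Illustrative sketch, not part of the driver: the per-IOMMU bring-up
 * order used by init_dmars() above, written out as data. Each entry
 * names the helper the loop actually calls.
 */
static const char *const example_iommu_enable_sequence[] = {
	"flush write buffer",			/* iommu_flush_write_buffer()          */
	"install fault interrupt",		/* dmar_set_interrupt()                */
	"program root table pointer",		/* iommu_set_root_entry()              */
	"global context-cache invalidate",	/* flush_context(DMA_CCMD_GLOBAL_INVL) */
	"global IOTLB invalidate",		/* flush_iotlb(DMA_TLB_GLOBAL_FLUSH)   */
	"enable translation",			/* iommu_enable_translation()          */
	"disable protected memory regions",	/* iommu_disable_protect_mem_regions() */
};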
2883 /* This takes a number of _MM_ pages, not VTD pages */
2884 static struct iova *intel_alloc_iova(struct device *dev,
2885 struct dmar_domain *domain,
2886 unsigned long nrpages, uint64_t dma_mask)
2888 struct iova *iova = NULL;
2890 /* Restrict dma_mask to the width that the iommu can handle */
2891 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2893 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2895 * First try to allocate an io virtual address in
2896 * DMA_BIT_MASK(32) and if that fails then try allocating
2899 iova = alloc_iova(&domain->iovad, nrpages,
2900 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2904 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2905 if (unlikely(!iova)) {
2906 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2907 nrpages, dev_name(dev));
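/*
 * Illustrative sketch, not part of the driver: intel_alloc_iova()'s
 * two-step policy. example_alloc() is a hypothetical stand-in for
 * alloc_iova() that returns 0 on failure; PFNs use a 4KiB shift.
 */
static unsigned long example_alloc_with_fallback(unsigned long (*example_alloc)(unsigned long limit_pfn),
						 unsigned long long dma_mask,
						 int forcedac)
{
	unsigned long pfn = 0;

	/* prefer IOVAs below 4GiB so 32-bit-reachable addresses are kept... */
	if (!forcedac && dma_mask > 0xffffffffULL)
		pfn = example_alloc(0xffffffffULL >> 12);
	/* ...but fall back to the device's full reachable range */
	if (!pfn)
		pfn = example_alloc((unsigned long)(dma_mask >> 12));
	return pfn;
}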
2914 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2916 struct dmar_domain *domain;
2919 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2921 printk(KERN_ERR "Allocating domain for %s failed\n",
2926 /* make sure context mapping is ok */
2927 if (unlikely(!domain_context_mapped(dev))) {
2928 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2930 printk(KERN_ERR "Domain context map for %s failed\n",
2939 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2941 struct device_domain_info *info;
2943 /* No lock here, assumes no domain exit in normal case */
2944 info = dev->archdata.iommu;
2946 return info->domain;
2948 return __get_valid_domain_for_dev(dev);
2951 static int iommu_dummy(struct device *dev)
2953 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2956 /* Check if the dev needs to go through non-identity map and unmap process. */
2957 static int iommu_no_mapping(struct device *dev)
2961 if (iommu_dummy(dev))
2964 if (!iommu_identity_mapping)
2967 found = identity_mapping(dev);
2969 if (iommu_should_identity_map(dev, 0))
2973 * the 32 bit DMA device is removed from si_domain and falls back
2974 * to non-identity mapping.
2976 domain_remove_one_dev_info(si_domain, dev);
2977 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2983 * If a 64 bit DMA device was detached from a VM, the device
2984 * is put back into si_domain for identity mapping.
2986 if (iommu_should_identity_map(dev, 0)) {
2988 ret = domain_add_dev_info(si_domain, dev,
2990 CONTEXT_TT_PASS_THROUGH :
2991 CONTEXT_TT_MULTI_LEVEL);
2993 printk(KERN_INFO "64bit %s uses identity mapping\n",
3003 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3004 size_t size, int dir, u64 dma_mask)
3006 struct dmar_domain *domain;
3007 phys_addr_t start_paddr;
3011 struct intel_iommu *iommu;
3012 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3014 BUG_ON(dir == DMA_NONE);
3016 if (iommu_no_mapping(dev))
3019 domain = get_valid_domain_for_dev(dev);
3023 iommu = domain_get_iommu(domain);
3024 size = aligned_nrpages(paddr, size);
3026 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3031 * Check if DMAR supports zero-length reads on write-only mappings
3034 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3035 !cap_zlr(iommu->cap))
3036 prot |= DMA_PTE_READ;
3037 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3038 prot |= DMA_PTE_WRITE;
3040 * paddr .. (paddr + size) might span partial pages; we should map the whole
3041 * page. Note: if two parts of one page are separately mapped, we
3042 * might have two guest_addr mappings to the same host paddr, but this
3043 * is not a big problem
3045 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3046 mm_to_dma_pfn(paddr_pfn), size, prot);
3050 /* it's a non-present to present mapping. Only flush if caching mode */
3051 if (cap_caching_mode(iommu->cap))
3052 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3054 iommu_flush_write_buffer(iommu);
3056 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3057 start_paddr += paddr & ~PAGE_MASK;
3062 __free_iova(&domain->iovad, iova);
3063 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3064 dev_name(dev), size, (unsigned long long)paddr, dir);
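/*
 * Illustrative sketch, not part of the driver: the handle returned by
 * __intel_map_single() is the allocated IOVA page base plus the
 * sub-page offset of the original physical address (4KiB pages).
 */
static unsigned long long example_dma_handle(unsigned long iova_pfn_lo,
					     unsigned long long paddr)
{
	/* e.g. pfn_lo=0x80000, paddr=0x12345678 -> handle 0x80000678 */
	return ((unsigned long long)iova_pfn_lo << 12) + (paddr & 0xfffULL);
}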
3068 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3069 unsigned long offset, size_t size,
3070 enum dma_data_direction dir,
3071 struct dma_attrs *attrs)
3073 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3074 dir, *dev->dma_mask);
3077 static void flush_unmaps(void)
3083 /* just flush them all */
3084 for (i = 0; i < g_num_of_iommus; i++) {
3085 struct intel_iommu *iommu = g_iommus[i];
3089 if (!deferred_flush[i].next)
3092 /* In caching mode, global flushes make emulation expensive */
3093 if (!cap_caching_mode(iommu->cap))
3094 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3095 DMA_TLB_GLOBAL_FLUSH);
3096 for (j = 0; j < deferred_flush[i].next; j++) {
3098 struct iova *iova = deferred_flush[i].iova[j];
3099 struct dmar_domain *domain = deferred_flush[i].domain[j];
3101 /* On real hardware multiple invalidations are expensive */
3102 if (cap_caching_mode(iommu->cap))
3103 iommu_flush_iotlb_psi(iommu, domain->id,
3104 iova->pfn_lo, iova_size(iova),
3105 !deferred_flush[i].freelist[j], 0);
3107 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3108 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3109 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3111 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3112 if (deferred_flush[i].freelist[j])
3113 dma_free_pagelist(deferred_flush[i].freelist[j]);
3115 deferred_flush[i].next = 0;
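/*
 * Illustrative sketch, not part of the driver: the deferred-unmap idea
 * behind flush_unmaps() above. Entries queue per IOMMU until a
 * watermark or timer fires, then one flush covers the whole batch. The
 * table shape here is a simplified stand-in for the driver's
 * struct deferred_flush_tables.
 */
#define EXAMPLE_BATCH 250	/* mirrors the driver's high-water mark */

struct example_flush_batch {
	int next;
	unsigned long iova_pfn[EXAMPLE_BATCH];
};

static int example_queue_unmap(struct example_flush_batch *b, unsigned long pfn,
			       void (*flush_iotlb)(void))
{
	b->iova_pfn[b->next++] = pfn;
	if (b->next == EXAMPLE_BATCH) {
		flush_iotlb();	/* one invalidation instead of EXAMPLE_BATCH */
		b->next = 0;	/* queued IOVAs may now be freed and reused */
		return 1;
	}
	return 0;
}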
3121 static void flush_unmaps_timeout(unsigned long data)
3123 unsigned long flags;
3125 spin_lock_irqsave(&async_umap_flush_lock, flags);
3127 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3130 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3132 unsigned long flags;
3134 struct intel_iommu *iommu;
3136 spin_lock_irqsave(&async_umap_flush_lock, flags);
3137 if (list_size == HIGH_WATER_MARK)
3140 iommu = domain_get_iommu(dom);
3141 iommu_id = iommu->seq_id;
3143 next = deferred_flush[iommu_id].next;
3144 deferred_flush[iommu_id].domain[next] = dom;
3145 deferred_flush[iommu_id].iova[next] = iova;
3146 deferred_flush[iommu_id].freelist[next] = freelist;
3147 deferred_flush[iommu_id].next++;
3150 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3154 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3157 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3159 struct dmar_domain *domain;
3160 unsigned long start_pfn, last_pfn;
3162 struct intel_iommu *iommu;
3163 struct page *freelist;
3165 if (iommu_no_mapping(dev))
3168 domain = find_domain(dev);
3171 iommu = domain_get_iommu(domain);
3173 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3174 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3175 (unsigned long long)dev_addr))
3178 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3179 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3181 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3182 dev_name(dev), start_pfn, last_pfn);
3184 freelist = domain_unmap(domain, start_pfn, last_pfn);
3186 if (intel_iommu_strict) {
3187 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3188 last_pfn - start_pfn + 1, !freelist, 0);
3190 __free_iova(&domain->iovad, iova);
3191 dma_free_pagelist(freelist);
3193 add_unmap(domain, iova, freelist);
3195 * queue up the release of the unmap to save the 1/6th of the
3196 * CPU time used up by the iotlb flush operation...
3201 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3202 size_t size, enum dma_data_direction dir,
3203 struct dma_attrs *attrs)
3205 intel_unmap(dev, dev_addr);
3208 static void *intel_alloc_coherent(struct device *dev, size_t size,
3209 dma_addr_t *dma_handle, gfp_t flags,
3210 struct dma_attrs *attrs)
3212 struct page *page = NULL;
3215 size = PAGE_ALIGN(size);
3216 order = get_order(size);
3218 if (!iommu_no_mapping(dev))
3219 flags &= ~(GFP_DMA | GFP_DMA32);
3220 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3221 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3227 if (flags & __GFP_WAIT) {
3228 unsigned int count = size >> PAGE_SHIFT;
3230 page = dma_alloc_from_contiguous(dev, count, order);
3231 if (page && iommu_no_mapping(dev) &&
3232 page_to_phys(page) + size > dev->coherent_dma_mask) {
3233 dma_release_from_contiguous(dev, page, count);
3239 page = alloc_pages(flags, order);
3242 memset(page_address(page), 0, size);
3244 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3246 dev->coherent_dma_mask);
3248 return page_address(page);
3249 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3250 __free_pages(page, order);
3255 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3256 dma_addr_t dma_handle, struct dma_attrs *attrs)
3259 struct page *page = virt_to_page(vaddr);
3261 size = PAGE_ALIGN(size);
3262 order = get_order(size);
3264 intel_unmap(dev, dma_handle);
3265 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3266 __free_pages(page, order);
3269 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3270 int nelems, enum dma_data_direction dir,
3271 struct dma_attrs *attrs)
3273 intel_unmap(dev, sglist[0].dma_address);
3276 static int intel_nontranslate_map_sg(struct device *hddev,
3277 struct scatterlist *sglist, int nelems, int dir)
3280 struct scatterlist *sg;
3282 for_each_sg(sglist, sg, nelems, i) {
3283 BUG_ON(!sg_page(sg));
3284 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3285 sg->dma_length = sg->length;
3290 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3291 enum dma_data_direction dir, struct dma_attrs *attrs)
3294 struct dmar_domain *domain;
3297 struct iova *iova = NULL;
3299 struct scatterlist *sg;
3300 unsigned long start_vpfn;
3301 struct intel_iommu *iommu;
3303 BUG_ON(dir == DMA_NONE);
3304 if (iommu_no_mapping(dev))
3305 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3307 domain = get_valid_domain_for_dev(dev);
3311 iommu = domain_get_iommu(domain);
3313 for_each_sg(sglist, sg, nelems, i)
3314 size += aligned_nrpages(sg->offset, sg->length);
3316 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3319 sglist->dma_length = 0;
3324 * Check if DMAR supports zero-length reads on write-only mappings
3327 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3328 !cap_zlr(iommu->cap))
3329 prot |= DMA_PTE_READ;
3330 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3331 prot |= DMA_PTE_WRITE;
3333 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3335 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3336 if (unlikely(ret)) {
3337 dma_pte_free_pagetable(domain, start_vpfn,
3338 start_vpfn + size - 1);
3339 __free_iova(&domain->iovad, iova);
3343 /* it's a non-present to present mapping. Only flush if caching mode */
3344 if (cap_caching_mode(iommu->cap))
3345 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3347 iommu_flush_write_buffer(iommu);
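/*
 * Illustrative sketch, not part of the driver: sizing the single IOVA
 * allocation for a scatterlist. Each element contributes its length
 * rounded out to whole pages from its within-page offset, which is
 * what the aligned_nrpages() accumulation above computes per element.
 */
static unsigned long example_sg_nrpages(const unsigned int *offset,
					const unsigned int *length, int nelems)
{
	unsigned long size = 0;
	int i;

	for (i = 0; i < nelems; i++)
		/* e.g. offset 0xff0, length 0x20 straddles a boundary: 2 pages */
		size += ((unsigned long)offset[i] + length[i] + 0xfff) >> 12;
	return size;
}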
3352 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3357 struct dma_map_ops intel_dma_ops = {
3358 .alloc = intel_alloc_coherent,
3359 .free = intel_free_coherent,
3360 .map_sg = intel_map_sg,
3361 .unmap_sg = intel_unmap_sg,
3362 .map_page = intel_map_page,
3363 .unmap_page = intel_unmap_page,
3364 .mapping_error = intel_mapping_error,
3367 static inline int iommu_domain_cache_init(void)
3371 iommu_domain_cache = kmem_cache_create("iommu_domain",
3372 sizeof(struct dmar_domain),
3377 if (!iommu_domain_cache) {
3378 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3385 static inline int iommu_devinfo_cache_init(void)
3389 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3390 sizeof(struct device_domain_info),
3394 if (!iommu_devinfo_cache) {
3395 printk(KERN_ERR "Couldn't create devinfo cache\n");
3402 static inline int iommu_iova_cache_init(void)
3406 iommu_iova_cache = kmem_cache_create("iommu_iova",
3407 sizeof(struct iova),
3411 if (!iommu_iova_cache) {
3412 printk(KERN_ERR "Couldn't create iova cache\n");
3419 static int __init iommu_init_mempool(void)
3422 ret = iommu_iova_cache_init();
3426 ret = iommu_domain_cache_init();
3430 ret = iommu_devinfo_cache_init();
3434 kmem_cache_destroy(iommu_domain_cache);
3436 kmem_cache_destroy(iommu_iova_cache);
3441 static void __init iommu_exit_mempool(void)
3443 kmem_cache_destroy(iommu_devinfo_cache);
3444 kmem_cache_destroy(iommu_domain_cache);
3445 kmem_cache_destroy(iommu_iova_cache);
3449 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3451 struct dmar_drhd_unit *drhd;
3455 /* We know that this device on this chipset has its own IOMMU.
3456 * If we find it under a different IOMMU, then the BIOS is lying
3457 * to us. Hope that the IOMMU for this device is actually
3458 * disabled, and it needs no translation...
3460 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3462 /* "can't" happen */
3463 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3466 vtbar &= 0xffff0000;
3468 /* we know that this iommu should be at offset 0xa000 from vtbar */
3469 drhd = dmar_find_matched_drhd_unit(pdev);
3470 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3471 TAINT_FIRMWARE_WORKAROUND,
3472 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3473 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3475 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
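/*
 * Illustrative sketch, not part of the driver: the consistency test the
 * quirk above applies. The chipset's VTBAR register names the real
 * remapping unit; the DRHD entry the BIOS reported must sit exactly
 * 0xa000 above it, or the BIOS is lying.
 */
static int example_drhd_matches_vtbar(unsigned int vtbar,
				      unsigned long long drhd_reg_base)
{
	vtbar &= 0xffff0000;	/* keep only the base-address bits */
	return drhd_reg_base - vtbar == 0xa000;
}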
3477 static void __init init_no_remapping_devices(void)
3479 struct dmar_drhd_unit *drhd;
3483 for_each_drhd_unit(drhd) {
3484 if (!drhd->include_all) {
3485 for_each_active_dev_scope(drhd->devices,
3486 drhd->devices_cnt, i, dev)
3488 /* ignore DMAR unit if no devices exist */
3489 if (i == drhd->devices_cnt)
3494 for_each_active_drhd_unit(drhd) {
3495 if (drhd->include_all)
3498 for_each_active_dev_scope(drhd->devices,
3499 drhd->devices_cnt, i, dev)
3500 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3502 if (i < drhd->devices_cnt)
3505 /* This IOMMU has *only* gfx devices. Either bypass it or
3506 set the gfx_mapped flag, as appropriate */
3508 intel_iommu_gfx_mapped = 1;
3511 for_each_active_dev_scope(drhd->devices,
3512 drhd->devices_cnt, i, dev)
3513 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3518 #ifdef CONFIG_SUSPEND
3519 static int init_iommu_hw(void)
3521 struct dmar_drhd_unit *drhd;
3522 struct intel_iommu *iommu = NULL;
3524 for_each_active_iommu(iommu, drhd)
3526 dmar_reenable_qi(iommu);
3528 for_each_iommu(iommu, drhd) {
3529 if (drhd->ignored) {
3531 * we always have to disable PMRs or DMA may fail on this device
3535 iommu_disable_protect_mem_regions(iommu);
3539 iommu_flush_write_buffer(iommu);
3541 iommu_set_root_entry(iommu);
3543 iommu->flush.flush_context(iommu, 0, 0, 0,
3544 DMA_CCMD_GLOBAL_INVL);
3545 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3546 iommu_enable_translation(iommu);
3547 iommu_disable_protect_mem_regions(iommu);
3553 static void iommu_flush_all(void)
3555 struct dmar_drhd_unit *drhd;
3556 struct intel_iommu *iommu;
3558 for_each_active_iommu(iommu, drhd) {
3559 iommu->flush.flush_context(iommu, 0, 0, 0,
3560 DMA_CCMD_GLOBAL_INVL);
3561 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3562 DMA_TLB_GLOBAL_FLUSH);
3566 static int iommu_suspend(void)
3568 struct dmar_drhd_unit *drhd;
3569 struct intel_iommu *iommu = NULL;
3572 for_each_active_iommu(iommu, drhd) {
3573 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3575 if (!iommu->iommu_state)
3581 for_each_active_iommu(iommu, drhd) {
3582 iommu_disable_translation(iommu);
3584 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3586 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3587 readl(iommu->reg + DMAR_FECTL_REG);
3588 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3589 readl(iommu->reg + DMAR_FEDATA_REG);
3590 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3591 readl(iommu->reg + DMAR_FEADDR_REG);
3592 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3593 readl(iommu->reg + DMAR_FEUADDR_REG);
3595 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3600 for_each_active_iommu(iommu, drhd)
3601 kfree(iommu->iommu_state);
3606 static void iommu_resume(void)
3608 struct dmar_drhd_unit *drhd;
3609 struct intel_iommu *iommu = NULL;
3612 if (init_iommu_hw()) {
3614 panic("tboot: IOMMU setup failed, DMAR cannot resume!\n");
3616 WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3620 for_each_active_iommu(iommu, drhd) {
3622 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3624 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3625 iommu->reg + DMAR_FECTL_REG);
3626 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3627 iommu->reg + DMAR_FEDATA_REG);
3628 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3629 iommu->reg + DMAR_FEADDR_REG);
3630 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3631 iommu->reg + DMAR_FEUADDR_REG);
3633 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3636 for_each_active_iommu(iommu, drhd)
3637 kfree(iommu->iommu_state);
3640 static struct syscore_ops iommu_syscore_ops = {
3641 .resume = iommu_resume,
3642 .suspend = iommu_suspend,
3645 static void __init init_iommu_pm_ops(void)
3647 register_syscore_ops(&iommu_syscore_ops);
3651 static inline void init_iommu_pm_ops(void) {}
3652 #endif /* CONFIG_SUSPEND */
3655 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3657 struct acpi_dmar_reserved_memory *rmrr;
3658 struct dmar_rmrr_unit *rmrru;
3660 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3664 rmrru->hdr = header;
3665 rmrr = (struct acpi_dmar_reserved_memory *)header;
3666 rmrru->base_address = rmrr->base_address;
3667 rmrru->end_address = rmrr->end_address;
3668 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3669 ((void *)rmrr) + rmrr->header.length,
3670 &rmrru->devices_cnt);
3671 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3676 list_add(&rmrru->list, &dmar_rmrr_units);
3681 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3683 struct acpi_dmar_atsr *atsr;
3684 struct dmar_atsr_unit *atsru;
3686 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3687 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3692 atsru->include_all = atsr->flags & 0x1;
3693 if (!atsru->include_all) {
3694 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3695 (void *)atsr + atsr->header.length,
3696 &atsru->devices_cnt);
3697 if (atsru->devices_cnt && atsru->devices == NULL) {
3703 list_add_rcu(&atsru->list, &dmar_atsr_units);
3708 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3710 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3714 static void intel_iommu_free_dmars(void)
3716 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3717 struct dmar_atsr_unit *atsru, *atsr_n;
3719 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3720 list_del(&rmrru->list);
3721 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3725 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3726 list_del(&atsru->list);
3727 intel_iommu_free_atsr(atsru);
3731 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3734 struct pci_bus *bus;
3735 struct pci_dev *bridge = NULL;
3737 struct acpi_dmar_atsr *atsr;
3738 struct dmar_atsr_unit *atsru;
3740 dev = pci_physfn(dev);
3741 for (bus = dev->bus; bus; bus = bus->parent) {
3743 if (!bridge || !pci_is_pcie(bridge) ||
3744 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3746 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3753 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3754 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3755 if (atsr->segment != pci_domain_nr(dev->bus))
3758 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3759 if (tmp == &bridge->dev)
3762 if (atsru->include_all)
3772 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3775 struct dmar_rmrr_unit *rmrru;
3776 struct dmar_atsr_unit *atsru;
3777 struct acpi_dmar_atsr *atsr;
3778 struct acpi_dmar_reserved_memory *rmrr;
3780 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3783 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3784 rmrr = container_of(rmrru->hdr,
3785 struct acpi_dmar_reserved_memory, header);
3786 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3787 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3788 ((void *)rmrr) + rmrr->header.length,
3789 rmrr->segment, rmrru->devices,
3790 rmrru->devices_cnt);
3793 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3794 dmar_remove_dev_scope(info, rmrr->segment,
3795 rmrru->devices, rmrru->devices_cnt);
3799 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3800 if (atsru->include_all)
3803 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3804 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3805 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3806 (void *)atsr + atsr->header.length,
3807 atsr->segment, atsru->devices,
3808 atsru->devices_cnt);
3813 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3814 if (dmar_remove_dev_scope(info, atsr->segment,
3815 atsru->devices, atsru->devices_cnt))
3824 * Here we only respond to a device being unbound from its driver.
3826 * A newly added device is not attached to its DMAR domain here yet; that will
3827 * happen when the device is first mapped to an iova.
3829 static int device_notifier(struct notifier_block *nb,
3830 unsigned long action, void *data)
3832 struct device *dev = data;
3833 struct dmar_domain *domain;
3835 if (iommu_dummy(dev))
3838 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3839 action != BUS_NOTIFY_DEL_DEVICE)
3842 domain = find_domain(dev);
3846 down_read(&dmar_global_lock);
3847 domain_remove_one_dev_info(domain, dev);
3848 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
3849 domain_exit(domain);
3850 up_read(&dmar_global_lock);
3855 static struct notifier_block device_nb = {
3856 .notifier_call = device_notifier,
3859 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3860 unsigned long val, void *v)
3862 struct memory_notify *mhp = v;
3863 unsigned long long start, end;
3864 unsigned long start_vpfn, last_vpfn;
3867 case MEM_GOING_ONLINE:
3868 start = mhp->start_pfn << PAGE_SHIFT;
3869 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3870 if (iommu_domain_identity_map(si_domain, start, end)) {
3871 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3878 case MEM_CANCEL_ONLINE:
3879 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3880 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3881 while (start_vpfn <= last_vpfn) {
3883 struct dmar_drhd_unit *drhd;
3884 struct intel_iommu *iommu;
3885 struct page *freelist;
3887 iova = find_iova(&si_domain->iovad, start_vpfn);
3889 pr_debug("dmar: failed to get IOVA for PFN %lx\n",
3894 iova = split_and_remove_iova(&si_domain->iovad, iova,
3895 start_vpfn, last_vpfn);
3897 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3898 start_vpfn, last_vpfn);
3902 freelist = domain_unmap(si_domain, iova->pfn_lo,
3906 for_each_active_iommu(iommu, drhd)
3907 iommu_flush_iotlb_psi(iommu, si_domain->id,
3908 iova->pfn_lo, iova_size(iova),
3911 dma_free_pagelist(freelist);
3913 start_vpfn = iova->pfn_hi + 1;
3914 free_iova_mem(iova);
3922 static struct notifier_block intel_iommu_memory_nb = {
3923 .notifier_call = intel_iommu_memory_notifier,
3928 static ssize_t intel_iommu_show_version(struct device *dev,
3929 struct device_attribute *attr,
3932 struct intel_iommu *iommu = dev_get_drvdata(dev);
3933 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3934 return sprintf(buf, "%d:%d\n",
3935 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3937 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
3939 static ssize_t intel_iommu_show_address(struct device *dev,
3940 struct device_attribute *attr,
3943 struct intel_iommu *iommu = dev_get_drvdata(dev);
3944 return sprintf(buf, "%llx\n", iommu->reg_phys);
3946 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
3948 static ssize_t intel_iommu_show_cap(struct device *dev,
3949 struct device_attribute *attr,
3952 struct intel_iommu *iommu = dev_get_drvdata(dev);
3953 return sprintf(buf, "%llx\n", iommu->cap);
3955 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
3957 static ssize_t intel_iommu_show_ecap(struct device *dev,
3958 struct device_attribute *attr,
3961 struct intel_iommu *iommu = dev_get_drvdata(dev);
3962 return sprintf(buf, "%llx\n", iommu->ecap);
3964 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
3966 static struct attribute *intel_iommu_attrs[] = {
3967 &dev_attr_version.attr,
3968 &dev_attr_address.attr,
3970 &dev_attr_ecap.attr,
3974 static struct attribute_group intel_iommu_group = {
3975 .name = "intel-iommu",
3976 .attrs = intel_iommu_attrs,
3979 const struct attribute_group *intel_iommu_groups[] = {
3984 int __init intel_iommu_init(void)
3987 struct dmar_drhd_unit *drhd;
3988 struct intel_iommu *iommu;
3990 /* VT-d is required for a TXT/tboot launch, so enforce that */
3991 force_on = tboot_force_iommu();
3993 if (iommu_init_mempool()) {
3995 panic("tboot: Failed to initialize iommu memory\n");
3999 down_write(&dmar_global_lock);
4000 if (dmar_table_init()) {
4002 panic("tboot: Failed to initialize DMAR table\n");
4007 * Disable translation if already enabled prior to OS handover.
4009 for_each_active_iommu(iommu, drhd)
4010 if (iommu->gcmd & DMA_GCMD_TE)
4011 iommu_disable_translation(iommu);
4013 if (dmar_dev_scope_init() < 0) {
4015 panic("tboot: Failed to initialize DMAR device scope\n");
4019 if (no_iommu || dmar_disabled)
4022 if (list_empty(&dmar_rmrr_units))
4023 printk(KERN_INFO "DMAR: No RMRR found\n");
4025 if (list_empty(&dmar_atsr_units))
4026 printk(KERN_INFO "DMAR: No ATSR found\n");
4028 if (dmar_init_reserved_ranges()) {
4030 panic("tboot: Failed to reserve iommu ranges\n");
4031 goto out_free_reserved_range;
4034 init_no_remapping_devices();
4039 panic("tboot: Failed to initialize DMARs\n");
4040 printk(KERN_ERR "IOMMU: dmar init failed\n");
4041 goto out_free_reserved_range;
4043 up_write(&dmar_global_lock);
4045 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4047 init_timer(&unmap_timer);
4048 #ifdef CONFIG_SWIOTLB
4051 dma_ops = &intel_dma_ops;
4053 init_iommu_pm_ops();
4055 for_each_active_iommu(iommu, drhd)
4056 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4060 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4061 bus_register_notifier(&pci_bus_type, &device_nb);
4062 if (si_domain && !hw_pass_through)
4063 register_memory_notifier(&intel_iommu_memory_nb);
4065 intel_iommu_enabled = 1;
4069 out_free_reserved_range:
4070 put_iova_domain(&reserved_iova_list);
4072 intel_iommu_free_dmars();
4073 up_write(&dmar_global_lock);
4074 iommu_exit_mempool();
4078 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4080 struct intel_iommu *iommu = opaque;
4082 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4087 * NB - intel-iommu lacks any sort of reference counting for the users of
4088 * dependent devices. If multiple endpoints have intersecting dependent
4089 * devices, unbinding the driver from any one of them will possibly leave
4090 * the others unable to operate.
4092 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4095 if (!iommu || !dev || !dev_is_pci(dev))
4098 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4101 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4104 struct device_domain_info *info, *tmp;
4105 struct intel_iommu *iommu;
4106 unsigned long flags;
4110 iommu = device_to_iommu(dev, &bus, &devfn);
4114 spin_lock_irqsave(&device_domain_lock, flags);
4115 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4116 if (info->iommu == iommu && info->bus == bus &&
4117 info->devfn == devfn) {
4118 unlink_domain_info(info);
4119 spin_unlock_irqrestore(&device_domain_lock, flags);
4121 iommu_disable_dev_iotlb(info);
4122 iommu_detach_dev(iommu, info->bus, info->devfn);
4123 iommu_detach_dependent_devices(iommu, dev);
4124 free_devinfo_mem(info);
4126 spin_lock_irqsave(&device_domain_lock, flags);
4134 /* if there are no other devices under the same iommu
4135 * owned by this domain, clear this iommu in iommu_bmp and
4136 * update the iommu count and coherency
4138 if (info->iommu == iommu)
4142 spin_unlock_irqrestore(&device_domain_lock, flags);
4145 domain_detach_iommu(domain, iommu);
4146 if (!domain_type_is_vm_or_si(domain))
4147 iommu_detach_domain(domain, iommu);
4151 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4155 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4156 domain_reserve_special_ranges(domain);
4158 /* calculate AGAW */
4159 domain->gaw = guest_width;
4160 adjust_width = guestwidth_to_adjustwidth(guest_width);
4161 domain->agaw = width_to_agaw(adjust_width);
4163 domain->iommu_coherency = 0;
4164 domain->iommu_snooping = 0;
4165 domain->iommu_superpage = 0;
4166 domain->max_addr = 0;
4168 /* always allocate the top pgd */
4169 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4172 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4176 static int intel_iommu_domain_init(struct iommu_domain *domain)
4178 struct dmar_domain *dmar_domain;
4180 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4183 "intel_iommu_domain_init: dmar_domain == NULL\n");
4186 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4188 "intel_iommu_domain_init() failed\n");
4189 domain_exit(dmar_domain);
4192 domain_update_iommu_cap(dmar_domain);
4193 domain->priv = dmar_domain;
4195 domain->geometry.aperture_start = 0;
4196 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4197 domain->geometry.force_aperture = true;
4202 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4204 struct dmar_domain *dmar_domain = domain->priv;
4206 domain->priv = NULL;
4207 domain_exit(dmar_domain);
4210 static int intel_iommu_attach_device(struct iommu_domain *domain,
4213 struct dmar_domain *dmar_domain = domain->priv;
4214 struct intel_iommu *iommu;
4218 /* normally dev is not mapped */
4219 if (unlikely(domain_context_mapped(dev))) {
4220 struct dmar_domain *old_domain;
4222 old_domain = find_domain(dev);
4224 if (domain_type_is_vm_or_si(dmar_domain))
4225 domain_remove_one_dev_info(old_domain, dev);
4227 domain_remove_dev_info(old_domain);
4231 iommu = device_to_iommu(dev, &bus, &devfn);
4235 /* check if this iommu agaw is sufficient for max mapped address */
4236 addr_width = agaw_to_width(iommu->agaw);
4237 if (addr_width > cap_mgaw(iommu->cap))
4238 addr_width = cap_mgaw(iommu->cap);
4240 if (dmar_domain->max_addr > (1LL << addr_width)) {
4241 printk(KERN_ERR "%s: iommu width (%d) is not "
4242 "sufficient for the mapped address (%llx)\n",
4243 __func__, addr_width, dmar_domain->max_addr);
4246 dmar_domain->gaw = addr_width;
4249 * Knock out extra levels of page tables if necessary
4251 while (iommu->agaw < dmar_domain->agaw) {
4252 struct dma_pte *pte;
4254 pte = dmar_domain->pgd;
4255 if (dma_pte_present(pte)) {
4256 dmar_domain->pgd = (struct dma_pte *)
4257 phys_to_virt(dma_pte_addr(pte));
4258 free_pgtable_page(pte);
4260 dmar_domain->agaw--;
4263 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4266 static void intel_iommu_detach_device(struct iommu_domain *domain,
4269 struct dmar_domain *dmar_domain = domain->priv;
4271 domain_remove_one_dev_info(dmar_domain, dev);
4274 static int intel_iommu_map(struct iommu_domain *domain,
4275 unsigned long iova, phys_addr_t hpa,
4276 size_t size, int iommu_prot)
4278 struct dmar_domain *dmar_domain = domain->priv;
4283 if (iommu_prot & IOMMU_READ)
4284 prot |= DMA_PTE_READ;
4285 if (iommu_prot & IOMMU_WRITE)
4286 prot |= DMA_PTE_WRITE;
4287 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4288 prot |= DMA_PTE_SNP;
4290 max_addr = iova + size;
4291 if (dmar_domain->max_addr < max_addr) {
4294 /* check if minimum agaw is sufficient for mapped address */
4295 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4296 if (end < max_addr) {
4297 printk(KERN_ERR "%s: iommu width (%d) is not "
4298 "sufficient for the mapped address (%llx)\n",
4299 __func__, dmar_domain->gaw, max_addr);
4302 dmar_domain->max_addr = max_addr;
4304 /* Round up size to next multiple of PAGE_SIZE, if it and
4305 the low bits of hpa would take us onto the next page */
4306 size = aligned_nrpages(hpa, size);
4307 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4308 hpa >> VTD_PAGE_SHIFT, size, prot);
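/*
 * Illustrative sketch, not part of the driver: intel_iommu_map()'s
 * translation of IOMMU-core protection flags into VT-d PTE bits. The
 * flag and bit values below are hypothetical stand-ins, not the real
 * IOMMU_* / DMA_PTE_* definitions.
 */
#define EX_IOMMU_READ	(1 << 0)
#define EX_IOMMU_WRITE	(1 << 1)
#define EX_IOMMU_CACHE	(1 << 2)

static unsigned long long example_prot_to_pte(int prot, int hw_snooping)
{
	unsigned long long pte = 0;

	if (prot & EX_IOMMU_READ)
		pte |= 1ULL << 0;	/* read permission               */
	if (prot & EX_IOMMU_WRITE)
		pte |= 1ULL << 1;	/* write permission              */
	if ((prot & EX_IOMMU_CACHE) && hw_snooping)
		pte |= 1ULL << 11;	/* snoop bit, only if supported  */
	return pte;
}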
4312 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4313 unsigned long iova, size_t size)
4315 struct dmar_domain *dmar_domain = domain->priv;
4316 struct page *freelist = NULL;
4317 struct intel_iommu *iommu;
4318 unsigned long start_pfn, last_pfn;
4319 unsigned int npages;
4320 int iommu_id, num, ndomains, level = 0;
4322 /* Cope with horrid API which requires us to unmap more than the
4323 size argument if it happens to be a large-page mapping. */
4324 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4327 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4328 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4330 start_pfn = iova >> VTD_PAGE_SHIFT;
4331 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4333 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4335 npages = last_pfn - start_pfn + 1;
4337 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4338 iommu = g_iommus[iommu_id];
4341 * find bit position of dmar_domain
4343 ndomains = cap_ndoms(iommu->cap);
4344 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4345 if (iommu->domains[num] == dmar_domain)
4346 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4347 npages, !freelist, 0);
4352 dma_free_pagelist(freelist);
4354 if (dmar_domain->max_addr == iova + size)
4355 dmar_domain->max_addr = iova;
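/*
 * Illustrative sketch, not part of the driver: the large-page rounding
 * intel_iommu_unmap() performs above. With a 9-bit stride per level, a
 * PTE at level N covers 4KiB << (9 * (N - 1)), and the unmap must grow
 * to that whole extent.
 */
static unsigned long long example_unmap_size(unsigned long long size, int level)
{
	unsigned long long lp_size = 4096ULL << (9 * (level - 1));

	/* e.g. a 4KiB request inside a 2MiB mapping (level 2) unmaps 2MiB */
	return size < lp_size ? lp_size : size;
}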
4360 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4363 struct dmar_domain *dmar_domain = domain->priv;
4364 struct dma_pte *pte;
4368 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4370 phys = dma_pte_addr(pte);
4375 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4378 struct dmar_domain *dmar_domain = domain->priv;
4380 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4381 return dmar_domain->iommu_snooping;
4382 if (cap == IOMMU_CAP_INTR_REMAP)
4383 return irq_remapping_enabled;
4388 static int intel_iommu_add_device(struct device *dev)
4390 struct intel_iommu *iommu;
4391 struct iommu_group *group;
4394 iommu = device_to_iommu(dev, &bus, &devfn);
4398 iommu_device_link(iommu->iommu_dev, dev);
4400 group = iommu_group_get_for_dev(dev);
4403 return PTR_ERR(group);
4405 iommu_group_put(group);
4409 static void intel_iommu_remove_device(struct device *dev)
4411 struct intel_iommu *iommu;
4414 iommu = device_to_iommu(dev, &bus, &devfn);
4418 iommu_group_remove_device(dev);
4420 iommu_device_unlink(iommu->iommu_dev, dev);
4423 static const struct iommu_ops intel_iommu_ops = {
4424 .domain_init = intel_iommu_domain_init,
4425 .domain_destroy = intel_iommu_domain_destroy,
4426 .attach_dev = intel_iommu_attach_device,
4427 .detach_dev = intel_iommu_detach_device,
4428 .map = intel_iommu_map,
4429 .unmap = intel_iommu_unmap,
4430 .iova_to_phys = intel_iommu_iova_to_phys,
4431 .domain_has_cap = intel_iommu_domain_has_cap,
4432 .add_device = intel_iommu_add_device,
4433 .remove_device = intel_iommu_remove_device,
4434 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4437 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4439 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4440 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4452 static void quirk_iommu_rwbf(struct pci_dev *dev)
4455 * Mobile 4 Series Chipset neglects to set RWBF capability,
4456 * but needs it. Same seems to hold for the desktop versions.
4458 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4462 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4463 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4471 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4472 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4473 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4474 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4475 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4476 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4477 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4478 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4480 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4484 if (pci_read_config_word(dev, GGC, &ggc))
4487 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4488 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4490 } else if (dmar_map_gfx) {
4491 /* we have to ensure the gfx device is idle before we flush */
4492 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4493 intel_iommu_strict = 1;
4496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4497 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4498 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4499 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
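/*
 * Illustrative sketch, not part of the driver: decoding the GGC word
 * the Calpella quirk reads. Bits 11:8 carry the graphics control
 * state; the quirk only cares whether the VT-enabled bit is present.
 */
static int example_bios_set_up_shadow_gtt(unsigned short ggc)
{
	/* e.g. GGC_MEMORY_SIZE_2M (0x3 << 8) lacks the VT bit -> returns 0 */
	return (ggc & (0x8 << 8)) != 0;	/* GGC_MEMORY_VT_ENABLED */
}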
4501 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4502 ISOCH DMAR unit for the Azalia sound device, but not give it any
4503 TLB entries, which causes it to deadlock. Check for that. We do
4504 this in a function called from init_dmars(), instead of in a PCI
4505 quirk, because we don't want to print the obnoxious "BIOS broken"
4506 message if VT-d is actually disabled.
4508 static void __init check_tylersburg_isoch(void)
4510 struct pci_dev *pdev;
4511 uint32_t vtisochctrl;
4513 /* If there's no Azalia in the system anyway, forget it. */
4514 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4519 /* System Management Registers. Might be hidden, in which case
4520 we can't do the sanity check. But that's OK, because the
4521 known-broken BIOSes _don't_ actually hide it, so far. */
4522 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4526 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4533 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4534 if (vtisochctrl & 1)
4537 /* Drop all bits other than the number of TLB entries */
4538 vtisochctrl &= 0x1c;
4540 /* If we have the recommended number of TLB entries (16), fine. */
4541 if (vtisochctrl == 0x10)
4544 /* Zero TLB entries? You get to ride the short bus to school. */
4546 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4547 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4548 dmi_get_system_info(DMI_BIOS_VENDOR),
4549 dmi_get_system_info(DMI_BIOS_VERSION),
4550 dmi_get_system_info(DMI_PRODUCT_VERSION));
4551 iommu_identity_mapping |= IDENTMAP_AZALIA;
4555 printk(KERN_WARNING "DMAR: Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",