drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
50
51 #include "irq_remapping.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
85
86 /* page table handling */
87 #define LEVEL_STRIDE            (9)
88 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
89
90 /*
91  * This bitmap is used to advertise the page sizes our hardware supports
92  * to the IOMMU core, which will then use this information to split
93  * physically contiguous memory regions it is mapping into page sizes
94  * that we support.
95  *
96  * Traditionally the IOMMU core just handed us the mappings directly,
97  * after making sure the size is an order of a 4KiB page and that the
98  * mapping has natural alignment.
99  *
100  * To retain this behavior, we currently advertise that we support
101  * all page sizes that are an order of 4KiB.
102  *
103  * If at some point we'd like to utilize the IOMMU core's new behavior,
104  * we could change this to advertise the real page sizes we support.
105  */
106 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
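/* ~0xFFFUL leaves bits 12 and up set, i.e. every power-of-two size from 4KiB upward. */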
107
108 static inline int agaw_to_level(int agaw)
109 {
110         return agaw + 2;
111 }
112
113 static inline int agaw_to_width(int agaw)
114 {
115         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 }
117
118 static inline int width_to_agaw(int width)
119 {
120         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 }
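/*
 * Worked example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48,
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2, agaw_to_level(2) = 4
 * (a four-level page table) and agaw_to_width(2) = 48 bits again.
 */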
122
123 static inline unsigned int level_to_offset_bits(int level)
124 {
125         return (level - 1) * LEVEL_STRIDE;
126 }
127
128 static inline int pfn_level_offset(unsigned long pfn, int level)
129 {
130         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 }
132
133 static inline unsigned long level_mask(int level)
134 {
135         return -1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long level_size(int level)
139 {
140         return 1UL << level_to_offset_bits(level);
141 }
142
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
144 {
145         return (pfn + level_size(level) - 1) & level_mask(level);
146 }
147
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
149 {
150         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 }
152
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154    are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
156 {
157         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
161 {
162         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 }
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
165 {
166         return mm_to_dma_pfn(page_to_pfn(pg));
167 }
168 static inline unsigned long virt_to_dma_pfn(void *p)
169 {
170         return page_to_dma_pfn(virt_to_page(p));
171 }
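/*
 * With 4KiB MM pages PAGE_SHIFT == VTD_PAGE_SHIFT, so the two conversions
 * above are identities; with larger MM pages one mm_pfn corresponds to
 * several consecutive dma_pfns.
 */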
172
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
175
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
178
179 /*
180  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
181  * (used when kernel is launched w/ TXT)
182  */
183 static int force_on = 0;
184
185 /*
186  * 0: Present
187  * 1-11: Reserved
188  * 12-63: Context Ptr (12 - (haw-1))
189  * 64-127: Reserved
190  */
191 struct root_entry {
192         u64     lo;
193         u64     hi;
194 };
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
196
197 /*
198  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 {
203         if (!(re->lo & 1))
204                 return 0;
205
206         return re->lo & VTD_PAGE_MASK;
207 }
208
209 /*
210  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
211  * if marked present.
212  */
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 {
215         if (!(re->hi & 1))
216                 return 0;
217
218         return re->hi & VTD_PAGE_MASK;
219 }
220 /*
221  * low 64 bits:
222  * 0: present
223  * 1: fault processing disable
224  * 2-3: translation type
225  * 12-63: address space root
226  * high 64 bits:
227  * 0-2: address width
228  * 3-6: available
229  * 8-23: domain id
230  */
231 struct context_entry {
232         u64 lo;
233         u64 hi;
234 };
235
236 static inline void context_clear_pasid_enable(struct context_entry *context)
237 {
238         context->lo &= ~(1ULL << 11);
239 }
240
241 static inline bool context_pasid_enabled(struct context_entry *context)
242 {
243         return !!(context->lo & (1ULL << 11));
244 }
245
246 static inline void context_set_copied(struct context_entry *context)
247 {
248         context->hi |= (1ull << 3);
249 }
250
251 static inline bool context_copied(struct context_entry *context)
252 {
253         return !!(context->hi & (1ULL << 3));
254 }
255
256 static inline bool __context_present(struct context_entry *context)
257 {
258         return (context->lo & 1);
259 }
260
261 static inline bool context_present(struct context_entry *context)
262 {
263         return context_pasid_enabled(context) ?
264              __context_present(context) :
265              __context_present(context) && !context_copied(context);
266 }
267
268 static inline void context_set_present(struct context_entry *context)
269 {
270         context->lo |= 1;
271 }
272
273 static inline void context_set_fault_enable(struct context_entry *context)
274 {
275         context->lo &= (((u64)-1) << 2) | 1;
276 }
277
278 static inline void context_set_translation_type(struct context_entry *context,
279                                                 unsigned long value)
280 {
281         context->lo &= (((u64)-1) << 4) | 3;
282         context->lo |= (value & 3) << 2;
283 }
284
285 static inline void context_set_address_root(struct context_entry *context,
286                                             unsigned long value)
287 {
288         context->lo &= ~VTD_PAGE_MASK;
289         context->lo |= value & VTD_PAGE_MASK;
290 }
291
292 static inline void context_set_address_width(struct context_entry *context,
293                                              unsigned long value)
294 {
295         context->hi |= value & 7;
296 }
297
298 static inline void context_set_domain_id(struct context_entry *context,
299                                          unsigned long value)
300 {
301         context->hi |= (value & ((1 << 16) - 1)) << 8;
302 }
303
304 static inline int context_domain_id(struct context_entry *c)
305 {
306         return((c->hi >> 8) & 0xffff);
307 }
308
309 static inline void context_clear_entry(struct context_entry *context)
310 {
311         context->lo = 0;
312         context->hi = 0;
313 }
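/*
 * A present context entry is normally built from the helpers above,
 * roughly in this order (illustrative only):
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_width(context, domain->agaw);
 *	context_set_address_root(context, virt_to_phys(domain->pgd));
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */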
314
315 /*
316  * 0: readable
317  * 1: writable
318  * 2-6: reserved
319  * 7: super page
320  * 8-10: available
321  * 11: snoop behavior
322  * 12-63: Host physical address
323  */
324 struct dma_pte {
325         u64 val;
326 };
327
328 static inline void dma_clear_pte(struct dma_pte *pte)
329 {
330         pte->val = 0;
331 }
332
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
334 {
335 #ifdef CONFIG_64BIT
336         return pte->val & VTD_PAGE_MASK;
337 #else
338         /* Must have a full atomic 64-bit read */
339         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
340 #endif
341 }
342
343 static inline bool dma_pte_present(struct dma_pte *pte)
344 {
345         return (pte->val & 3) != 0;
346 }
347
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
349 {
350         return (pte->val & DMA_PTE_LARGE_PAGE);
351 }
352
353 static inline int first_pte_in_page(struct dma_pte *pte)
354 {
355         return !((unsigned long)pte & ~VTD_PAGE_MASK);
356 }
357
358 /*
359  * This domain is a static identity mapping domain.
360  *      1. This domain creates a static 1:1 mapping to all usable memory.
361  *      2. It maps to each iommu if successful.
362  *      3. Each iommu maps to this domain if successful.
363  */
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
366
367 /* domain represents a virtual machine; more than one device
368  * across iommus may be owned by one domain, e.g. kvm guest.
369  */
370 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
371
372 /* si_domain contains multiple devices */
373 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
374
375 struct dmar_domain {
376         int     id;                     /* domain id */
377         int     nid;                    /* node id */
378         DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
379                                         /* bitmap of iommus this domain uses */
380
381         struct list_head devices;       /* all devices' list */
382         struct iova_domain iovad;       /* iova's that belong to this domain */
383
384         struct dma_pte  *pgd;           /* virtual address */
385         int             gaw;            /* max guest address width */
386
387         /* adjusted guest address width, 0 is level 2 30-bit */
388         int             agaw;
389
390         int             flags;          /* flags to find out type of domain */
391
392         int             iommu_coherency;/* indicate coherency of iommu access */
393         int             iommu_snooping; /* indicate snooping control feature*/
394         int             iommu_count;    /* reference count of iommu */
395         int             iommu_superpage;/* Level of superpages supported:
396                                            0 == 4KiB (no superpages), 1 == 2MiB,
397                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
398         spinlock_t      iommu_lock;     /* protect iommu set in domain */
399         u64             max_addr;       /* maximum mapped address */
400
401         struct iommu_domain domain;     /* generic domain data structure for
402                                            iommu core */
403 };
404
405 /* PCI domain-device relationship */
406 struct device_domain_info {
407         struct list_head link;  /* link to domain siblings */
408         struct list_head global; /* link to global list */
409         u8 bus;                 /* PCI bus number */
410         u8 devfn;               /* PCI devfn number */
411         struct {
412                 u8 enabled:1;
413                 u8 qdep;
414         } ats;                  /* ATS state */
415         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
416         struct intel_iommu *iommu; /* IOMMU used by this device */
417         struct dmar_domain *domain; /* pointer to domain */
418 };
419
420 struct dmar_rmrr_unit {
421         struct list_head list;          /* list of rmrr units   */
422         struct acpi_dmar_header *hdr;   /* ACPI header          */
423         u64     base_address;           /* reserved base address*/
424         u64     end_address;            /* reserved end address */
425         struct dmar_dev_scope *devices; /* target devices */
426         int     devices_cnt;            /* target device count */
427 };
428
429 struct dmar_atsr_unit {
430         struct list_head list;          /* list of ATSR units */
431         struct acpi_dmar_header *hdr;   /* ACPI header */
432         struct dmar_dev_scope *devices; /* target devices */
433         int devices_cnt;                /* target device count */
434         u8 include_all:1;               /* include all ports */
435 };
436
437 static LIST_HEAD(dmar_atsr_units);
438 static LIST_HEAD(dmar_rmrr_units);
439
440 #define for_each_rmrr_units(rmrr) \
441         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
442
443 static void flush_unmaps_timeout(unsigned long data);
444
445 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
446
447 #define HIGH_WATER_MARK 250
448 struct deferred_flush_tables {
449         int next;
450         struct iova *iova[HIGH_WATER_MARK];
451         struct dmar_domain *domain[HIGH_WATER_MARK];
452         struct page *freelist[HIGH_WATER_MARK];
453 };
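/*
 * Deferred-flush bookkeeping: unmapped IOVAs are queued per IOMMU and
 * released from a timer (or once HIGH_WATER_MARK entries accumulate),
 * so the IOTLB is flushed once per batch rather than on every unmap.
 */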
454
455 static struct deferred_flush_tables *deferred_flush;
456
457 /* number of IOMMUs; used to size g_iommus and bound per-domain iommu bitmaps */
458 static int g_num_of_iommus;
459
460 static DEFINE_SPINLOCK(async_umap_flush_lock);
461 static LIST_HEAD(unmaps_to_do);
462
463 static int timer_on;
464 static long list_size;
465
466 static void domain_exit(struct dmar_domain *domain);
467 static void domain_remove_dev_info(struct dmar_domain *domain);
468 static void domain_remove_one_dev_info(struct dmar_domain *domain,
469                                        struct device *dev);
470 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
471                                            struct device *dev);
472 static int domain_detach_iommu(struct dmar_domain *domain,
473                                struct intel_iommu *iommu);
474
475 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
476 int dmar_disabled = 0;
477 #else
478 int dmar_disabled = 1;
479 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
480
481 int intel_iommu_enabled = 0;
482 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
483
484 static int dmar_map_gfx = 1;
485 static int dmar_forcedac;
486 static int intel_iommu_strict;
487 static int intel_iommu_superpage = 1;
488 static int intel_iommu_ecs = 1;
489
490 /* We only actually use ECS when PASID support (on the new bit 40)
491  * is also advertised. Some early implementations — the ones with
492  * PASID support on bit 28 — have issues even when we *only* use
493  * extended root/context tables. */
494 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
495                             ecap_pasid(iommu->ecap))
496
497 int intel_iommu_gfx_mapped;
498 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
499
500 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
501 static DEFINE_SPINLOCK(device_domain_lock);
502 static LIST_HEAD(device_domain_list);
503
504 static const struct iommu_ops intel_iommu_ops;
505
506 static bool translation_pre_enabled(struct intel_iommu *iommu)
507 {
508         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
509 }
510
511 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
512 {
513         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
514 }
515
516 static void init_translation_status(struct intel_iommu *iommu)
517 {
518         u32 gsts;
519
520         gsts = readl(iommu->reg + DMAR_GSTS_REG);
521         if (gsts & DMA_GSTS_TES)
522                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
523 }
524
525 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
526 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
527 {
528         return container_of(dom, struct dmar_domain, domain);
529 }
530
531 static int __init intel_iommu_setup(char *str)
532 {
533         if (!str)
534                 return -EINVAL;
535         while (*str) {
536                 if (!strncmp(str, "on", 2)) {
537                         dmar_disabled = 0;
538                         pr_info("IOMMU enabled\n");
539                 } else if (!strncmp(str, "off", 3)) {
540                         dmar_disabled = 1;
541                         pr_info("IOMMU disabled\n");
542                 } else if (!strncmp(str, "igfx_off", 8)) {
543                         dmar_map_gfx = 0;
544                         pr_info("Disable GFX device mapping\n");
545                 } else if (!strncmp(str, "forcedac", 8)) {
546                         pr_info("Forcing DAC for PCI devices\n");
547                         dmar_forcedac = 1;
548                 } else if (!strncmp(str, "strict", 6)) {
549                         pr_info("Disable batched IOTLB flush\n");
550                         intel_iommu_strict = 1;
551                 } else if (!strncmp(str, "sp_off", 6)) {
552                         pr_info("Disable supported super page\n");
553                         intel_iommu_superpage = 0;
554                 } else if (!strncmp(str, "ecs_off", 7)) {
555                         printk(KERN_INFO
556                                 "Intel-IOMMU: disable extended context table support\n");
557                         intel_iommu_ecs = 0;
558                 }
559
560                 str += strcspn(str, ",");
561                 while (*str == ',')
562                         str++;
563         }
564         return 0;
565 }
566 __setup("intel_iommu=", intel_iommu_setup);
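/* Example kernel command line usage: intel_iommu=on,strict,sp_off */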
567
568 static struct kmem_cache *iommu_domain_cache;
569 static struct kmem_cache *iommu_devinfo_cache;
570
571 static inline void *alloc_pgtable_page(int node)
572 {
573         struct page *page;
574         void *vaddr = NULL;
575
576         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
577         if (page)
578                 vaddr = page_address(page);
579         return vaddr;
580 }
581
582 static inline void free_pgtable_page(void *vaddr)
583 {
584         free_page((unsigned long)vaddr);
585 }
586
587 static inline void *alloc_domain_mem(void)
588 {
589         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
590 }
591
592 static void free_domain_mem(void *vaddr)
593 {
594         kmem_cache_free(iommu_domain_cache, vaddr);
595 }
596
597 static inline void *alloc_devinfo_mem(void)
598 {
599         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
600 }
601
602 static inline void free_devinfo_mem(void *vaddr)
603 {
604         kmem_cache_free(iommu_devinfo_cache, vaddr);
605 }
606
607 static inline int domain_type_is_vm(struct dmar_domain *domain)
608 {
609         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
610 }
611
612 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
613 {
614         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
615                                 DOMAIN_FLAG_STATIC_IDENTITY);
616 }
617
618 static inline int domain_pfn_supported(struct dmar_domain *domain,
619                                        unsigned long pfn)
620 {
621         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
622
623         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
624 }
625
626 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
627 {
628         unsigned long sagaw;
629         int agaw = -1;
630
631         sagaw = cap_sagaw(iommu->cap);
632         for (agaw = width_to_agaw(max_gaw);
633              agaw >= 0; agaw--) {
634                 if (test_bit(agaw, &sagaw))
635                         break;
636         }
637
638         return agaw;
639 }
640
641 /*
642  * Calculate max SAGAW for each iommu.
643  */
644 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
645 {
646         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
647 }
648
649 /*
650  * Calculate the agaw for each iommu.
651  * "SAGAW" may differ across iommus, so use a default agaw and fall back
652  * to a smaller supported agaw for iommus that don't support the default.
653  */
654 int iommu_calculate_agaw(struct intel_iommu *iommu)
655 {
656         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
657 }
658
659 /* This function only returns the single iommu in a domain */
660 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
661 {
662         int iommu_id;
663
664         /* si_domain and vm domain should not get here. */
665         BUG_ON(domain_type_is_vm_or_si(domain));
666         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
667         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
668                 return NULL;
669
670         return g_iommus[iommu_id];
671 }
672
673 static void domain_update_iommu_coherency(struct dmar_domain *domain)
674 {
675         struct dmar_drhd_unit *drhd;
676         struct intel_iommu *iommu;
677         bool found = false;
678         int i;
679
680         domain->iommu_coherency = 1;
681
682         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
683                 found = true;
684                 if (!ecap_coherent(g_iommus[i]->ecap)) {
685                         domain->iommu_coherency = 0;
686                         break;
687                 }
688         }
689         if (found)
690                 return;
691
692         /* No hardware attached; use lowest common denominator */
693         rcu_read_lock();
694         for_each_active_iommu(iommu, drhd) {
695                 if (!ecap_coherent(iommu->ecap)) {
696                         domain->iommu_coherency = 0;
697                         break;
698                 }
699         }
700         rcu_read_unlock();
701 }
702
703 static int domain_update_iommu_snooping(struct intel_iommu *skip)
704 {
705         struct dmar_drhd_unit *drhd;
706         struct intel_iommu *iommu;
707         int ret = 1;
708
709         rcu_read_lock();
710         for_each_active_iommu(iommu, drhd) {
711                 if (iommu != skip) {
712                         if (!ecap_sc_support(iommu->ecap)) {
713                                 ret = 0;
714                                 break;
715                         }
716                 }
717         }
718         rcu_read_unlock();
719
720         return ret;
721 }
722
723 static int domain_update_iommu_superpage(struct intel_iommu *skip)
724 {
725         struct dmar_drhd_unit *drhd;
726         struct intel_iommu *iommu;
727         int mask = 0xf;
728
729         if (!intel_iommu_superpage) {
730                 return 0;
731         }
732
733         /* set iommu_superpage to the smallest common denominator */
734         rcu_read_lock();
735         for_each_active_iommu(iommu, drhd) {
736                 if (iommu != skip) {
737                         mask &= cap_super_page_val(iommu->cap);
738                         if (!mask)
739                                 break;
740                 }
741         }
742         rcu_read_unlock();
743
744         return fls(mask);
745 }
746
747 /* Some capabilities may be different across iommus */
748 static void domain_update_iommu_cap(struct dmar_domain *domain)
749 {
750         domain_update_iommu_coherency(domain);
751         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
752         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
753 }
754
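/*
 * Return the context entry for (bus, devfn), allocating the context table
 * page on demand when @alloc is set.  With extended context support (ECS)
 * each bus has two context tables: root->lo covers devfn 0x00-0x7f and
 * root->hi covers devfn 0x80-0xff, and each extended entry occupies two
 * 128-bit slots, hence the devfn adjustment below.
 */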
755 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
756                                                        u8 bus, u8 devfn, int alloc)
757 {
758         struct root_entry *root = &iommu->root_entry[bus];
759         struct context_entry *context;
760         u64 *entry;
761
762         entry = &root->lo;
763         if (ecs_enabled(iommu)) {
764                 if (devfn >= 0x80) {
765                         devfn -= 0x80;
766                         entry = &root->hi;
767                 }
768                 devfn *= 2;
769         }
770         if (*entry & 1)
771                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
772         else {
773                 unsigned long phy_addr;
774                 if (!alloc)
775                         return NULL;
776
777                 context = alloc_pgtable_page(iommu->node);
778                 if (!context)
779                         return NULL;
780
781                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
782                 phy_addr = virt_to_phys((void *)context);
783                 *entry = phy_addr | 1;
784                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
785         }
786         return &context[devfn];
787 }
788
789 static int iommu_dummy(struct device *dev)
790 {
791         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
792 }
793
794 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
795 {
796         struct dmar_drhd_unit *drhd = NULL;
797         struct intel_iommu *iommu;
798         struct device *tmp;
799         struct pci_dev *ptmp, *pdev = NULL;
800         u16 segment = 0;
801         int i;
802
803         if (iommu_dummy(dev))
804                 return NULL;
805
806         if (dev_is_pci(dev)) {
807                 pdev = to_pci_dev(dev);
808                 segment = pci_domain_nr(pdev->bus);
809         } else if (has_acpi_companion(dev))
810                 dev = &ACPI_COMPANION(dev)->dev;
811
812         rcu_read_lock();
813         for_each_active_iommu(iommu, drhd) {
814                 if (pdev && segment != drhd->segment)
815                         continue;
816
817                 for_each_active_dev_scope(drhd->devices,
818                                           drhd->devices_cnt, i, tmp) {
819                         if (tmp == dev) {
820                                 *bus = drhd->devices[i].bus;
821                                 *devfn = drhd->devices[i].devfn;
822                                 goto out;
823                         }
824
825                         if (!pdev || !dev_is_pci(tmp))
826                                 continue;
827
828                         ptmp = to_pci_dev(tmp);
829                         if (ptmp->subordinate &&
830                             ptmp->subordinate->number <= pdev->bus->number &&
831                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
832                                 goto got_pdev;
833                 }
834
835                 if (pdev && drhd->include_all) {
836                 got_pdev:
837                         *bus = pdev->bus->number;
838                         *devfn = pdev->devfn;
839                         goto out;
840                 }
841         }
842         iommu = NULL;
843  out:
844         rcu_read_unlock();
845
846         return iommu;
847 }
848
849 static void domain_flush_cache(struct dmar_domain *domain,
850                                void *addr, int size)
851 {
852         if (!domain->iommu_coherency)
853                 clflush_cache_range(addr, size);
854 }
855
856 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
857 {
858         struct context_entry *context;
859         int ret = 0;
860         unsigned long flags;
861
862         spin_lock_irqsave(&iommu->lock, flags);
863         context = iommu_context_addr(iommu, bus, devfn, 0);
864         if (context)
865                 ret = context_present(context);
866         spin_unlock_irqrestore(&iommu->lock, flags);
867         return ret;
868 }
869
870 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
871 {
872         struct context_entry *context;
873         unsigned long flags;
874
875         spin_lock_irqsave(&iommu->lock, flags);
876         context = iommu_context_addr(iommu, bus, devfn, 0);
877         if (context) {
878                 context_clear_entry(context);
879                 __iommu_flush_cache(iommu, context, sizeof(*context));
880         }
881         spin_unlock_irqrestore(&iommu->lock, flags);
882 }
883
884 static void free_context_table(struct intel_iommu *iommu)
885 {
886         int i;
887         unsigned long flags;
888         struct context_entry *context;
889
890         spin_lock_irqsave(&iommu->lock, flags);
891         if (!iommu->root_entry) {
892                 goto out;
893         }
894         for (i = 0; i < ROOT_ENTRY_NR; i++) {
895                 context = iommu_context_addr(iommu, i, 0, 0);
896                 if (context)
897                         free_pgtable_page(context);
898
899                 if (!ecs_enabled(iommu))
900                         continue;
901
902                 context = iommu_context_addr(iommu, i, 0x80, 0);
903                 if (context)
904                         free_pgtable_page(context);
905
906         }
907         free_pgtable_page(iommu->root_entry);
908         iommu->root_entry = NULL;
909 out:
910         spin_unlock_irqrestore(&iommu->lock, flags);
911 }
912
913 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
914                                       unsigned long pfn, int *target_level)
915 {
916         struct dma_pte *parent, *pte = NULL;
917         int level = agaw_to_level(domain->agaw);
918         int offset;
919
920         BUG_ON(!domain->pgd);
921
922         if (!domain_pfn_supported(domain, pfn))
923                 /* Address beyond IOMMU's addressing capabilities. */
924                 return NULL;
925
926         parent = domain->pgd;
927
928         while (1) {
929                 void *tmp_page;
930
931                 offset = pfn_level_offset(pfn, level);
932                 pte = &parent[offset];
933                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
934                         break;
935                 if (level == *target_level)
936                         break;
937
938                 if (!dma_pte_present(pte)) {
939                         uint64_t pteval;
940
941                         tmp_page = alloc_pgtable_page(domain->nid);
942
943                         if (!tmp_page)
944                                 return NULL;
945
946                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
947                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
948                         if (cmpxchg64(&pte->val, 0ULL, pteval))
949                                 /* Someone else set it while we were thinking; use theirs. */
950                                 free_pgtable_page(tmp_page);
951                         else
952                                 domain_flush_cache(domain, pte, sizeof(*pte));
953                 }
954                 if (level == 1)
955                         break;
956
957                 parent = phys_to_virt(dma_pte_addr(pte));
958                 level--;
959         }
960
961         if (!*target_level)
962                 *target_level = level;
963
964         return pte;
965 }
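/*
 * Callers pass *target_level == 0 to walk down to the leaf (stopping early
 * at an existing superpage or a non-present entry), or a specific level to
 * get the PTE at that level, allocating intermediate tables on the way.
 * When called with 0, *target_level is updated to the level where the walk
 * actually stopped.
 */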
966
967
968 /* return address's pte at specific level */
969 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
970                                          unsigned long pfn,
971                                          int level, int *large_page)
972 {
973         struct dma_pte *parent, *pte = NULL;
974         int total = agaw_to_level(domain->agaw);
975         int offset;
976
977         parent = domain->pgd;
978         while (level <= total) {
979                 offset = pfn_level_offset(pfn, total);
980                 pte = &parent[offset];
981                 if (level == total)
982                         return pte;
983
984                 if (!dma_pte_present(pte)) {
985                         *large_page = total;
986                         break;
987                 }
988
989                 if (dma_pte_superpage(pte)) {
990                         *large_page = total;
991                         return pte;
992                 }
993
994                 parent = phys_to_virt(dma_pte_addr(pte));
995                 total--;
996         }
997         return NULL;
998 }
999
1000 /* clear last level pte; a tlb flush should follow */
1001 static void dma_pte_clear_range(struct dmar_domain *domain,
1002                                 unsigned long start_pfn,
1003                                 unsigned long last_pfn)
1004 {
1005         unsigned int large_page = 1;
1006         struct dma_pte *first_pte, *pte;
1007
1008         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1009         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1010         BUG_ON(start_pfn > last_pfn);
1011
1012         /* we don't need lock here; nobody else touches the iova range */
1013         do {
1014                 large_page = 1;
1015                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1016                 if (!pte) {
1017                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1018                         continue;
1019                 }
1020                 do {
1021                         dma_clear_pte(pte);
1022                         start_pfn += lvl_to_nr_pages(large_page);
1023                         pte++;
1024                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1025
1026                 domain_flush_cache(domain, first_pte,
1027                                    (void *)pte - (void *)first_pte);
1028
1029         } while (start_pfn && start_pfn <= last_pfn);
1030 }
1031
1032 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1033                                struct dma_pte *pte, unsigned long pfn,
1034                                unsigned long start_pfn, unsigned long last_pfn)
1035 {
1036         pfn = max(start_pfn, pfn);
1037         pte = &pte[pfn_level_offset(pfn, level)];
1038
1039         do {
1040                 unsigned long level_pfn;
1041                 struct dma_pte *level_pte;
1042
1043                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044                         goto next;
1045
1046                 level_pfn = pfn & level_mask(level - 1);
1047                 level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049                 if (level > 2)
1050                         dma_pte_free_level(domain, level - 1, level_pte,
1051                                            level_pfn, start_pfn, last_pfn);
1052
1053                 /* If range covers entire pagetable, free it */
1054                 if (!(start_pfn > level_pfn ||
1055                       last_pfn < level_pfn + level_size(level) - 1)) {
1056                         dma_clear_pte(pte);
1057                         domain_flush_cache(domain, pte, sizeof(*pte));
1058                         free_pgtable_page(level_pte);
1059                 }
1060 next:
1061                 pfn += level_size(level);
1062         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1063 }
1064
1065 /* free page table pages. last level pte should already be cleared */
1066 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1067                                    unsigned long start_pfn,
1068                                    unsigned long last_pfn)
1069 {
1070         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1071         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1072         BUG_ON(start_pfn > last_pfn);
1073
1074         dma_pte_clear_range(domain, start_pfn, last_pfn);
1075
1076         /* We don't need lock here; nobody else touches the iova range */
1077         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1078                            domain->pgd, 0, start_pfn, last_pfn);
1079
1080         /* free pgd */
1081         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1082                 free_pgtable_page(domain->pgd);
1083                 domain->pgd = NULL;
1084         }
1085 }
1086
1087 /* When a page at a given level is being unlinked from its parent, we don't
1088    need to *modify* it at all. All we need to do is make a list of all the
1089    pages which can be freed just as soon as we've flushed the IOTLB and we
1090    know the hardware page-walk will no longer touch them.
1091    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1092    be freed. */
1093 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1094                                             int level, struct dma_pte *pte,
1095                                             struct page *freelist)
1096 {
1097         struct page *pg;
1098
1099         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1100         pg->freelist = freelist;
1101         freelist = pg;
1102
1103         if (level == 1)
1104                 return freelist;
1105
1106         pte = page_address(pg);
1107         do {
1108                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1109                         freelist = dma_pte_list_pagetables(domain, level - 1,
1110                                                            pte, freelist);
1111                 pte++;
1112         } while (!first_pte_in_page(pte));
1113
1114         return freelist;
1115 }
1116
1117 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1118                                         struct dma_pte *pte, unsigned long pfn,
1119                                         unsigned long start_pfn,
1120                                         unsigned long last_pfn,
1121                                         struct page *freelist)
1122 {
1123         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125         pfn = max(start_pfn, pfn);
1126         pte = &pte[pfn_level_offset(pfn, level)];
1127
1128         do {
1129                 unsigned long level_pfn;
1130
1131                 if (!dma_pte_present(pte))
1132                         goto next;
1133
1134                 level_pfn = pfn & level_mask(level);
1135
1136                 /* If range covers entire pagetable, free it */
1137                 if (start_pfn <= level_pfn &&
1138                     last_pfn >= level_pfn + level_size(level) - 1) {
1139                         /* These subordinate page tables are going away entirely. Don't
1140                            bother to clear them; we're just going to *free* them. */
1141                         if (level > 1 && !dma_pte_superpage(pte))
1142                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1143
1144                         dma_clear_pte(pte);
1145                         if (!first_pte)
1146                                 first_pte = pte;
1147                         last_pte = pte;
1148                 } else if (level > 1) {
1149                         /* Recurse down into a level that isn't *entirely* obsolete */
1150                         freelist = dma_pte_clear_level(domain, level - 1,
1151                                                        phys_to_virt(dma_pte_addr(pte)),
1152                                                        level_pfn, start_pfn, last_pfn,
1153                                                        freelist);
1154                 }
1155 next:
1156                 pfn += level_size(level);
1157         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1158
1159         if (first_pte)
1160                 domain_flush_cache(domain, first_pte,
1161                                    (void *)++last_pte - (void *)first_pte);
1162
1163         return freelist;
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 struct page *domain_unmap(struct dmar_domain *domain,
1170                           unsigned long start_pfn,
1171                           unsigned long last_pfn)
1172 {
1173         struct page *freelist = NULL;
1174
1175         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1176         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1177         BUG_ON(start_pfn > last_pfn);
1178
1179         /* we don't need lock here; nobody else touches the iova range */
1180         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1181                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1182
1183         /* free pgd */
1184         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1185                 struct page *pgd_page = virt_to_page(domain->pgd);
1186                 pgd_page->freelist = freelist;
1187                 freelist = pgd_page;
1188
1189                 domain->pgd = NULL;
1190         }
1191
1192         return freelist;
1193 }
1194
1195 void dma_free_pagelist(struct page *freelist)
1196 {
1197         struct page *pg;
1198
1199         while ((pg = freelist)) {
1200                 freelist = pg->freelist;
1201                 free_pgtable_page(page_address(pg));
1202         }
1203 }
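/*
 * Typical unmap sequence (illustrative): collect the page-table pages with
 * domain_unmap(), flush the IOTLB so hardware can no longer walk them, and
 * only then release the list with dma_free_pagelist().
 */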
1204
1205 /* iommu handling */
1206 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1207 {
1208         struct root_entry *root;
1209         unsigned long flags;
1210
1211         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1212         if (!root) {
1213                 pr_err("Allocating root entry for %s failed\n",
1214                         iommu->name);
1215                 return -ENOMEM;
1216         }
1217
1218         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1219
1220         spin_lock_irqsave(&iommu->lock, flags);
1221         iommu->root_entry = root;
1222         spin_unlock_irqrestore(&iommu->lock, flags);
1223
1224         return 0;
1225 }
1226
1227 static void iommu_set_root_entry(struct intel_iommu *iommu)
1228 {
1229         u64 addr;
1230         u32 sts;
1231         unsigned long flag;
1232
1233         addr = virt_to_phys(iommu->root_entry);
1234         if (ecs_enabled(iommu))
1235                 addr |= DMA_RTADDR_RTT;
1236
1237         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1238         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1239
1240         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1241
1242         /* Make sure hardware completes it */
1243         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1244                       readl, (sts & DMA_GSTS_RTPS), sts);
1245
1246         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1247 }
1248
1249 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1250 {
1251         u32 val;
1252         unsigned long flag;
1253
1254         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1255                 return;
1256
1257         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1258         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1259
1260         /* Make sure hardware completes it */
1261         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1262                       readl, (!(val & DMA_GSTS_WBFS)), val);
1263
1264         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1265 }
1266
1267 /* Invalidate context-cache entries at the requested granularity (global, domain or device) */
1268 static void __iommu_flush_context(struct intel_iommu *iommu,
1269                                   u16 did, u16 source_id, u8 function_mask,
1270                                   u64 type)
1271 {
1272         u64 val = 0;
1273         unsigned long flag;
1274
1275         switch (type) {
1276         case DMA_CCMD_GLOBAL_INVL:
1277                 val = DMA_CCMD_GLOBAL_INVL;
1278                 break;
1279         case DMA_CCMD_DOMAIN_INVL:
1280                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1281                 break;
1282         case DMA_CCMD_DEVICE_INVL:
1283                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1284                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1285                 break;
1286         default:
1287                 BUG();
1288         }
1289         val |= DMA_CCMD_ICC;
1290
1291         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1292         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1293
1294         /* Make sure hardware completes it */
1295         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1296                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1297
1298         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1299 }
1300
1301 /* Invalidate IOTLB entries at the requested granularity (global, domain- or page-selective) */
1302 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1303                                 u64 addr, unsigned int size_order, u64 type)
1304 {
1305         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1306         u64 val = 0, val_iva = 0;
1307         unsigned long flag;
1308
1309         switch (type) {
1310         case DMA_TLB_GLOBAL_FLUSH:
1311                 /* global flush doesn't need to set IVA_REG */
1312                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1313                 break;
1314         case DMA_TLB_DSI_FLUSH:
1315                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1316                 break;
1317         case DMA_TLB_PSI_FLUSH:
1318                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1319                 /* IH bit is passed in as part of address */
1320                 val_iva = size_order | addr;
1321                 break;
1322         default:
1323                 BUG();
1324         }
1325         /* Note: set drain read/write */
1326 #if 0
1327         /*
1328          * This is probably meant to be extra safe. It looks like we can
1329          * ignore it without any impact.
1330          */
1331         if (cap_read_drain(iommu->cap))
1332                 val |= DMA_TLB_READ_DRAIN;
1333 #endif
1334         if (cap_write_drain(iommu->cap))
1335                 val |= DMA_TLB_WRITE_DRAIN;
1336
1337         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1338         /* Note: Only uses first TLB reg currently */
1339         if (val_iva)
1340                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1341         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1342
1343         /* Make sure hardware completes it */
1344         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1345                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1346
1347         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348
1349         /* check IOTLB invalidation granularity */
1350         if (DMA_TLB_IAIG(val) == 0)
1351                 pr_err("Flush IOTLB failed\n");
1352         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1353                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1354                         (unsigned long long)DMA_TLB_IIRG(type),
1355                         (unsigned long long)DMA_TLB_IAIG(val));
1356 }
1357
1358 static struct device_domain_info *
1359 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1360                          u8 bus, u8 devfn)
1361 {
1362         bool found = false;
1363         unsigned long flags;
1364         struct device_domain_info *info;
1365         struct pci_dev *pdev;
1366
1367         if (!ecap_dev_iotlb_support(iommu->ecap))
1368                 return NULL;
1369
1370         if (!iommu->qi)
1371                 return NULL;
1372
1373         spin_lock_irqsave(&device_domain_lock, flags);
1374         list_for_each_entry(info, &domain->devices, link)
1375                 if (info->iommu == iommu && info->bus == bus &&
1376                     info->devfn == devfn) {
1377                         found = true;
1378                         break;
1379                 }
1380         spin_unlock_irqrestore(&device_domain_lock, flags);
1381
1382         if (!found || !info->dev || !dev_is_pci(info->dev))
1383                 return NULL;
1384
1385         pdev = to_pci_dev(info->dev);
1386
1387         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1388                 return NULL;
1389
1390         if (!dmar_find_matched_atsr_unit(pdev))
1391                 return NULL;
1392
1393         return info;
1394 }
1395
1396 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1397 {
1398         struct pci_dev *pdev;
1399
1400         if (!info || !dev_is_pci(info->dev))
1401                 return;
1402
1403         pdev = to_pci_dev(info->dev);
1404         if (pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1405                 return;
1406
1407         info->ats.enabled = 1;
1408         info->ats.qdep = pci_ats_queue_depth(pdev);
1409 }
1410
1411 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1412 {
1413         if (!info->ats.enabled)
1414                 return;
1415
1416         pci_disable_ats(to_pci_dev(info->dev));
1417         info->ats.enabled = 0;
1418 }
1419
1420 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1421                                   u64 addr, unsigned mask)
1422 {
1423         u16 sid, qdep;
1424         unsigned long flags;
1425         struct device_domain_info *info;
1426
1427         spin_lock_irqsave(&device_domain_lock, flags);
1428         list_for_each_entry(info, &domain->devices, link) {
1429                 if (!info->ats.enabled)
1430                         continue;
1431
1432                 sid = info->bus << 8 | info->devfn;
1433                 qdep = info->ats.qdep;
1434                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1435         }
1436         spin_unlock_irqrestore(&device_domain_lock, flags);
1437 }
1438
1439 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1440                                   unsigned long pfn, unsigned int pages, int ih, int map)
1441 {
1442         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1443         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1444
1445         BUG_ON(pages == 0);
1446
1447         if (ih)
1448                 ih = 1 << 6;
1449         /*
1450          * Fallback to domain selective flush if no PSI support or the size is
1451          * too big.
1452          * PSI requires page size to be 2 ^ x, and the base address is naturally
1453          * aligned to the size
1454          */
1455         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1456                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1457                                                 DMA_TLB_DSI_FLUSH);
1458         else
1459                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1460                                                 DMA_TLB_PSI_FLUSH);
1461
1462         /*
1463          * In caching mode, changes of pages from non-present to present require
1464          * flush. However, device IOTLB doesn't need to be flushed in this case.
1465          */
1466         if (!cap_caching_mode(iommu->cap) || !map)
1467                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1468 }
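/*
 * Example: pages == 300 rounds up to 512, so mask == 9 and a 512-page
 * (2MiB) naturally aligned region around the address is invalidated.
 */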
1469
1470 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1471 {
1472         u32 pmen;
1473         unsigned long flags;
1474
1475         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1476         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1477         pmen &= ~DMA_PMEN_EPM;
1478         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1479
1480         /* wait for the protected region status bit to clear */
1481         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1482                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1483
1484         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1485 }
1486
1487 static void iommu_enable_translation(struct intel_iommu *iommu)
1488 {
1489         u32 sts;
1490         unsigned long flags;
1491
1492         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1493         iommu->gcmd |= DMA_GCMD_TE;
1494         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1495
1496         /* Make sure hardware completes it */
1497         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1498                       readl, (sts & DMA_GSTS_TES), sts);
1499
1500         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1501 }
1502
1503 static void iommu_disable_translation(struct intel_iommu *iommu)
1504 {
1505         u32 sts;
1506         unsigned long flag;
1507
1508         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1509         iommu->gcmd &= ~DMA_GCMD_TE;
1510         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1511
1512         /* Make sure hardware completes it */
1513         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1514                       readl, (!(sts & DMA_GSTS_TES)), sts);
1515
1516         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1517 }
1518
1519
1520 static int iommu_init_domains(struct intel_iommu *iommu)
1521 {
1522         unsigned long ndomains;
1523         unsigned long nlongs;
1524
1525         ndomains = cap_ndoms(iommu->cap);
1526         pr_debug("%s: Number of Domains supported <%ld>\n",
1527                  iommu->name, ndomains);
1528         nlongs = BITS_TO_LONGS(ndomains);
1529
1530         spin_lock_init(&iommu->lock);
1531
1532         /* TBD: there might be 64K domains,
1533          * consider another allocation scheme for future chips
1534          */
1535         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1536         if (!iommu->domain_ids) {
1537                 pr_err("%s: Allocating domain id array failed\n",
1538                        iommu->name);
1539                 return -ENOMEM;
1540         }
1541         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1542                         GFP_KERNEL);
1543         if (!iommu->domains) {
1544                 pr_err("%s: Allocating domain array failed\n",
1545                        iommu->name);
1546                 kfree(iommu->domain_ids);
1547                 iommu->domain_ids = NULL;
1548                 return -ENOMEM;
1549         }
1550
1551         /*
1552          * if Caching mode is set, then invalid translations are tagged
1553          * with domainid 0. Hence we need to pre-allocate it.
1554          */
1555         if (cap_caching_mode(iommu->cap))
1556                 set_bit(0, iommu->domain_ids);
1557         return 0;
1558 }
1559
1560 static void disable_dmar_iommu(struct intel_iommu *iommu)
1561 {
1562         struct dmar_domain *domain;
1563         int i;
1564
1565         if ((iommu->domains) && (iommu->domain_ids)) {
1566                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1567                         /*
1568                          * Domain id 0 is reserved for invalid translation
1569                          * if hardware supports caching mode.
1570                          */
1571                         if (cap_caching_mode(iommu->cap) && i == 0)
1572                                 continue;
1573
1574                         domain = iommu->domains[i];
1575                         clear_bit(i, iommu->domain_ids);
1576                         if (domain_detach_iommu(domain, iommu) == 0 &&
1577                             !domain_type_is_vm(domain))
1578                                 domain_exit(domain);
1579                 }
1580         }
1581
1582         if (iommu->gcmd & DMA_GCMD_TE)
1583                 iommu_disable_translation(iommu);
1584 }
1585
1586 static void free_dmar_iommu(struct intel_iommu *iommu)
1587 {
1588         if ((iommu->domains) && (iommu->domain_ids)) {
1589                 kfree(iommu->domains);
1590                 kfree(iommu->domain_ids);
1591                 iommu->domains = NULL;
1592                 iommu->domain_ids = NULL;
1593         }
1594
1595         g_iommus[iommu->seq_id] = NULL;
1596
1597         /* free context mapping */
1598         free_context_table(iommu);
1599 }
1600
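     /*
      * Allocate and minimally initialise a dmar_domain.  Virtual-machine
      * domains get their id from a private counter here; other domains
      * receive an id when they are attached to an IOMMU.
      */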
1601 static struct dmar_domain *alloc_domain(int flags)
1602 {
1603         /* domain id for virtual machines; it won't be set in the context */
1604         static atomic_t vm_domid = ATOMIC_INIT(0);
1605         struct dmar_domain *domain;
1606
1607         domain = alloc_domain_mem();
1608         if (!domain)
1609                 return NULL;
1610
1611         memset(domain, 0, sizeof(*domain));
1612         domain->nid = -1;
1613         domain->flags = flags;
1614         spin_lock_init(&domain->iommu_lock);
1615         INIT_LIST_HEAD(&domain->devices);
1616         if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1617                 domain->id = atomic_inc_return(&vm_domid);
1618
1619         return domain;
1620 }
1621
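     /*
      * Find a free domain id on this IOMMU and install the domain there.
      * Returns the allocated id or -ENOSPC if none is free.  The caller
      * must hold iommu->lock.
      */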
1622 static int __iommu_attach_domain(struct dmar_domain *domain,
1623                                  struct intel_iommu *iommu)
1624 {
1625         int num;
1626         unsigned long ndomains;
1627
1628         ndomains = cap_ndoms(iommu->cap);
1629         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1630         if (num < ndomains) {
1631                 set_bit(num, iommu->domain_ids);
1632                 iommu->domains[num] = domain;
1633         } else {
1634                 num = -ENOSPC;
1635         }
1636
1637         return num;
1638 }
1639
1640 static int iommu_attach_domain(struct dmar_domain *domain,
1641                                struct intel_iommu *iommu)
1642 {
1643         int num;
1644         unsigned long flags;
1645
1646         spin_lock_irqsave(&iommu->lock, flags);
1647         num = __iommu_attach_domain(domain, iommu);
1648         spin_unlock_irqrestore(&iommu->lock, flags);
1649         if (num < 0)
1650                 pr_err("%s: No free domain ids\n", iommu->name);
1651
1652         return num;
1653 }
1654
1655 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1656                                   struct intel_iommu *iommu)
1657 {
1658         int num;
1659         unsigned long ndomains;
1660
1661         ndomains = cap_ndoms(iommu->cap);
1662         for_each_set_bit(num, iommu->domain_ids, ndomains)
1663                 if (iommu->domains[num] == domain)
1664                         return num;
1665
1666         return __iommu_attach_domain(domain, iommu);
1667 }
1668
1669 static void iommu_detach_domain(struct dmar_domain *domain,
1670                                 struct intel_iommu *iommu)
1671 {
1672         unsigned long flags;
1673         int num, ndomains;
1674
1675         spin_lock_irqsave(&iommu->lock, flags);
1676         if (domain_type_is_vm_or_si(domain)) {
1677                 ndomains = cap_ndoms(iommu->cap);
1678                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1679                         if (iommu->domains[num] == domain) {
1680                                 clear_bit(num, iommu->domain_ids);
1681                                 iommu->domains[num] = NULL;
1682                                 break;
1683                         }
1684                 }
1685         } else {
1686                 clear_bit(domain->id, iommu->domain_ids);
1687                 iommu->domains[domain->id] = NULL;
1688         }
1689         spin_unlock_irqrestore(&iommu->lock, flags);
1690 }
1691
1692 static void domain_attach_iommu(struct dmar_domain *domain,
1693                                struct intel_iommu *iommu)
1694 {
1695         unsigned long flags;
1696
1697         spin_lock_irqsave(&domain->iommu_lock, flags);
1698         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1699                 domain->iommu_count++;
1700                 if (domain->iommu_count == 1)
1701                         domain->nid = iommu->node;
1702                 domain_update_iommu_cap(domain);
1703         }
1704         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1705 }
1706
1707 static int domain_detach_iommu(struct dmar_domain *domain,
1708                                struct intel_iommu *iommu)
1709 {
1710         unsigned long flags;
1711         int count = INT_MAX;
1712
1713         spin_lock_irqsave(&domain->iommu_lock, flags);
1714         if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1715                 count = --domain->iommu_count;
1716                 domain_update_iommu_cap(domain);
1717         }
1718         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1719
1720         return count;
1721 }
1722
1723 static struct iova_domain reserved_iova_list;
1724 static struct lock_class_key reserved_rbtree_key;
1725
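     /*
      * Build the global list of IOVA ranges that must never be allocated:
      * the IOAPIC MMIO window and every PCI MMIO resource in the system.
      */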
1726 static int dmar_init_reserved_ranges(void)
1727 {
1728         struct pci_dev *pdev = NULL;
1729         struct iova *iova;
1730         int i;
1731
1732         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1733                         DMA_32BIT_PFN);
1734
1735         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1736                 &reserved_rbtree_key);
1737
1738         /* IOAPIC ranges shouldn't be accessed by DMA */
1739         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1740                 IOVA_PFN(IOAPIC_RANGE_END));
1741         if (!iova) {
1742                 pr_err("Reserve IOAPIC range failed\n");
1743                 return -ENODEV;
1744         }
1745
1746         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1747         for_each_pci_dev(pdev) {
1748                 struct resource *r;
1749
1750                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1751                         r = &pdev->resource[i];
1752                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1753                                 continue;
1754                         iova = reserve_iova(&reserved_iova_list,
1755                                             IOVA_PFN(r->start),
1756                                             IOVA_PFN(r->end));
1757                         if (!iova) {
1758                                 pr_err("Reserve iova failed\n");
1759                                 return -ENODEV;
1760                         }
1761                 }
1762         }
1763         return 0;
1764 }
1765
1766 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1767 {
1768         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1769 }
1770
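     /*
      * Round the guest address width up so that (gaw - 12) is a multiple
      * of the 9-bit page-table stride, capped at 64 bits.  For example,
      * a guest width of 40 is adjusted to 48, while 39 and 48 stay as-is.
      */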
1771 static inline int guestwidth_to_adjustwidth(int gaw)
1772 {
1773         int agaw;
1774         int r = (gaw - 12) % 9;
1775
1776         if (r == 0)
1777                 agaw = gaw;
1778         else
1779                 agaw = gaw + 9 - r;
1780         if (agaw > 64)
1781                 agaw = 64;
1782         return agaw;
1783 }
1784
1785 static int domain_init(struct dmar_domain *domain, int guest_width)
1786 {
1787         struct intel_iommu *iommu;
1788         int adjust_width, agaw;
1789         unsigned long sagaw;
1790
1791         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1792                         DMA_32BIT_PFN);
1793         domain_reserve_special_ranges(domain);
1794
1795         /* calculate AGAW */
1796         iommu = domain_get_iommu(domain);
1797         if (guest_width > cap_mgaw(iommu->cap))
1798                 guest_width = cap_mgaw(iommu->cap);
1799         domain->gaw = guest_width;
1800         adjust_width = guestwidth_to_adjustwidth(guest_width);
1801         agaw = width_to_agaw(adjust_width);
1802         sagaw = cap_sagaw(iommu->cap);
1803         if (!test_bit(agaw, &sagaw)) {
1804                 /* hardware doesn't support it, choose a bigger one */
1805                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1806                 agaw = find_next_bit(&sagaw, 5, agaw);
1807                 if (agaw >= 5)
1808                         return -ENODEV;
1809         }
1810         domain->agaw = agaw;
1811
1812         if (ecap_coherent(iommu->ecap))
1813                 domain->iommu_coherency = 1;
1814         else
1815                 domain->iommu_coherency = 0;
1816
1817         if (ecap_sc_support(iommu->ecap))
1818                 domain->iommu_snooping = 1;
1819         else
1820                 domain->iommu_snooping = 0;
1821
1822         if (intel_iommu_superpage)
1823                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1824         else
1825                 domain->iommu_superpage = 0;
1826
1827         domain->nid = iommu->node;
1828
1829         /* always allocate the top pgd */
1830         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1831         if (!domain->pgd)
1832                 return -ENOMEM;
1833         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1834         return 0;
1835 }
1836
1837 static void domain_exit(struct dmar_domain *domain)
1838 {
1839         struct dmar_drhd_unit *drhd;
1840         struct intel_iommu *iommu;
1841         struct page *freelist = NULL;
1842
1843         /* Domain 0 is reserved, so don't process it */
1844         if (!domain)
1845                 return;
1846
1847         /* Flush any lazy unmaps that may reference this domain */
1848         if (!intel_iommu_strict)
1849                 flush_unmaps_timeout(0);
1850
1851         /* remove associated devices */
1852         domain_remove_dev_info(domain);
1853
1854         /* destroy iovas */
1855         put_iova_domain(&domain->iovad);
1856
1857         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1858
1859         /* clear attached or cached domains */
1860         rcu_read_lock();
1861         for_each_active_iommu(iommu, drhd)
1862                 if (domain_type_is_vm(domain) ||
1863                     test_bit(iommu->seq_id, domain->iommu_bmp))
1864                         iommu_detach_domain(domain, iommu);
1865         rcu_read_unlock();
1866
1867         dma_free_pagelist(freelist);
1868
1869         free_domain_mem(domain);
1870 }
1871
1872 static int domain_context_mapping_one(struct dmar_domain *domain,
1873                                       struct intel_iommu *iommu,
1874                                       u8 bus, u8 devfn, int translation)
1875 {
1876         struct context_entry *context;
1877         unsigned long flags;
1878         struct dma_pte *pgd;
1879         int id;
1880         int agaw;
1881         struct device_domain_info *info = NULL;
1882
1883         pr_debug("Set context mapping for %02x:%02x.%d\n",
1884                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1885
1886         BUG_ON(!domain->pgd);
1887         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1888                translation != CONTEXT_TT_MULTI_LEVEL);
1889
1890         spin_lock_irqsave(&iommu->lock, flags);
1891         context = iommu_context_addr(iommu, bus, devfn, 1);
1892         spin_unlock_irqrestore(&iommu->lock, flags);
1893         if (!context)
1894                 return -ENOMEM;
1895         spin_lock_irqsave(&iommu->lock, flags);
1896         if (context_present(context)) {
1897                 spin_unlock_irqrestore(&iommu->lock, flags);
1898                 return 0;
1899         }
1900
1901         context_clear_entry(context);
1902
1903         id = domain->id;
1904         pgd = domain->pgd;
1905
1906         if (domain_type_is_vm_or_si(domain)) {
1907                 if (domain_type_is_vm(domain)) {
1908                         id = iommu_attach_vm_domain(domain, iommu);
1909                         if (id < 0) {
1910                                 spin_unlock_irqrestore(&iommu->lock, flags);
1911                                 pr_err("%s: No free domain ids\n", iommu->name);
1912                                 return -EFAULT;
1913                         }
1914                 }
1915
1916                 /* Skip the top levels of the page tables for an
1917                  * iommu which has a smaller agaw than the default.
1918                  * This is unnecessary for pass-through (PT) mode.
1919                  */
1920                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1921                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1922                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1923                                 if (!dma_pte_present(pgd)) {
1924                                         spin_unlock_irqrestore(&iommu->lock, flags);
1925                                         return -ENOMEM;
1926                                 }
1927                         }
1928                 }
1929         }
1930
1931         context_set_domain_id(context, id);
1932
1933         if (translation != CONTEXT_TT_PASS_THROUGH) {
1934                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1935                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1936                                      CONTEXT_TT_MULTI_LEVEL;
1937         }
1938         /*
1939          * In pass through mode, AW must be programmed to indicate the largest
1940          * AGAW value supported by hardware, and ASR is ignored by hardware.
1941          */
1942         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1943                 context_set_address_width(context, iommu->msagaw);
1944         else {
1945                 context_set_address_root(context, virt_to_phys(pgd));
1946                 context_set_address_width(context, iommu->agaw);
1947         }
1948
1949         context_set_translation_type(context, translation);
1950         context_set_fault_enable(context);
1951         context_set_present(context);
1952         domain_flush_cache(domain, context, sizeof(*context));
1953
1954         /*
1955          * It's a non-present to present mapping. If hardware doesn't cache
1956          * non-present entries we only need to flush the write-buffer. If it
1957          * _does_ cache non-present entries, then it does so in the special
1958          * domain #0, which we have to flush:
1959          */
1960         if (cap_caching_mode(iommu->cap)) {
1961                 iommu->flush.flush_context(iommu, 0,
1962                                            (((u16)bus) << 8) | devfn,
1963                                            DMA_CCMD_MASK_NOBIT,
1964                                            DMA_CCMD_DEVICE_INVL);
1965                 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1966         } else {
1967                 iommu_flush_write_buffer(iommu);
1968         }
1969         iommu_enable_dev_iotlb(info);
1970         spin_unlock_irqrestore(&iommu->lock, flags);
1971
1972         domain_attach_iommu(domain, iommu);
1973
1974         return 0;
1975 }
1976
1977 struct domain_context_mapping_data {
1978         struct dmar_domain *domain;
1979         struct intel_iommu *iommu;
1980         int translation;
1981 };
1982
1983 static int domain_context_mapping_cb(struct pci_dev *pdev,
1984                                      u16 alias, void *opaque)
1985 {
1986         struct domain_context_mapping_data *data = opaque;
1987
1988         return domain_context_mapping_one(data->domain, data->iommu,
1989                                           PCI_BUS_NUM(alias), alias & 0xff,
1990                                           data->translation);
1991 }
1992
1993 static int
1994 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1995                        int translation)
1996 {
1997         struct intel_iommu *iommu;
1998         u8 bus, devfn;
1999         struct domain_context_mapping_data data;
2000
2001         iommu = device_to_iommu(dev, &bus, &devfn);
2002         if (!iommu)
2003                 return -ENODEV;
2004
2005         if (!dev_is_pci(dev))
2006                 return domain_context_mapping_one(domain, iommu, bus, devfn,
2007                                                   translation);
2008
2009         data.domain = domain;
2010         data.iommu = iommu;
2011         data.translation = translation;
2012
2013         return pci_for_each_dma_alias(to_pci_dev(dev),
2014                                       &domain_context_mapping_cb, &data);
2015 }
2016
2017 static int domain_context_mapped_cb(struct pci_dev *pdev,
2018                                     u16 alias, void *opaque)
2019 {
2020         struct intel_iommu *iommu = opaque;
2021
2022         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2023 }
2024
2025 static int domain_context_mapped(struct device *dev)
2026 {
2027         struct intel_iommu *iommu;
2028         u8 bus, devfn;
2029
2030         iommu = device_to_iommu(dev, &bus, &devfn);
2031         if (!iommu)
2032                 return -ENODEV;
2033
2034         if (!dev_is_pci(dev))
2035                 return device_context_mapped(iommu, bus, devfn);
2036
2037         return !pci_for_each_dma_alias(to_pci_dev(dev),
2038                                        domain_context_mapped_cb, iommu);
2039 }
2040
2041 /* Returns the number of VT-d pages, but aligned to the MM page size */
2042 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2043                                             size_t size)
2044 {
2045         host_addr &= ~PAGE_MASK;
2046         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2047 }
2048
2049 /* Return largest possible superpage level for a given mapping */
2050 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2051                                           unsigned long iov_pfn,
2052                                           unsigned long phy_pfn,
2053                                           unsigned long pages)
2054 {
2055         int support, level = 1;
2056         unsigned long pfnmerge;
2057
2058         support = domain->iommu_superpage;
2059
2060         /* To use a large page, the virtual *and* physical addresses
2061            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2062            of them will mean we have to use smaller pages. So just
2063            merge them and check both at once. */
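             /* For example, if both iov_pfn and phy_pfn have their low
                9 bits clear (2MiB alignment in 4KiB pages) and at least
                512 pages remain, a level-2 (2MiB) superpage can be used. */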
2064         pfnmerge = iov_pfn | phy_pfn;
2065
2066         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2067                 pages >>= VTD_STRIDE_SHIFT;
2068                 if (!pages)
2069                         break;
2070                 pfnmerge >>= VTD_STRIDE_SHIFT;
2071                 level++;
2072                 support--;
2073         }
2074         return level;
2075 }
2076
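     /*
      * Install PTEs for nr_pages starting at iov_pfn, taking the physical
      * pages either from a scatterlist or from a contiguous phys_pfn.
      * Superpage entries are used whenever alignment and the remaining
      * length allow it.
      */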
2077 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2078                             struct scatterlist *sg, unsigned long phys_pfn,
2079                             unsigned long nr_pages, int prot)
2080 {
2081         struct dma_pte *first_pte = NULL, *pte = NULL;
2082         phys_addr_t uninitialized_var(pteval);
2083         unsigned long sg_res = 0;
2084         unsigned int largepage_lvl = 0;
2085         unsigned long lvl_pages = 0;
2086
2087         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2088
2089         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2090                 return -EINVAL;
2091
2092         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2093
2094         if (!sg) {
2095                 sg_res = nr_pages;
2096                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2097         }
2098
2099         while (nr_pages > 0) {
2100                 uint64_t tmp;
2101
2102                 if (!sg_res) {
2103                         sg_res = aligned_nrpages(sg->offset, sg->length);
2104                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2105                         sg->dma_length = sg->length;
2106                         pteval = page_to_phys(sg_page(sg)) | prot;
2107                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2108                 }
2109
2110                 if (!pte) {
2111                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2112
2113                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2114                         if (!pte)
2115                                 return -ENOMEM;
2116                         /* It is a large page */
2117                         if (largepage_lvl > 1) {
2118                                 pteval |= DMA_PTE_LARGE_PAGE;
2119                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2120                                 /*
2121                                  * Ensure that old small page tables are
2122                                  * removed to make room for superpage,
2123                                  * if they exist.
2124                                  */
2125                                 dma_pte_free_pagetable(domain, iov_pfn,
2126                                                        iov_pfn + lvl_pages - 1);
2127                         } else {
2128                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2129                         }
2130
2131                 }
2132                 /* We don't need a lock here; nobody else
2133                  * touches this iova range.
2134                  */
2135                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2136                 if (tmp) {
2137                         static int dumps = 5;
2138                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2139                                 iov_pfn, tmp, (unsigned long long)pteval);
2140                         if (dumps) {
2141                                 dumps--;
2142                                 debug_dma_dump_mappings(NULL);
2143                         }
2144                         WARN_ON(1);
2145                 }
2146
2147                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2148
2149                 BUG_ON(nr_pages < lvl_pages);
2150                 BUG_ON(sg_res < lvl_pages);
2151
2152                 nr_pages -= lvl_pages;
2153                 iov_pfn += lvl_pages;
2154                 phys_pfn += lvl_pages;
2155                 pteval += lvl_pages * VTD_PAGE_SIZE;
2156                 sg_res -= lvl_pages;
2157
2158                 /* If the next PTE would be the first in a new page, then we
2159                    need to flush the cache on the entries we've just written.
2160                    And then we'll need to recalculate 'pte', so clear it and
2161                    let it get set again in the if (!pte) block above.
2162
2163                    If we're done (!nr_pages) we need to flush the cache too.
2164
2165                    Also if we've been setting superpages, we may need to
2166                    recalculate 'pte' and switch back to smaller pages for the
2167                    end of the mapping, if the trailing size is not enough to
2168                    use another superpage (i.e. sg_res < lvl_pages). */
2169                 pte++;
2170                 if (!nr_pages || first_pte_in_page(pte) ||
2171                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2172                         domain_flush_cache(domain, first_pte,
2173                                            (void *)pte - (void *)first_pte);
2174                         pte = NULL;
2175                 }
2176
2177                 if (!sg_res && nr_pages)
2178                         sg = sg_next(sg);
2179         }
2180         return 0;
2181 }
2182
2183 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184                                     struct scatterlist *sg, unsigned long nr_pages,
2185                                     int prot)
2186 {
2187         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2188 }
2189
2190 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2191                                      unsigned long phys_pfn, unsigned long nr_pages,
2192                                      int prot)
2193 {
2194         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2195 }
2196
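     /*
      * Clear the context entry for one device and flush the context and
      * IOTLB caches globally.
      */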
2197 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2198 {
2199         if (!iommu)
2200                 return;
2201
2202         clear_context_table(iommu, bus, devfn);
2203         iommu->flush.flush_context(iommu, 0, 0, 0,
2204                                            DMA_CCMD_GLOBAL_INVL);
2205         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2206 }
2207
2208 static inline void unlink_domain_info(struct device_domain_info *info)
2209 {
2210         assert_spin_locked(&device_domain_lock);
2211         list_del(&info->link);
2212         list_del(&info->global);
2213         if (info->dev)
2214                 info->dev->archdata.iommu = NULL;
2215 }
2216
2217 static void domain_remove_dev_info(struct dmar_domain *domain)
2218 {
2219         struct device_domain_info *info, *tmp;
2220         unsigned long flags;
2221
2222         spin_lock_irqsave(&device_domain_lock, flags);
2223         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2224                 unlink_domain_info(info);
2225                 spin_unlock_irqrestore(&device_domain_lock, flags);
2226
2227                 iommu_disable_dev_iotlb(info);
2228                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2229
2230                 if (domain_type_is_vm(domain)) {
2231                         iommu_detach_dependent_devices(info->iommu, info->dev);
2232                         domain_detach_iommu(domain, info->iommu);
2233                 }
2234
2235                 free_devinfo_mem(info);
2236                 spin_lock_irqsave(&device_domain_lock, flags);
2237         }
2238         spin_unlock_irqrestore(&device_domain_lock, flags);
2239 }
2240
2241 /*
2242  * find_domain
2243  * Note: struct device->archdata.iommu stores the device_domain_info
2244  */
2245 static struct dmar_domain *find_domain(struct device *dev)
2246 {
2247         struct device_domain_info *info;
2248
2249         /* No lock here, assumes no domain exit in normal case */
2250         info = dev->archdata.iommu;
2251         if (info)
2252                 return info->domain;
2253         return NULL;
2254 }
2255
2256 static inline struct device_domain_info *
2257 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2258 {
2259         struct device_domain_info *info;
2260
2261         list_for_each_entry(info, &device_domain_list, global)
2262                 if (info->iommu->segment == segment && info->bus == bus &&
2263                     info->devfn == devfn)
2264                         return info;
2265
2266         return NULL;
2267 }
2268
2269 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2270                                                 int bus, int devfn,
2271                                                 struct device *dev,
2272                                                 struct dmar_domain *domain)
2273 {
2274         struct dmar_domain *found = NULL;
2275         struct device_domain_info *info;
2276         unsigned long flags;
2277
2278         info = alloc_devinfo_mem();
2279         if (!info)
2280                 return NULL;
2281
2282         info->bus = bus;
2283         info->devfn = devfn;
2284         info->ats.enabled = 0;
2285         info->ats.qdep = 0;
2286         info->dev = dev;
2287         info->domain = domain;
2288         info->iommu = iommu;
2289
2290         spin_lock_irqsave(&device_domain_lock, flags);
2291         if (dev)
2292                 found = find_domain(dev);
2293         else {
2294                 struct device_domain_info *info2;
2295                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2296                 if (info2)
2297                         found = info2->domain;
2298         }
2299         if (found) {
2300                 spin_unlock_irqrestore(&device_domain_lock, flags);
2301                 free_devinfo_mem(info);
2302                 /* Caller must free the original domain */
2303                 return found;
2304         }
2305
2306         list_add(&info->link, &domain->devices);
2307         list_add(&info->global, &device_domain_list);
2308         if (dev)
2309                 dev->archdata.iommu = info;
2310         spin_unlock_irqrestore(&device_domain_lock, flags);
2311
2312         return domain;
2313 }
2314
2315 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2316 {
2317         *(u16 *)opaque = alias;
2318         return 0;
2319 }
2320
2321 /* domain is initialized */
2322 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2323 {
2324         struct dmar_domain *domain, *tmp;
2325         struct intel_iommu *iommu;
2326         struct device_domain_info *info;
2327         u16 dma_alias;
2328         unsigned long flags;
2329         u8 bus, devfn;
2330
2331         domain = find_domain(dev);
2332         if (domain)
2333                 return domain;
2334
2335         iommu = device_to_iommu(dev, &bus, &devfn);
2336         if (!iommu)
2337                 return NULL;
2338
2339         if (dev_is_pci(dev)) {
2340                 struct pci_dev *pdev = to_pci_dev(dev);
2341
2342                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2343
2344                 spin_lock_irqsave(&device_domain_lock, flags);
2345                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2346                                                       PCI_BUS_NUM(dma_alias),
2347                                                       dma_alias & 0xff);
2348                 if (info) {
2349                         iommu = info->iommu;
2350                         domain = info->domain;
2351                 }
2352                 spin_unlock_irqrestore(&device_domain_lock, flags);
2353
2354                 /* The DMA alias already has a domain, so use it */
2355                 if (info)
2356                         goto found_domain;
2357         }
2358
2359         /* Allocate and initialize new domain for the device */
2360         domain = alloc_domain(0);
2361         if (!domain)
2362                 return NULL;
2363         domain->id = iommu_attach_domain(domain, iommu);
2364         if (domain->id < 0) {
2365                 free_domain_mem(domain);
2366                 return NULL;
2367         }
2368         domain_attach_iommu(domain, iommu);
2369         if (domain_init(domain, gaw)) {
2370                 domain_exit(domain);
2371                 return NULL;
2372         }
2373
2374         /* register PCI DMA alias device */
2375         if (dev_is_pci(dev)) {
2376                 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2377                                            dma_alias & 0xff, NULL, domain);
2378
2379                 if (!tmp || tmp != domain) {
2380                         domain_exit(domain);
2381                         domain = tmp;
2382                 }
2383
2384                 if (!domain)
2385                         return NULL;
2386         }
2387
2388 found_domain:
2389         tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2390
2391         if (!tmp || tmp != domain) {
2392                 domain_exit(domain);
2393                 domain = tmp;
2394         }
2395
2396         return domain;
2397 }
2398
2399 static int iommu_identity_mapping;
2400 #define IDENTMAP_ALL            1
2401 #define IDENTMAP_GFX            2
2402 #define IDENTMAP_AZALIA         4
2403
2404 static int iommu_domain_identity_map(struct dmar_domain *domain,
2405                                      unsigned long long start,
2406                                      unsigned long long end)
2407 {
2408         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2409         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2410
2411         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2412                           dma_to_mm_pfn(last_vpfn))) {
2413                 pr_err("Reserving iova failed\n");
2414                 return -ENOMEM;
2415         }
2416
2417         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2418                  start, end, domain->id);
2419         /*
2420          * The RMRR range might overlap with a physical memory range,
2421          * so clear it first.
2422          */
2423         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2424
2425         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2426                                   last_vpfn - first_vpfn + 1,
2427                                   DMA_PTE_READ|DMA_PTE_WRITE);
2428 }
2429
2430 static int iommu_prepare_identity_map(struct device *dev,
2431                                       unsigned long long start,
2432                                       unsigned long long end)
2433 {
2434         struct dmar_domain *domain;
2435         int ret;
2436
2437         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2438         if (!domain)
2439                 return -ENOMEM;
2440
2441         /* For _hardware_ passthrough, don't bother. But for software
2442            passthrough, we do it anyway -- it may indicate a memory
2443            range which is reserved in E820 and so didn't get set
2444            up to start with in si_domain */
2445         if (domain == si_domain && hw_pass_through) {
2446                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2447                         dev_name(dev), start, end);
2448                 return 0;
2449         }
2450
2451         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2452                 dev_name(dev), start, end);
2453
2454         if (end < start) {
2455                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2456                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2457                         dmi_get_system_info(DMI_BIOS_VENDOR),
2458                         dmi_get_system_info(DMI_BIOS_VERSION),
2459                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2460                 ret = -EIO;
2461                 goto error;
2462         }
2463
2464         if (end >> agaw_to_width(domain->agaw)) {
2465                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2466                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2467                      agaw_to_width(domain->agaw),
2468                      dmi_get_system_info(DMI_BIOS_VENDOR),
2469                      dmi_get_system_info(DMI_BIOS_VERSION),
2470                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2471                 ret = -EIO;
2472                 goto error;
2473         }
2474
2475         ret = iommu_domain_identity_map(domain, start, end);
2476         if (ret)
2477                 goto error;
2478
2479         /* context entry init */
2480         ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2481         if (ret)
2482                 goto error;
2483
2484         return 0;
2485
2486  error:
2487         domain_exit(domain);
2488         return ret;
2489 }
2490
2491 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2492                                          struct device *dev)
2493 {
2494         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2495                 return 0;
2496         return iommu_prepare_identity_map(dev, rmrr->base_address,
2497                                           rmrr->end_address);
2498 }
2499
2500 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2501 static inline void iommu_prepare_isa(void)
2502 {
2503         struct pci_dev *pdev;
2504         int ret;
2505
2506         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2507         if (!pdev)
2508                 return;
2509
2510         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2511         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2512
2513         if (ret)
2514                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2515
2516         pci_dev_put(pdev);
2517 }
2518 #else
2519 static inline void iommu_prepare_isa(void)
2520 {
2521         return;
2522 }
2523 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2524
2525 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2526
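     /*
      * Set up the static identity (si) domain: attach it to every active
      * IOMMU and, unless hardware passthrough is in use, install 1:1
      * mappings for all usable physical memory.
      */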
2527 static int __init si_domain_init(int hw)
2528 {
2529         struct dmar_drhd_unit *drhd;
2530         struct intel_iommu *iommu;
2531         int nid, ret = 0;
2532         bool first = true;
2533
2534         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2535         if (!si_domain)
2536                 return -EFAULT;
2537
2538         for_each_active_iommu(iommu, drhd) {
2539                 ret = iommu_attach_domain(si_domain, iommu);
2540                 if (ret < 0) {
2541                         domain_exit(si_domain);
2542                         return -EFAULT;
2543                 } else if (first) {
2544                         si_domain->id = ret;
2545                         first = false;
2546                 } else if (si_domain->id != ret) {
2547                         domain_exit(si_domain);
2548                         return -EFAULT;
2549                 }
2550                 domain_attach_iommu(si_domain, iommu);
2551         }
2552
2553         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2554                 domain_exit(si_domain);
2555                 return -EFAULT;
2556         }
2557
2558         pr_debug("Identity mapping domain is domain %d\n",
2559                  si_domain->id);
2560
2561         if (hw)
2562                 return 0;
2563
2564         for_each_online_node(nid) {
2565                 unsigned long start_pfn, end_pfn;
2566                 int i;
2567
2568                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2569                         ret = iommu_domain_identity_map(si_domain,
2570                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2571                         if (ret)
2572                                 return ret;
2573                 }
2574         }
2575
2576         return 0;
2577 }
2578
2579 static int identity_mapping(struct device *dev)
2580 {
2581         struct device_domain_info *info;
2582
2583         if (likely(!iommu_identity_mapping))
2584                 return 0;
2585
2586         info = dev->archdata.iommu;
2587         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2588                 return (info->domain == si_domain);
2589
2590         return 0;
2591 }
2592
2593 static int domain_add_dev_info(struct dmar_domain *domain,
2594                                struct device *dev, int translation)
2595 {
2596         struct dmar_domain *ndomain;
2597         struct intel_iommu *iommu;
2598         u8 bus, devfn;
2599         int ret;
2600
2601         iommu = device_to_iommu(dev, &bus, &devfn);
2602         if (!iommu)
2603                 return -ENODEV;
2604
2605         ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2606         if (ndomain != domain)
2607                 return -EBUSY;
2608
2609         ret = domain_context_mapping(domain, dev, translation);
2610         if (ret) {
2611                 domain_remove_one_dev_info(domain, dev);
2612                 return ret;
2613         }
2614
2615         return 0;
2616 }
2617
2618 static bool device_has_rmrr(struct device *dev)
2619 {
2620         struct dmar_rmrr_unit *rmrr;
2621         struct device *tmp;
2622         int i;
2623
2624         rcu_read_lock();
2625         for_each_rmrr_units(rmrr) {
2626                 /*
2627                  * Return TRUE if this RMRR contains the device that
2628                  * is passed in.
2629                  */
2630                 for_each_active_dev_scope(rmrr->devices,
2631                                           rmrr->devices_cnt, i, tmp)
2632                         if (tmp == dev) {
2633                                 rcu_read_unlock();
2634                                 return true;
2635                         }
2636         }
2637         rcu_read_unlock();
2638         return false;
2639 }
2640
2641 /*
2642  * There are a couple cases where we need to restrict the functionality of
2643  * devices associated with RMRRs.  The first is when evaluating a device for
2644  * identity mapping because problems exist when devices are moved in and out
2645  * of domains and their respective RMRR information is lost.  This means that
2646  * a device with associated RMRRs will never be in a "passthrough" domain.
2647  * The second is use of the device through the IOMMU API.  This interface
2648  * expects to have full control of the IOVA space for the device.  We cannot
2649  * satisfy both the requirement that RMRR access is maintained and have an
2650  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2651  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2652  * We therefore prevent devices associated with an RMRR from participating in
2653  * the IOMMU API, which eliminates them from device assignment.
2654  *
2655  * In both cases we assume that PCI USB devices with RMRRs have them largely
2656  * for historical reasons and that the RMRR space is not actively used post
2657  * boot.  This exclusion may change if vendors begin to abuse it.
2658  *
2659  * The same exception is made for graphics devices, with the requirement that
2660  * any use of the RMRR regions will be torn down before assigning the device
2661  * to a guest.
2662  */
2663 static bool device_is_rmrr_locked(struct device *dev)
2664 {
2665         if (!device_has_rmrr(dev))
2666                 return false;
2667
2668         if (dev_is_pci(dev)) {
2669                 struct pci_dev *pdev = to_pci_dev(dev);
2670
2671                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2672                         return false;
2673         }
2674
2675         return true;
2676 }
2677
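     /*
      * Decide whether a device should be placed in the 1:1 identity
      * domain, based on the identity-mapping policy flags, RMRR
      * restrictions, bridge topology and (after boot) the device's
      * DMA mask.
      */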
2678 static int iommu_should_identity_map(struct device *dev, int startup)
2679 {
2680
2681         if (dev_is_pci(dev)) {
2682                 struct pci_dev *pdev = to_pci_dev(dev);
2683
2684                 if (device_is_rmrr_locked(dev))
2685                         return 0;
2686
2687                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2688                         return 1;
2689
2690                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2691                         return 1;
2692
2693                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2694                         return 0;
2695
2696                 /*
2697                  * We want to start off with all devices in the 1:1 domain, and
2698                  * take them out later if we find they can't access all of memory.
2699                  *
2700                  * However, we can't do this for PCI devices behind bridges,
2701                  * because all PCI devices behind the same bridge will end up
2702                  * with the same source-id on their transactions.
2703                  *
2704                  * Practically speaking, we can't change things around for these
2705                  * devices at run-time, because we can't be sure there'll be no
2706                  * DMA transactions in flight for any of their siblings.
2707                  *
2708                  * So PCI devices (unless they're on the root bus) as well as
2709                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2710                  * the 1:1 domain, just in _case_ one of their siblings turns out
2711                  * not to be able to map all of memory.
2712                  */
2713                 if (!pci_is_pcie(pdev)) {
2714                         if (!pci_is_root_bus(pdev->bus))
2715                                 return 0;
2716                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2717                                 return 0;
2718                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2719                         return 0;
2720         } else {
2721                 if (device_has_rmrr(dev))
2722                         return 0;
2723         }
2724
2725         /*
2726          * At boot time, we don't yet know if devices will be 64-bit capable.
2727          * Assume that they will — if they turn out not to be, then we can
2728          * take them out of the 1:1 domain later.
2729          */
2730         if (!startup) {
2731                 /*
2732                  * If the device's dma_mask is less than the system's memory
2733                  * size then this is not a candidate for identity mapping.
2734                  */
2735                 u64 dma_mask = *dev->dma_mask;
2736
2737                 if (dev->coherent_dma_mask &&
2738                     dev->coherent_dma_mask < dma_mask)
2739                         dma_mask = dev->coherent_dma_mask;
2740
2741                 return dma_mask >= dma_get_required_mask(dev);
2742         }
2743
2744         return 1;
2745 }
2746
2747 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2748 {
2749         int ret;
2750
2751         if (!iommu_should_identity_map(dev, 1))
2752                 return 0;
2753
2754         ret = domain_add_dev_info(si_domain, dev,
2755                                   hw ? CONTEXT_TT_PASS_THROUGH :
2756                                        CONTEXT_TT_MULTI_LEVEL);
2757         if (!ret)
2758                 pr_info("%s identity mapping for device %s\n",
2759                         hw ? "Hardware" : "Software", dev_name(dev));
2760         else if (ret == -ENODEV)
2761                 /* device not associated with an iommu */
2762                 ret = 0;
2763
2764         return ret;
2765 }
2766
2767
2768 static int __init iommu_prepare_static_identity_mapping(int hw)
2769 {
2770         struct pci_dev *pdev = NULL;
2771         struct dmar_drhd_unit *drhd;
2772         struct intel_iommu *iommu;
2773         struct device *dev;
2774         int i;
2775         int ret = 0;
2776
2777         for_each_pci_dev(pdev) {
2778                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2779                 if (ret)
2780                         return ret;
2781         }
2782
2783         for_each_active_iommu(iommu, drhd)
2784                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2785                         struct acpi_device_physical_node *pn;
2786                         struct acpi_device *adev;
2787
2788                         if (dev->bus != &acpi_bus_type)
2789                                 continue;
2790
2791                         adev = to_acpi_device(dev);
2792                         mutex_lock(&adev->physical_node_lock);
2793                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2794                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2795                                 if (ret)
2796                                         break;
2797                         }
2798                         mutex_unlock(&adev->physical_node_lock);
2799                         if (ret)
2800                                 return ret;
2801                 }
2802
2803         return 0;
2804 }
2805
2806 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2807 {
2808         /*
2809          * Start from a sane iommu hardware state.
2810          * If queued invalidation was already initialized by us
2811          * (for example, while enabling interrupt remapping), then
2812          * things are already rolling from a sane state.
2813          */
2814         if (!iommu->qi) {
2815                 /*
2816                  * Clear any previous faults.
2817                  */
2818                 dmar_fault(-1, iommu);
2819                 /*
2820                  * Disable queued invalidation if supported and already enabled
2821                  * before OS handover.
2822                  */
2823                 dmar_disable_qi(iommu);
2824         }
2825
2826         if (dmar_enable_qi(iommu)) {
2827                 /*
2828                  * Queued Invalidation is not enabled, so use Register Based Invalidation
2829                  */
2830                 iommu->flush.flush_context = __iommu_flush_context;
2831                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2832                 pr_info("%s: Using Register based invalidation\n",
2833                         iommu->name);
2834         } else {
2835                 iommu->flush.flush_context = qi_flush_context;
2836                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2837                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2838         }
2839 }
2840
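     /*
      * Copy the context entries for one bus from the old kernel's tables
      * (typically when taking over a pre-enabled IOMMU in a kdump kernel)
      * into newly allocated context tables, marking each copied entry and
      * reserving its domain id.
      */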
2841 static int copy_context_table(struct intel_iommu *iommu,
2842                               struct root_entry *old_re,
2843                               struct context_entry **tbl,
2844                               int bus, bool ext)
2845 {
2846         struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
2847         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2848         phys_addr_t old_ce_phys;
2849
2850         tbl_idx = ext ? bus * 2 : bus;
2851
2852         for (devfn = 0; devfn < 256; devfn++) {
2853                 /* First calculate the correct index */
2854                 idx = (ext ? devfn * 2 : devfn) % 256;
2855
2856                 if (idx == 0) {
2857                         /* First save what we may have and clean up */
2858                         if (new_ce) {
2859                                 tbl[tbl_idx] = new_ce;
2860                                 __iommu_flush_cache(iommu, new_ce,
2861                                                     VTD_PAGE_SIZE);
2862                                 pos = 1;
2863                         }
2864
2865                         if (old_ce)
2866                                 iounmap(old_ce);
2867
2868                         ret = 0;
2869                         if (devfn < 0x80)
2870                                 old_ce_phys = root_entry_lctp(old_re);
2871                         else
2872                                 old_ce_phys = root_entry_uctp(old_re);
2873
2874                         if (!old_ce_phys) {
2875                                 if (ext && devfn == 0) {
2876                                         /* No LCTP, try UCTP */
2877                                         devfn = 0x7f;
2878                                         continue;
2879                                 } else {
2880                                         goto out;
2881                                 }
2882                         }
2883
2884                         ret = -ENOMEM;
2885                         old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2886                         if (!old_ce)
2887                                 goto out;
2888
2889                         new_ce = alloc_pgtable_page(iommu->node);
2890                         if (!new_ce)
2891                                 goto out_unmap;
2892
2893                         ret = 0;
2894                 }
2895
2896                 /* Now copy the context entry */
2897                 ce = old_ce[idx];
2898
2899                 if (!__context_present(&ce))
2900                         continue;
2901
2902                 did = context_domain_id(&ce);
2903                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2904                         set_bit(did, iommu->domain_ids);
2905
2906                 /*
2907                  * We need a marker for copied context entries. This
2908                  * marker needs to work for the old format as well as
2909                  * for extended context entries.
2910                  *
2911                  * Bit 67 of the context entry is used. In the old
2912                  * format this bit is available to software, in the
2913                  * extended format it is the PGE bit, but PGE is ignored
2914                  * by HW if PASIDs are disabled (and thus still
2915                  * available).
2916                  *
2917                  * So disable PASIDs first and then mark the entry
2918                  * copied. This means that we don't copy PASID
2919                  * translations from the old kernel, but this is fine as
2920                  * faults there are not fatal.
2921                  */
2922                 context_clear_pasid_enable(&ce);
2923                 context_set_copied(&ce);
2924
2925                 new_ce[idx] = ce;
2926         }
2927
2928         tbl[tbl_idx + pos] = new_ce;
2929
2930         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2931
2932 out_unmap:
2933         iounmap(old_ce);
2934
2935 out:
2936         return ret;
2937 }
2938
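     /*
      * Copy the root/context table hierarchy left behind by the previous
      * kernel so that translation can stay enabled while this kernel
      * takes over the IOMMU.
      */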
2939 static int copy_translation_tables(struct intel_iommu *iommu)
2940 {
2941         struct context_entry **ctxt_tbls;
2942         struct root_entry *old_rt;
2943         phys_addr_t old_rt_phys;
2944         int ctxt_table_entries;
2945         unsigned long flags;
2946         u64 rtaddr_reg;
2947         int bus, ret;
2948         bool new_ext, ext;
2949
2950         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2951         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2952         new_ext    = !!ecap_ecs(iommu->ecap);
2953
2954         /*
2955          * The RTT bit can only be changed when translation is disabled,
2956          * but disabling translation would open a window for data
2957          * corruption. So bail out and don't copy anything if we would
2958          * have to change the bit.
2959          */
2960         if (new_ext != ext)
2961                 return -EINVAL;
2962
2963         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2964         if (!old_rt_phys)
2965                 return -EINVAL;
2966
2967         old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2968         if (!old_rt)
2969                 return -ENOMEM;
2970
2971         /* This is too big for the stack - allocate it from slab */
2972         ctxt_table_entries = ext ? 512 : 256;
2973         ret = -ENOMEM;
2974         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2975         if (!ctxt_tbls)
2976                 goto out_unmap;
2977
2978         for (bus = 0; bus < 256; bus++) {
2979                 ret = copy_context_table(iommu, &old_rt[bus],
2980                                          ctxt_tbls, bus, ext);
2981                 if (ret) {
2982                         pr_err("%s: Failed to copy context table for bus %d\n",
2983                                 iommu->name, bus);
2984                         continue;
2985                 }
2986         }
2987
2988         spin_lock_irqsave(&iommu->lock, flags);
2989
2990         /* Context tables are copied, now write them to the root_entry table */
2991         for (bus = 0; bus < 256; bus++) {
2992                 int idx = ext ? bus * 2 : bus;
2993                 u64 val;
2994
2995                 if (ctxt_tbls[idx]) {
2996                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2997                         iommu->root_entry[bus].lo = val;
2998                 }
2999
3000                 if (!ext || !ctxt_tbls[idx + 1])
3001                         continue;
3002
3003                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3004                 iommu->root_entry[bus].hi = val;
3005         }
3006
3007         spin_unlock_irqrestore(&iommu->lock, flags);
3008
3009         kfree(ctxt_tbls);
3010
3011         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3012
3013         ret = 0;
3014
3015 out_unmap:
3016         iounmap(old_rt);
3017
3018         return ret;
3019 }
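
/*
 * Illustrative sketch only, not used by the driver: how a bus number maps
 * to a slot in the ctxt_tbls[] array built above.  In legacy mode each bus
 * has a single context table, referenced from root_entry[bus].lo; in
 * extended (ECS) mode each bus has two tables (roughly one per half of the
 * devfn space), referenced from .lo and .hi, which is why the loops above
 * index the array with 'bus * 2'.
 */
static inline int example_ctxt_tbl_index(int bus, bool ext, bool upper_half)
{
        if (!ext)
                return bus;                     /* one table per bus (.lo) */
        return bus * 2 + (upper_half ? 1 : 0);  /* .lo and .hi halves */
}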
3020
3021 static int __init init_dmars(void)
3022 {
3023         struct dmar_drhd_unit *drhd;
3024         struct dmar_rmrr_unit *rmrr;
3025         bool copied_tables = false;
3026         struct device *dev;
3027         struct intel_iommu *iommu;
3028         int i, ret;
3029
3030         /*
3031          * for each drhd
3032          *    allocate root
3033          *    initialize and program root entry to not present
3034          * endfor
3035          */
3036         for_each_drhd_unit(drhd) {
3037                 /*
3038                  * Lock not needed as this is only incremented in the
3039                  * single-threaded kernel __init code path; all other
3040                  * accesses are read-only.
3041                  */
3042                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3043                         g_num_of_iommus++;
3044                         continue;
3045                 }
3046                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3047         }
3048
3049         /* Preallocate enough resources for IOMMU hot-addition */
3050         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3051                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3052
3053         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3054                         GFP_KERNEL);
3055         if (!g_iommus) {
3056                 pr_err("Allocating global iommu array failed\n");
3057                 ret = -ENOMEM;
3058                 goto error;
3059         }
3060
3061         deferred_flush = kzalloc(g_num_of_iommus *
3062                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3063         if (!deferred_flush) {
3064                 ret = -ENOMEM;
3065                 goto free_g_iommus;
3066         }
3067
3068         for_each_active_iommu(iommu, drhd) {
3069                 g_iommus[iommu->seq_id] = iommu;
3070
3071                 intel_iommu_init_qi(iommu);
3072
3073                 ret = iommu_init_domains(iommu);
3074                 if (ret)
3075                         goto free_iommu;
3076
3077                 init_translation_status(iommu);
3078
3079                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3080                         iommu_disable_translation(iommu);
3081                         clear_translation_pre_enabled(iommu);
3082                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3083                                 iommu->name);
3084                 }
3085
3086                 /*
3087                  * TBD:
3088                  * we could share the same root & context tables
3089                  * among all IOMMUs. Needs to be split out later.
3090                  */
3091                 ret = iommu_alloc_root_entry(iommu);
3092                 if (ret)
3093                         goto free_iommu;
3094
3095                 if (translation_pre_enabled(iommu)) {
3096                         pr_info("Translation already enabled - trying to copy translation structures\n");
3097
3098                         ret = copy_translation_tables(iommu);
3099                         if (ret) {
3100                                 /*
3101                                  * We found the IOMMU with translation
3102                                  * enabled - but failed to copy over the
3103                                  * old root-entry table. Try to proceed
3104                                  * by disabling translation now and
3105                                  * allocating a clean root-entry table.
3106                                  * This might cause DMAR faults, but
3107                                  * probably the dump will still succeed.
3108                                  */
3109                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3110                                        iommu->name);
3111                                 iommu_disable_translation(iommu);
3112                                 clear_translation_pre_enabled(iommu);
3113                         } else {
3114                                 pr_info("Copied translation tables from previous kernel for %s\n",
3115                                         iommu->name);
3116                                 copied_tables = true;
3117                         }
3118                 }
3119
3120                 iommu_flush_write_buffer(iommu);
3121                 iommu_set_root_entry(iommu);
3122                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3123                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3124
3125                 if (!ecap_pass_through(iommu->ecap))
3126                         hw_pass_through = 0;
3127         }
3128
3129         if (iommu_pass_through)
3130                 iommu_identity_mapping |= IDENTMAP_ALL;
3131
3132 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3133         iommu_identity_mapping |= IDENTMAP_GFX;
3134 #endif
3135
3136         if (iommu_identity_mapping) {
3137                 ret = si_domain_init(hw_pass_through);
3138                 if (ret)
3139                         goto free_iommu;
3140         }
3141
3142         check_tylersburg_isoch();
3143
3144         /*
3145          * If we copied translations from a previous kernel in the kdump
3146          * case, we cannot assign the devices to domains now, as that
3147          * would eliminate the old mappings. So skip this part and defer
3148          * the assignment to device driver initialization time.
3149          */
3150         if (copied_tables)
3151                 goto domains_done;
3152
3153         /*
3154          * If pass-through is not set or not enabled, set up context entries
3155          * for identity mappings for RMRR, GFX and ISA, and possibly fall back
3156          * to the static identity mapping if iommu_identity_mapping is set.
3157          */
3158         if (iommu_identity_mapping) {
3159                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3160                 if (ret) {
3161                         pr_crit("Failed to setup IOMMU pass-through\n");
3162                         goto free_iommu;
3163                 }
3164         }
3165         /*
3166          * For each rmrr
3167          *   for each dev attached to rmrr
3168          *   do
3169          *     locate drhd for dev, alloc domain for dev
3170          *     allocate free domain
3171          *     allocate page table entries for rmrr
3172          *     if context not allocated for bus
3173          *           allocate and init context
3174          *           set present in root table for this bus
3175          *     init context with domain, translation etc
3176          *    endfor
3177          * endfor
3178          */
3179         pr_info("Setting RMRR:\n");
3180         for_each_rmrr_units(rmrr) {
3181                 /* Some BIOSes list non-existent devices in the DMAR table. */
3182                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3183                                           i, dev) {
3184                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3185                         if (ret)
3186                                 pr_err("Mapping reserved region failed\n");
3187                 }
3188         }
3189
3190         iommu_prepare_isa();
3191
3192 domains_done:
3193
3194         /*
3195          * for each drhd
3196          *   enable fault log
3197          *   global invalidate context cache
3198          *   global invalidate iotlb
3199          *   enable translation
3200          */
3201         for_each_iommu(iommu, drhd) {
3202                 if (drhd->ignored) {
3203                         /*
3204                          * we always have to disable PMRs or DMA may fail on
3205                          * this device
3206                          */
3207                         if (force_on)
3208                                 iommu_disable_protect_mem_regions(iommu);
3209                         continue;
3210                 }
3211
3212                 iommu_flush_write_buffer(iommu);
3213
3214                 ret = dmar_set_interrupt(iommu);
3215                 if (ret)
3216                         goto free_iommu;
3217
3218                 if (!translation_pre_enabled(iommu))
3219                         iommu_enable_translation(iommu);
3220
3221                 iommu_disable_protect_mem_regions(iommu);
3222         }
3223
3224         return 0;
3225
3226 free_iommu:
3227         for_each_active_iommu(iommu, drhd) {
3228                 disable_dmar_iommu(iommu);
3229                 free_dmar_iommu(iommu);
3230         }
3231         kfree(deferred_flush);
3232 free_g_iommus:
3233         kfree(g_iommus);
3234 error:
3235         return ret;
3236 }
3237
3238 /* This takes a number of _MM_ pages, not VTD pages */
3239 static struct iova *intel_alloc_iova(struct device *dev,
3240                                      struct dmar_domain *domain,
3241                                      unsigned long nrpages, uint64_t dma_mask)
3242 {
3243         struct iova *iova = NULL;
3244
3245         /* Restrict dma_mask to the width that the iommu can handle */
3246         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3247
3248         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3249                 /*
3250                  * First try to allocate an io virtual address in
3251                  * DMA_BIT_MASK(32) and if that fails then try allocating
3252                  * from higher range
3253                  */
3254                 iova = alloc_iova(&domain->iovad, nrpages,
3255                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3256                 if (iova)
3257                         return iova;
3258         }
3259         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3260         if (unlikely(!iova)) {
3261                 pr_err("Allocating %ld-page iova for %s failed\n",
3262                        nrpages, dev_name(dev));
3263                 return NULL;
3264         }
3265
3266         return iova;
3267 }
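
/*
 * Illustrative sketch only, not used by the driver: the page-size
 * distinction behind the "_MM_ pages, not VTD pages" note above.  VT-d
 * page-table entries always describe VTD_PAGE_SIZE (4KiB) pages, while the
 * mm layer's PAGE_SIZE may be larger; the driver's dma_to_mm_pfn() and
 * mm_to_dma_pfn() helpers convert by shifting with the difference, roughly
 * as sketched here.
 */
static inline unsigned long example_dma_to_mm_pfn(unsigned long dma_pfn)
{
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long example_mm_to_dma_pfn(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}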
3268
3269 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3270 {
3271         struct dmar_domain *domain;
3272         int ret;
3273
3274         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3275         if (!domain) {
3276                 pr_err("Allocating domain for %s failed\n",
3277                        dev_name(dev));
3278                 return NULL;
3279         }
3280
3281         /* make sure context mapping is ok */
3282         if (unlikely(!domain_context_mapped(dev))) {
3283                 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
3284                 if (ret) {
3285                         pr_err("Domain context map for %s failed\n",
3286                                dev_name(dev));
3287                         return NULL;
3288                 }
3289         }
3290
3291         return domain;
3292 }
3293
3294 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3295 {
3296         struct device_domain_info *info;
3297
3298         /* No lock here, assumes no domain exit in normal case */
3299         info = dev->archdata.iommu;
3300         if (likely(info))
3301                 return info->domain;
3302
3303         return __get_valid_domain_for_dev(dev);
3304 }
3305
3306 /* Check if the dev needs to go through non-identity map and unmap process.*/
3307 static int iommu_no_mapping(struct device *dev)
3308 {
3309         int found;
3310
3311         if (iommu_dummy(dev))
3312                 return 1;
3313
3314         if (!iommu_identity_mapping)
3315                 return 0;
3316
3317         found = identity_mapping(dev);
3318         if (found) {
3319                 if (iommu_should_identity_map(dev, 0))
3320                         return 1;
3321                 else {
3322                         /*
3323                          * The 32-bit DMA device is removed from si_domain
3324                          * and falls back to non-identity mapping.
3325                          */
3326                         domain_remove_one_dev_info(si_domain, dev);
3327                         pr_info("32bit %s uses non-identity mapping\n",
3328                                 dev_name(dev));
3329                         return 0;
3330                 }
3331         } else {
3332                 /*
3333                  * When a 64-bit DMA device is detached from a VM, the
3334                  * device is put back into si_domain for identity mapping.
3335                  */
3336                 if (iommu_should_identity_map(dev, 0)) {
3337                         int ret;
3338                         ret = domain_add_dev_info(si_domain, dev,
3339                                                   hw_pass_through ?
3340                                                   CONTEXT_TT_PASS_THROUGH :
3341                                                   CONTEXT_TT_MULTI_LEVEL);
3342                         if (!ret) {
3343                                 pr_info("64bit %s uses identity mapping\n",
3344                                         dev_name(dev));
3345                                 return 1;
3346                         }
3347                 }
3348         }
3349
3350         return 0;
3351 }
3352
3353 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3354                                      size_t size, int dir, u64 dma_mask)
3355 {
3356         struct dmar_domain *domain;
3357         phys_addr_t start_paddr;
3358         struct iova *iova;
3359         int prot = 0;
3360         int ret;
3361         struct intel_iommu *iommu;
3362         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3363
3364         BUG_ON(dir == DMA_NONE);
3365
3366         if (iommu_no_mapping(dev))
3367                 return paddr;
3368
3369         domain = get_valid_domain_for_dev(dev);
3370         if (!domain)
3371                 return 0;
3372
3373         iommu = domain_get_iommu(domain);
3374         size = aligned_nrpages(paddr, size);
3375
3376         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3377         if (!iova)
3378                 goto error;
3379
3380         /*
3381          * Check if DMAR supports zero-length reads on write only
3382          * mappings.
3383          */
3384         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3385                         !cap_zlr(iommu->cap))
3386                 prot |= DMA_PTE_READ;
3387         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3388                 prot |= DMA_PTE_WRITE;
3389         /*
3390          * The range paddr .. paddr + size may span partial pages, so we map
3391          * whole pages (see the sketch after this function). Note: if two
3392          * parts of one page are mapped separately, we might have two guest
3393          * addresses mapping to the same host paddr, but this is not a big problem.
3394          */
3395         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3396                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3397         if (ret)
3398                 goto error;
3399
3400         /* It's a non-present to present mapping. Only flush if in caching mode. */
3401         if (cap_caching_mode(iommu->cap))
3402                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3403         else
3404                 iommu_flush_write_buffer(iommu);
3405
3406         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3407         start_paddr += paddr & ~PAGE_MASK;
3408         return start_paddr;
3409
3410 error:
3411         if (iova)
3412                 __free_iova(&domain->iovad, iova);
3413         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3414                 dev_name(dev), size, (unsigned long long)paddr, dir);
3415         return 0;
3416 }
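
/*
 * Illustrative sketch only, not used by the driver: the whole-page rounding
 * referred to in the partial-page comment above, in the spirit of the
 * aligned_nrpages() helper used by __intel_map_single().  The offset into
 * the first page plus the length is rounded up to whole pages and returned
 * in VT-d page units.
 */
static inline unsigned long example_nrpages(unsigned long host_addr,
                                            size_t size)
{
        host_addr &= ~PAGE_MASK;                /* keep only the page offset */
        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}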
3417
3418 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3419                                  unsigned long offset, size_t size,
3420                                  enum dma_data_direction dir,
3421                                  struct dma_attrs *attrs)
3422 {
3423         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3424                                   dir, *dev->dma_mask);
3425 }
3426
3427 static void flush_unmaps(void)
3428 {
3429         int i, j;
3430
3431         timer_on = 0;
3432
3433         /* just flush them all */
3434         for (i = 0; i < g_num_of_iommus; i++) {
3435                 struct intel_iommu *iommu = g_iommus[i];
3436                 if (!iommu)
3437                         continue;
3438
3439                 if (!deferred_flush[i].next)
3440                         continue;
3441
3442                 /* In caching mode, global flushes make emulation expensive */
3443                 if (!cap_caching_mode(iommu->cap))
3444                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3445                                          DMA_TLB_GLOBAL_FLUSH);
3446                 for (j = 0; j < deferred_flush[i].next; j++) {
3447                         unsigned long mask;
3448                         struct iova *iova = deferred_flush[i].iova[j];
3449                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3450
3451                         /* On real hardware multiple invalidations are expensive */
3452                         if (cap_caching_mode(iommu->cap))
3453                                 iommu_flush_iotlb_psi(iommu, domain->id,
3454                                         iova->pfn_lo, iova_size(iova),
3455                                         !deferred_flush[i].freelist[j], 0);
3456                         else {
3457                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3458                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3459                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3460                         }
3461                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3462                         if (deferred_flush[i].freelist[j])
3463                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3464                 }
3465                 deferred_flush[i].next = 0;
3466         }
3467
3468         list_size = 0;
3469 }
3470
3471 static void flush_unmaps_timeout(unsigned long data)
3472 {
3473         unsigned long flags;
3474
3475         spin_lock_irqsave(&async_umap_flush_lock, flags);
3476         flush_unmaps();
3477         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3478 }
3479
3480 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3481 {
3482         unsigned long flags;
3483         int next, iommu_id;
3484         struct intel_iommu *iommu;
3485
3486         spin_lock_irqsave(&async_umap_flush_lock, flags);
3487         if (list_size == HIGH_WATER_MARK)
3488                 flush_unmaps();
3489
3490         iommu = domain_get_iommu(dom);
3491         iommu_id = iommu->seq_id;
3492
3493         next = deferred_flush[iommu_id].next;
3494         deferred_flush[iommu_id].domain[next] = dom;
3495         deferred_flush[iommu_id].iova[next] = iova;
3496         deferred_flush[iommu_id].freelist[next] = freelist;
3497         deferred_flush[iommu_id].next++;
3498
3499         if (!timer_on) {
3500                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3501                 timer_on = 1;
3502         }
3503         list_size++;
3504         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3505 }
3506
3507 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3508 {
3509         struct dmar_domain *domain;
3510         unsigned long start_pfn, last_pfn;
3511         struct iova *iova;
3512         struct intel_iommu *iommu;
3513         struct page *freelist;
3514
3515         if (iommu_no_mapping(dev))
3516                 return;
3517
3518         domain = find_domain(dev);
3519         BUG_ON(!domain);
3520
3521         iommu = domain_get_iommu(domain);
3522
3523         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3524         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3525                       (unsigned long long)dev_addr))
3526                 return;
3527
3528         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3529         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3530
3531         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3532                  dev_name(dev), start_pfn, last_pfn);
3533
3534         freelist = domain_unmap(domain, start_pfn, last_pfn);
3535
3536         if (intel_iommu_strict) {
3537                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3538                                       last_pfn - start_pfn + 1, !freelist, 0);
3539                 /* free iova */
3540                 __free_iova(&domain->iovad, iova);
3541                 dma_free_pagelist(freelist);
3542         } else {
3543                 add_unmap(domain, iova, freelist);
3544                 /*
3545                  * Queue up the release of the unmap to save roughly 1/6th of
3546                  * the CPU time used up by the IOTLB flush operation...
3547                  */
3548         }
3549 }
3550
3551 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3552                              size_t size, enum dma_data_direction dir,
3553                              struct dma_attrs *attrs)
3554 {
3555         intel_unmap(dev, dev_addr);
3556 }
3557
3558 static void *intel_alloc_coherent(struct device *dev, size_t size,
3559                                   dma_addr_t *dma_handle, gfp_t flags,
3560                                   struct dma_attrs *attrs)
3561 {
3562         struct page *page = NULL;
3563         int order;
3564
3565         size = PAGE_ALIGN(size);
3566         order = get_order(size);
3567
3568         if (!iommu_no_mapping(dev))
3569                 flags &= ~(GFP_DMA | GFP_DMA32);
3570         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3571                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3572                         flags |= GFP_DMA;
3573                 else
3574                         flags |= GFP_DMA32;
3575         }
3576
3577         if (flags & __GFP_WAIT) {
3578                 unsigned int count = size >> PAGE_SHIFT;
3579
3580                 page = dma_alloc_from_contiguous(dev, count, order);
3581                 if (page && iommu_no_mapping(dev) &&
3582                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3583                         dma_release_from_contiguous(dev, page, count);
3584                         page = NULL;
3585                 }
3586         }
3587
3588         if (!page)
3589                 page = alloc_pages(flags, order);
3590         if (!page)
3591                 return NULL;
3592         memset(page_address(page), 0, size);
3593
3594         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595                                          DMA_BIDIRECTIONAL,
3596                                          dev->coherent_dma_mask);
3597         if (*dma_handle)
3598                 return page_address(page);
3599         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600                 __free_pages(page, order);
3601
3602         return NULL;
3603 }
3604
3605 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3607 {
3608         int order;
3609         struct page *page = virt_to_page(vaddr);
3610
3611         size = PAGE_ALIGN(size);
3612         order = get_order(size);
3613
3614         intel_unmap(dev, dma_handle);
3615         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616                 __free_pages(page, order);
3617 }
3618
3619 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620                            int nelems, enum dma_data_direction dir,
3621                            struct dma_attrs *attrs)
3622 {
3623         intel_unmap(dev, sglist[0].dma_address);
3624 }
3625
3626 static int intel_nontranslate_map_sg(struct device *hddev,
3627         struct scatterlist *sglist, int nelems, int dir)
3628 {
3629         int i;
3630         struct scatterlist *sg;
3631
3632         for_each_sg(sglist, sg, nelems, i) {
3633                 BUG_ON(!sg_page(sg));
3634                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3635                 sg->dma_length = sg->length;
3636         }
3637         return nelems;
3638 }
3639
3640 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3641                         enum dma_data_direction dir, struct dma_attrs *attrs)
3642 {
3643         int i;
3644         struct dmar_domain *domain;
3645         size_t size = 0;
3646         int prot = 0;
3647         struct iova *iova = NULL;
3648         int ret;
3649         struct scatterlist *sg;
3650         unsigned long start_vpfn;
3651         struct intel_iommu *iommu;
3652
3653         BUG_ON(dir == DMA_NONE);
3654         if (iommu_no_mapping(dev))
3655                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3656
3657         domain = get_valid_domain_for_dev(dev);
3658         if (!domain)
3659                 return 0;
3660
3661         iommu = domain_get_iommu(domain);
3662
3663         for_each_sg(sglist, sg, nelems, i)
3664                 size += aligned_nrpages(sg->offset, sg->length);
3665
3666         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3667                                 *dev->dma_mask);
3668         if (!iova) {
3669                 sglist->dma_length = 0;
3670                 return 0;
3671         }
3672
3673         /*
3674          * Check if DMAR supports zero-length reads on write only
3675          * mappings.
3676          */
3677         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3678                         !cap_zlr(iommu->cap))
3679                 prot |= DMA_PTE_READ;
3680         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3681                 prot |= DMA_PTE_WRITE;
3682
3683         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3684
3685         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3686         if (unlikely(ret)) {
3687                 dma_pte_free_pagetable(domain, start_vpfn,
3688                                        start_vpfn + size - 1);
3689                 __free_iova(&domain->iovad, iova);
3690                 return 0;
3691         }
3692
3693         /* It's a non-present to present mapping. Only flush if in caching mode. */
3694         if (cap_caching_mode(iommu->cap))
3695                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3696         else
3697                 iommu_flush_write_buffer(iommu);
3698
3699         return nelems;
3700 }
3701
3702 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3703 {
3704         return !dma_addr;
3705 }
3706
3707 struct dma_map_ops intel_dma_ops = {
3708         .alloc = intel_alloc_coherent,
3709         .free = intel_free_coherent,
3710         .map_sg = intel_map_sg,
3711         .unmap_sg = intel_unmap_sg,
3712         .map_page = intel_map_page,
3713         .unmap_page = intel_unmap_page,
3714         .mapping_error = intel_mapping_error,
3715 };
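
/*
 * Illustrative sketch only (hypothetical consumer, not part of this file):
 * once dma_ops points at intel_dma_ops, an ordinary driver's streaming DMA
 * calls are dispatched into the functions above, for example:
 */
static int __maybe_unused example_streaming_dma(struct device *dev,
                                                void *buf, size_t len)
{
        dma_addr_t handle;

        /* routed to intel_map_page() through the dma_map_ops table */
        handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(dev, handle))     /* intel_mapping_error() */
                return -ENOMEM;

        /* ... program the device with 'handle' and run the transfer ... */

        /* routed to intel_unmap_page() */
        dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
        return 0;
}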
3716
3717 static inline int iommu_domain_cache_init(void)
3718 {
3719         int ret = 0;
3720
3721         iommu_domain_cache = kmem_cache_create("iommu_domain",
3722                                          sizeof(struct dmar_domain),
3723                                          0,
3724                                          SLAB_HWCACHE_ALIGN,
3725                                          NULL);
3727         if (!iommu_domain_cache) {
3728                 pr_err("Couldn't create iommu_domain cache\n");
3729                 ret = -ENOMEM;
3730         }
3731
3732         return ret;
3733 }
3734
3735 static inline int iommu_devinfo_cache_init(void)
3736 {
3737         int ret = 0;
3738
3739         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3740                                          sizeof(struct device_domain_info),
3741                                          0,
3742                                          SLAB_HWCACHE_ALIGN,
3743                                          NULL);
3744         if (!iommu_devinfo_cache) {
3745                 pr_err("Couldn't create devinfo cache\n");
3746                 ret = -ENOMEM;
3747         }
3748
3749         return ret;
3750 }
3751
3752 static int __init iommu_init_mempool(void)
3753 {
3754         int ret;
3755         ret = iommu_iova_cache_init();
3756         if (ret)
3757                 return ret;
3758
3759         ret = iommu_domain_cache_init();
3760         if (ret)
3761                 goto domain_error;
3762
3763         ret = iommu_devinfo_cache_init();
3764         if (!ret)
3765                 return ret;
3766
3767         kmem_cache_destroy(iommu_domain_cache);
3768 domain_error:
3769         iommu_iova_cache_destroy();
3770
3771         return -ENOMEM;
3772 }
3773
3774 static void __init iommu_exit_mempool(void)
3775 {
3776         kmem_cache_destroy(iommu_devinfo_cache);
3777         kmem_cache_destroy(iommu_domain_cache);
3778         iommu_iova_cache_destroy();
3779 }
3780
3781 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3782 {
3783         struct dmar_drhd_unit *drhd;
3784         u32 vtbar;
3785         int rc;
3786
3787         /* We know that this device on this chipset has its own IOMMU.
3788          * If we find it under a different IOMMU, then the BIOS is lying
3789          * to us. Hope that the IOMMU for this device is actually
3790          * disabled, and it needs no translation...
3791          */
3792         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3793         if (rc) {
3794                 /* "can't" happen */
3795                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3796                 return;
3797         }
3798         vtbar &= 0xffff0000;
3799
3800         /* We know that this IOMMU should be at offset 0xa000 from vtbar */
3801         drhd = dmar_find_matched_drhd_unit(pdev);
3802         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3803                             TAINT_FIRMWARE_WORKAROUND,
3804                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3805                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3806 }
3807 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3808
3809 static void __init init_no_remapping_devices(void)
3810 {
3811         struct dmar_drhd_unit *drhd;
3812         struct device *dev;
3813         int i;
3814
3815         for_each_drhd_unit(drhd) {
3816                 if (!drhd->include_all) {
3817                         for_each_active_dev_scope(drhd->devices,
3818                                                   drhd->devices_cnt, i, dev)
3819                                 break;
3820                         /* ignore DMAR unit if no devices exist */
3821                         if (i == drhd->devices_cnt)
3822                                 drhd->ignored = 1;
3823                 }
3824         }
3825
3826         for_each_active_drhd_unit(drhd) {
3827                 if (drhd->include_all)
3828                         continue;
3829
3830                 for_each_active_dev_scope(drhd->devices,
3831                                           drhd->devices_cnt, i, dev)
3832                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3833                                 break;
3834                 if (i < drhd->devices_cnt)
3835                         continue;
3836
3837                 /* This IOMMU has *only* gfx devices. Either bypass it or
3838                    set the gfx_mapped flag, as appropriate */
3839                 if (dmar_map_gfx) {
3840                         intel_iommu_gfx_mapped = 1;
3841                 } else {
3842                         drhd->ignored = 1;
3843                         for_each_active_dev_scope(drhd->devices,
3844                                                   drhd->devices_cnt, i, dev)
3845                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3846                 }
3847         }
3848 }
3849
3850 #ifdef CONFIG_SUSPEND
3851 static int init_iommu_hw(void)
3852 {
3853         struct dmar_drhd_unit *drhd;
3854         struct intel_iommu *iommu = NULL;
3855
3856         for_each_active_iommu(iommu, drhd)
3857                 if (iommu->qi)
3858                         dmar_reenable_qi(iommu);
3859
3860         for_each_iommu(iommu, drhd) {
3861                 if (drhd->ignored) {
3862                         /*
3863                          * we always have to disable PMRs or DMA may fail on
3864                          * this device
3865                          */
3866                         if (force_on)
3867                                 iommu_disable_protect_mem_regions(iommu);
3868                         continue;
3869                 }
3870
3871                 iommu_flush_write_buffer(iommu);
3872
3873                 iommu_set_root_entry(iommu);
3874
3875                 iommu->flush.flush_context(iommu, 0, 0, 0,
3876                                            DMA_CCMD_GLOBAL_INVL);
3877                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3878                 iommu_enable_translation(iommu);
3879                 iommu_disable_protect_mem_regions(iommu);
3880         }
3881
3882         return 0;
3883 }
3884
3885 static void iommu_flush_all(void)
3886 {
3887         struct dmar_drhd_unit *drhd;
3888         struct intel_iommu *iommu;
3889
3890         for_each_active_iommu(iommu, drhd) {
3891                 iommu->flush.flush_context(iommu, 0, 0, 0,
3892                                            DMA_CCMD_GLOBAL_INVL);
3893                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3894                                          DMA_TLB_GLOBAL_FLUSH);
3895         }
3896 }
3897
3898 static int iommu_suspend(void)
3899 {
3900         struct dmar_drhd_unit *drhd;
3901         struct intel_iommu *iommu = NULL;
3902         unsigned long flag;
3903
3904         for_each_active_iommu(iommu, drhd) {
3905                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3906                                                  GFP_ATOMIC);
3907                 if (!iommu->iommu_state)
3908                         goto nomem;
3909         }
3910
3911         iommu_flush_all();
3912
3913         for_each_active_iommu(iommu, drhd) {
3914                 iommu_disable_translation(iommu);
3915
3916                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3917
3918                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3919                         readl(iommu->reg + DMAR_FECTL_REG);
3920                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3921                         readl(iommu->reg + DMAR_FEDATA_REG);
3922                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3923                         readl(iommu->reg + DMAR_FEADDR_REG);
3924                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3925                         readl(iommu->reg + DMAR_FEUADDR_REG);
3926
3927                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3928         }
3929         return 0;
3930
3931 nomem:
3932         for_each_active_iommu(iommu, drhd)
3933                 kfree(iommu->iommu_state);
3934
3935         return -ENOMEM;
3936 }
3937
3938 static void iommu_resume(void)
3939 {
3940         struct dmar_drhd_unit *drhd;
3941         struct intel_iommu *iommu = NULL;
3942         unsigned long flag;
3943
3944         if (init_iommu_hw()) {
3945                 if (force_on)
3946                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3947                 else
3948                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3949                 return;
3950         }
3951
3952         for_each_active_iommu(iommu, drhd) {
3953
3954                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3955
3956                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3957                         iommu->reg + DMAR_FECTL_REG);
3958                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3959                         iommu->reg + DMAR_FEDATA_REG);
3960                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3961                         iommu->reg + DMAR_FEADDR_REG);
3962                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3963                         iommu->reg + DMAR_FEUADDR_REG);
3964
3965                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3966         }
3967
3968         for_each_active_iommu(iommu, drhd)
3969                 kfree(iommu->iommu_state);
3970 }
3971
3972 static struct syscore_ops iommu_syscore_ops = {
3973         .resume         = iommu_resume,
3974         .suspend        = iommu_suspend,
3975 };
3976
3977 static void __init init_iommu_pm_ops(void)
3978 {
3979         register_syscore_ops(&iommu_syscore_ops);
3980 }
3981
3982 #else
3983 static inline void init_iommu_pm_ops(void) {}
3984 #endif  /* CONFIG_SUSPEND */
3985
3986
3987 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3988 {
3989         struct acpi_dmar_reserved_memory *rmrr;
3990         struct dmar_rmrr_unit *rmrru;
3991
3992         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3993         if (!rmrru)
3994                 return -ENOMEM;
3995
3996         rmrru->hdr = header;
3997         rmrr = (struct acpi_dmar_reserved_memory *)header;
3998         rmrru->base_address = rmrr->base_address;
3999         rmrru->end_address = rmrr->end_address;
4000         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4001                                 ((void *)rmrr) + rmrr->header.length,
4002                                 &rmrru->devices_cnt);
4003         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4004                 kfree(rmrru);
4005                 return -ENOMEM;
4006         }
4007
4008         list_add(&rmrru->list, &dmar_rmrr_units);
4009
4010         return 0;
4011 }
4012
4013 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4014 {
4015         struct dmar_atsr_unit *atsru;
4016         struct acpi_dmar_atsr *tmp;
4017
4018         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4019                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4020                 if (atsr->segment != tmp->segment)
4021                         continue;
4022                 if (atsr->header.length != tmp->header.length)
4023                         continue;
4024                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4025                         return atsru;
4026         }
4027
4028         return NULL;
4029 }
4030
4031 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4032 {
4033         struct acpi_dmar_atsr *atsr;
4034         struct dmar_atsr_unit *atsru;
4035
4036         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4037                 return 0;
4038
4039         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4040         atsru = dmar_find_atsr(atsr);
4041         if (atsru)
4042                 return 0;
4043
4044         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4045         if (!atsru)
4046                 return -ENOMEM;
4047
4048         /*
4049          * If memory is allocated from slab by ACPI _DSM method, we need to
4050          * copy the memory content because the memory buffer will be freed
4051          * on return.
4052          */
4053         atsru->hdr = (void *)(atsru + 1);
4054         memcpy(atsru->hdr, hdr, hdr->length);
4055         atsru->include_all = atsr->flags & 0x1;
4056         if (!atsru->include_all) {
4057                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4058                                 (void *)atsr + atsr->header.length,
4059                                 &atsru->devices_cnt);
4060                 if (atsru->devices_cnt && atsru->devices == NULL) {
4061                         kfree(atsru);
4062                         return -ENOMEM;
4063                 }
4064         }
4065
4066         list_add_rcu(&atsru->list, &dmar_atsr_units);
4067
4068         return 0;
4069 }
4070
4071 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4072 {
4073         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4074         kfree(atsru);
4075 }
4076
4077 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4078 {
4079         struct acpi_dmar_atsr *atsr;
4080         struct dmar_atsr_unit *atsru;
4081
4082         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4083         atsru = dmar_find_atsr(atsr);
4084         if (atsru) {
4085                 list_del_rcu(&atsru->list);
4086                 synchronize_rcu();
4087                 intel_iommu_free_atsr(atsru);
4088         }
4089
4090         return 0;
4091 }
4092
4093 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4094 {
4095         int i;
4096         struct device *dev;
4097         struct acpi_dmar_atsr *atsr;
4098         struct dmar_atsr_unit *atsru;
4099
4100         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4101         atsru = dmar_find_atsr(atsr);
4102         if (!atsru)
4103                 return 0;
4104
4105         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4106                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4107                                           i, dev)
4108                         return -EBUSY;
4109
4110         return 0;
4111 }
4112
4113 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4114 {
4115         int sp, ret = 0;
4116         struct intel_iommu *iommu = dmaru->iommu;
4117
4118         if (g_iommus[iommu->seq_id])
4119                 return 0;
4120
4121         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4122                 pr_warn("%s: Doesn't support hardware pass through.\n",
4123                         iommu->name);
4124                 return -ENXIO;
4125         }
4126         if (!ecap_sc_support(iommu->ecap) &&
4127             domain_update_iommu_snooping(iommu)) {
4128                 pr_warn("%s: Doesn't support snooping.\n",
4129                         iommu->name);
4130                 return -ENXIO;
4131         }
4132         sp = domain_update_iommu_superpage(iommu) - 1;
4133         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4134                 pr_warn("%s: Doesn't support large page.\n",
4135                         iommu->name);
4136                 return -ENXIO;
4137         }
4138
4139         /*
4140          * Disable translation if already enabled prior to OS handover.
4141          */
4142         if (iommu->gcmd & DMA_GCMD_TE)
4143                 iommu_disable_translation(iommu);
4144
4145         g_iommus[iommu->seq_id] = iommu;
4146         ret = iommu_init_domains(iommu);
4147         if (ret == 0)
4148                 ret = iommu_alloc_root_entry(iommu);
4149         if (ret)
4150                 goto out;
4151
4152         if (dmaru->ignored) {
4153                 /*
4154                  * we always have to disable PMRs or DMA may fail on this device
4155                  */
4156                 if (force_on)
4157                         iommu_disable_protect_mem_regions(iommu);
4158                 return 0;
4159         }
4160
4161         intel_iommu_init_qi(iommu);
4162         iommu_flush_write_buffer(iommu);
4163         ret = dmar_set_interrupt(iommu);
4164         if (ret)
4165                 goto disable_iommu;
4166
4167         iommu_set_root_entry(iommu);
4168         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4169         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4170         iommu_enable_translation(iommu);
4171
4172         if (si_domain) {
4173                 ret = iommu_attach_domain(si_domain, iommu);
4174                 if (ret < 0 || si_domain->id != ret)
4175                         goto disable_iommu;
4176                 domain_attach_iommu(si_domain, iommu);
4177         }
4178
4179         iommu_disable_protect_mem_regions(iommu);
4180         return 0;
4181
4182 disable_iommu:
4183         disable_dmar_iommu(iommu);
4184 out:
4185         free_dmar_iommu(iommu);
4186         return ret;
4187 }
4188
4189 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4190 {
4191         int ret = 0;
4192         struct intel_iommu *iommu = dmaru->iommu;
4193
4194         if (!intel_iommu_enabled)
4195                 return 0;
4196         if (iommu == NULL)
4197                 return -EINVAL;
4198
4199         if (insert) {
4200                 ret = intel_iommu_add(dmaru);
4201         } else {
4202                 disable_dmar_iommu(iommu);
4203                 free_dmar_iommu(iommu);
4204         }
4205
4206         return ret;
4207 }
4208
4209 static void intel_iommu_free_dmars(void)
4210 {
4211         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4212         struct dmar_atsr_unit *atsru, *atsr_n;
4213
4214         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4215                 list_del(&rmrru->list);
4216                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4217                 kfree(rmrru);
4218         }
4219
4220         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4221                 list_del(&atsru->list);
4222                 intel_iommu_free_atsr(atsru);
4223         }
4224 }
4225
4226 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4227 {
4228         int i, ret = 1;
4229         struct pci_bus *bus;
4230         struct pci_dev *bridge = NULL;
4231         struct device *tmp;
4232         struct acpi_dmar_atsr *atsr;
4233         struct dmar_atsr_unit *atsru;
4234
4235         dev = pci_physfn(dev);
4236         for (bus = dev->bus; bus; bus = bus->parent) {
4237                 bridge = bus->self;
4238                 if (!bridge || !pci_is_pcie(bridge) ||
4239                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4240                         return 0;
4241                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4242                         break;
4243         }
4244         if (!bridge)
4245                 return 0;
4246
4247         rcu_read_lock();
4248         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4249                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4250                 if (atsr->segment != pci_domain_nr(dev->bus))
4251                         continue;
4252
4253                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4254                         if (tmp == &bridge->dev)
4255                                 goto out;
4256
4257                 if (atsru->include_all)
4258                         goto out;
4259         }
4260         ret = 0;
4261 out:
4262         rcu_read_unlock();
4263
4264         return ret;
4265 }
4266
4267 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4268 {
4269         int ret = 0;
4270         struct dmar_rmrr_unit *rmrru;
4271         struct dmar_atsr_unit *atsru;
4272         struct acpi_dmar_atsr *atsr;
4273         struct acpi_dmar_reserved_memory *rmrr;
4274
4275         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4276                 return 0;
4277
4278         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4279                 rmrr = container_of(rmrru->hdr,
4280                                     struct acpi_dmar_reserved_memory, header);
4281                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4282                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4283                                 ((void *)rmrr) + rmrr->header.length,
4284                                 rmrr->segment, rmrru->devices,
4285                                 rmrru->devices_cnt);
4286                         if (ret < 0)
4287                                 return ret;
4288                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4289                         dmar_remove_dev_scope(info, rmrr->segment,
4290                                 rmrru->devices, rmrru->devices_cnt);
4291                 }
4292         }
4293
4294         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4295                 if (atsru->include_all)
4296                         continue;
4297
4298                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4299                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4300                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4301                                         (void *)atsr + atsr->header.length,
4302                                         atsr->segment, atsru->devices,
4303                                         atsru->devices_cnt);
4304                         if (ret > 0)
4305                                 break;
4306                         else if (ret < 0)
4307                                 return ret;
4308                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4309                         if (dmar_remove_dev_scope(info, atsr->segment,
4310                                         atsru->devices, atsru->devices_cnt))
4311                                 break;
4312                 }
4313         }
4314
4315         return 0;
4316 }
4317
4318 /*
4319  * Here we only respond to a device being unbound from its driver.
4320  *
4321  * An added device is not attached to its DMAR domain here yet. That will
4322  * happen when the device is first mapped to an iova.
4323  */
4324 static int device_notifier(struct notifier_block *nb,
4325                                   unsigned long action, void *data)
4326 {
4327         struct device *dev = data;
4328         struct dmar_domain *domain;
4329
4330         if (iommu_dummy(dev))
4331                 return 0;
4332
4333         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4334                 return 0;
4335
4336         domain = find_domain(dev);
4337         if (!domain)
4338                 return 0;
4339
4340         down_read(&dmar_global_lock);
4341         domain_remove_one_dev_info(domain, dev);
4342         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4343                 domain_exit(domain);
4344         up_read(&dmar_global_lock);
4345
4346         return 0;
4347 }
4348
4349 static struct notifier_block device_nb = {
4350         .notifier_call = device_notifier,
4351 };
4352
4353 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4354                                        unsigned long val, void *v)
4355 {
4356         struct memory_notify *mhp = v;
4357         unsigned long long start, end;
4358         unsigned long start_vpfn, last_vpfn;
4359
4360         switch (val) {
4361         case MEM_GOING_ONLINE:
4362                 start = mhp->start_pfn << PAGE_SHIFT;
4363                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4364                 if (iommu_domain_identity_map(si_domain, start, end)) {
4365                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4366                                 start, end);
4367                         return NOTIFY_BAD;
4368                 }
4369                 break;
4370
4371         case MEM_OFFLINE:
4372         case MEM_CANCEL_ONLINE:
4373                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4374                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4375                 while (start_vpfn <= last_vpfn) {
4376                         struct iova *iova;
4377                         struct dmar_drhd_unit *drhd;
4378                         struct intel_iommu *iommu;
4379                         struct page *freelist;
4380
4381                         iova = find_iova(&si_domain->iovad, start_vpfn);
4382                         if (iova == NULL) {
4383                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4384                                          start_vpfn);
4385                                 break;
4386                         }
4387
4388                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4389                                                      start_vpfn, last_vpfn);
4390                         if (iova == NULL) {
4391                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4392                                         start_vpfn, last_vpfn);
4393                                 return NOTIFY_BAD;
4394                         }
4395
4396                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4397                                                iova->pfn_hi);
4398
4399                         rcu_read_lock();
4400                         for_each_active_iommu(iommu, drhd)
4401                                 iommu_flush_iotlb_psi(iommu, si_domain->id,
4402                                         iova->pfn_lo, iova_size(iova),
4403                                         !freelist, 0);
4404                         rcu_read_unlock();
4405                         dma_free_pagelist(freelist);
4406
4407                         start_vpfn = iova->pfn_hi + 1;
4408                         free_iova_mem(iova);
4409                 }
4410                 break;
4411         }
4412
4413         return NOTIFY_OK;
4414 }
4415
4416 static struct notifier_block intel_iommu_memory_nb = {
4417         .notifier_call = intel_iommu_memory_notifier,
4418         .priority = 0
4419 };
4420
4421
4422 static ssize_t intel_iommu_show_version(struct device *dev,
4423                                         struct device_attribute *attr,
4424                                         char *buf)
4425 {
4426         struct intel_iommu *iommu = dev_get_drvdata(dev);
4427         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4428         return sprintf(buf, "%d:%d\n",
4429                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4430 }
4431 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4432
4433 static ssize_t intel_iommu_show_address(struct device *dev,
4434                                         struct device_attribute *attr,
4435                                         char *buf)
4436 {
4437         struct intel_iommu *iommu = dev_get_drvdata(dev);
4438         return sprintf(buf, "%llx\n", iommu->reg_phys);
4439 }
4440 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4441
4442 static ssize_t intel_iommu_show_cap(struct device *dev,
4443                                     struct device_attribute *attr,
4444                                     char *buf)
4445 {
4446         struct intel_iommu *iommu = dev_get_drvdata(dev);
4447         return sprintf(buf, "%llx\n", iommu->cap);
4448 }
4449 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4450
4451 static ssize_t intel_iommu_show_ecap(struct device *dev,
4452                                     struct device_attribute *attr,
4453                                     char *buf)
4454 {
4455         struct intel_iommu *iommu = dev_get_drvdata(dev);
4456         return sprintf(buf, "%llx\n", iommu->ecap);
4457 }
4458 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4459
4460 static struct attribute *intel_iommu_attrs[] = {
4461         &dev_attr_version.attr,
4462         &dev_attr_address.attr,
4463         &dev_attr_cap.attr,
4464         &dev_attr_ecap.attr,
4465         NULL,
4466 };
4467
4468 static struct attribute_group intel_iommu_group = {
4469         .name = "intel-iommu",
4470         .attrs = intel_iommu_attrs,
4471 };
4472
4473 const struct attribute_group *intel_iommu_groups[] = {
4474         &intel_iommu_group,
4475         NULL,
4476 };
4477
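/*
 * Main VT-d initialization: parse the DMAR table and device scopes,
 * reserve special IOVA ranges, bring up the remapping hardware via
 * init_dmars(), install intel_dma_ops as the DMA API backend and then
 * register the per-IOMMU sysfs devices, the PCI bus notifier and, when
 * not running with hardware pass-through, the memory hotplug notifier.
 * Failures unwind and leave the system without DMA remapping, except
 * under a TXT/tboot launch where they panic instead.
 */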
4478 int __init intel_iommu_init(void)
4479 {
4480         int ret = -ENODEV;
4481         struct dmar_drhd_unit *drhd;
4482         struct intel_iommu *iommu;
4483
4484         /* VT-d is required for a TXT/tboot launch, so enforce that */
4485         force_on = tboot_force_iommu();
4486
4487         if (iommu_init_mempool()) {
4488                 if (force_on)
4489                         panic("tboot: Failed to initialize iommu memory\n");
4490                 return -ENOMEM;
4491         }
4492
4493         down_write(&dmar_global_lock);
4494         if (dmar_table_init()) {
4495                 if (force_on)
4496                         panic("tboot: Failed to initialize DMAR table\n");
4497                 goto out_free_dmar;
4498         }
4499
4500         if (dmar_dev_scope_init() < 0) {
4501                 if (force_on)
4502                         panic("tboot: Failed to initialize DMAR device scope\n");
4503                 goto out_free_dmar;
4504         }
4505
4506         if (no_iommu || dmar_disabled)
4507                 goto out_free_dmar;
4508
4509         if (list_empty(&dmar_rmrr_units))
4510                 pr_info("No RMRR found\n");
4511
4512         if (list_empty(&dmar_atsr_units))
4513                 pr_info("No ATSR found\n");
4514
4515         if (dmar_init_reserved_ranges()) {
4516                 if (force_on)
4517                         panic("tboot: Failed to reserve iommu ranges\n");
4518                 goto out_free_reserved_range;
4519         }
4520
4521         init_no_remapping_devices();
4522
4523         ret = init_dmars();
4524         if (ret) {
4525                 if (force_on)
4526                         panic("tboot: Failed to initialize DMARs\n");
4527                 pr_err("Initialization failed\n");
4528                 goto out_free_reserved_range;
4529         }
4530         up_write(&dmar_global_lock);
4531         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4532
4533         init_timer(&unmap_timer);
4534 #ifdef CONFIG_SWIOTLB
4535         swiotlb = 0;
4536 #endif
4537         dma_ops = &intel_dma_ops;
4538
4539         init_iommu_pm_ops();
4540
4541         for_each_active_iommu(iommu, drhd)
4542                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4543                                                        intel_iommu_groups,
4544                                                        iommu->name);
4545
4546         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4547         bus_register_notifier(&pci_bus_type, &device_nb);
4548         if (si_domain && !hw_pass_through)
4549                 register_memory_notifier(&intel_iommu_memory_nb);
4550
4551         intel_iommu_enabled = 1;
4552
4553         return 0;
4554
4555 out_free_reserved_range:
4556         put_iova_domain(&reserved_iova_list);
4557 out_free_dmar:
4558         intel_iommu_free_dmars();
4559         up_write(&dmar_global_lock);
4560         iommu_exit_mempool();
4561         return ret;
4562 }
4563
4564 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4565 {
4566         struct intel_iommu *iommu = opaque;
4567
4568         iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4569         return 0;
4570 }
4571
4572 /*
4573  * NB - intel-iommu lacks any sort of reference counting for the users of
4574  * dependent devices.  If multiple endpoints have intersecting dependent
4575  * devices, unbinding the driver from any one of them will possibly leave
4576  * the others unable to operate.
4577  */
4578 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4579                                            struct device *dev)
4580 {
4581         if (!iommu || !dev || !dev_is_pci(dev))
4582                 return;
4583
4584         pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4585 }
4586
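/*
 * Remove the device_domain_info matching (iommu, bus, devfn) for @dev
 * from @domain: tear down its context entry and device IOTLB, detach its
 * dependent DMA aliases, and, if no other device behind the same iommu
 * remains in the domain, detach the domain from that iommu as well.
 */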
4587 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4588                                        struct device *dev)
4589 {
4590         struct device_domain_info *info, *tmp;
4591         struct intel_iommu *iommu;
4592         unsigned long flags;
4593         bool found = false;
4594         u8 bus, devfn;
4595
4596         iommu = device_to_iommu(dev, &bus, &devfn);
4597         if (!iommu)
4598                 return;
4599
4600         spin_lock_irqsave(&device_domain_lock, flags);
4601         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4602                 if (info->iommu == iommu && info->bus == bus &&
4603                     info->devfn == devfn) {
4604                         unlink_domain_info(info);
4605                         spin_unlock_irqrestore(&device_domain_lock, flags);
4606
4607                         iommu_disable_dev_iotlb(info);
4608                         iommu_detach_dev(iommu, info->bus, info->devfn);
4609                         iommu_detach_dependent_devices(iommu, dev);
4610                         free_devinfo_mem(info);
4611
4612                         spin_lock_irqsave(&device_domain_lock, flags);
4613
4614                         if (found)
4615                                 break;
4616                         else
4617                                 continue;
4618                 }
4619
4620                 /* If there are no other devices under the same iommu
4621                  * owned by this domain, clear this iommu from the domain's
4622                  * iommu_bmp and update the iommu count and coherency.
4623                  */
4624                 if (info->iommu == iommu)
4625                         found = true;
4626         }
4627
4628         spin_unlock_irqrestore(&device_domain_lock, flags);
4629
4630         if (!found) {
4631                 domain_detach_iommu(domain, iommu);
4632                 if (!domain_type_is_vm_or_si(domain))
4633                         iommu_detach_domain(domain, iommu);
4634         }
4635 }
4636
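/*
 * Initialize a domain created through the external IOMMU API (as opposed
 * to one built internally for the DMA API): set up its IOVA allocator and
 * reserved ranges, derive the adjusted address width from @guest_width,
 * and allocate the top-level page directory.  The per-iommu capability
 * bits start cleared and are refined as devices attach.
 */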
4637 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4638 {
4639         int adjust_width;
4640
4641         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4642                         DMA_32BIT_PFN);
4643         domain_reserve_special_ranges(domain);
4644
4645         /* calculate AGAW */
4646         domain->gaw = guest_width;
4647         adjust_width = guestwidth_to_adjustwidth(guest_width);
4648         domain->agaw = width_to_agaw(adjust_width);
4649
4650         domain->iommu_coherency = 0;
4651         domain->iommu_snooping = 0;
4652         domain->iommu_superpage = 0;
4653         domain->max_addr = 0;
4654
4655         /* always allocate the top pgd */
4656         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4657         if (!domain->pgd)
4658                 return -ENOMEM;
4659         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4660         return 0;
4661 }
4662
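/*
 * IOMMU API domain allocation.  Only IOMMU_DOMAIN_UNMANAGED is supported;
 * the domain is created as a virtual-machine domain with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH and an aperture covering the whole
 * addressable range for that width.
 */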
4663 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4664 {
4665         struct dmar_domain *dmar_domain;
4666         struct iommu_domain *domain;
4667
4668         if (type != IOMMU_DOMAIN_UNMANAGED)
4669                 return NULL;
4670
4671         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4672         if (!dmar_domain) {
4673                 pr_err("Can't allocate dmar_domain\n");
4674                 return NULL;
4675         }
4676         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4677                 pr_err("Domain initialization failed\n");
4678                 domain_exit(dmar_domain);
4679                 return NULL;
4680         }
4681         domain_update_iommu_cap(dmar_domain);
4682
4683         domain = &dmar_domain->domain;
4684         domain->geometry.aperture_start = 0;
4685         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4686         domain->geometry.force_aperture = true;
4687
4688         return domain;
4689 }
4690
4691 static void intel_iommu_domain_free(struct iommu_domain *domain)
4692 {
4693         domain_exit(to_dmar_domain(domain));
4694 }
4695
4696 static int intel_iommu_attach_device(struct iommu_domain *domain,
4697                                      struct device *dev)
4698 {
4699         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4700         struct intel_iommu *iommu;
4701         int addr_width;
4702         u8 bus, devfn;
4703
4704         if (device_is_rmrr_locked(dev)) {
4705                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4706                 return -EPERM;
4707         }
4708
4709         /* normally dev is not mapped */
4710         if (unlikely(domain_context_mapped(dev))) {
4711                 struct dmar_domain *old_domain;
4712
4713                 old_domain = find_domain(dev);
4714                 if (old_domain) {
4715                         if (domain_type_is_vm_or_si(dmar_domain))
4716                                 domain_remove_one_dev_info(old_domain, dev);
4717                         else
4718                                 domain_remove_dev_info(old_domain);
4719
4720                         if (!domain_type_is_vm_or_si(old_domain) &&
4721                              list_empty(&old_domain->devices))
4722                                 domain_exit(old_domain);
4723                 }
4724         }
4725
4726         iommu = device_to_iommu(dev, &bus, &devfn);
4727         if (!iommu)
4728                 return -ENODEV;
4729
4730         /* check if this iommu agaw is sufficient for max mapped address */
4731         addr_width = agaw_to_width(iommu->agaw);
4732         if (addr_width > cap_mgaw(iommu->cap))
4733                 addr_width = cap_mgaw(iommu->cap);
4734
4735         if (dmar_domain->max_addr > (1LL << addr_width)) {
4736                 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4737                        __func__, addr_width,
4738                        dmar_domain->max_addr);
4739                 return -EFAULT;
4740         }
4741         dmar_domain->gaw = addr_width;
4742
4743         /*
4744          * Knock out extra levels of page tables if necessary
4745          */
4746         while (iommu->agaw < dmar_domain->agaw) {
4747                 struct dma_pte *pte;
4748
4749                 pte = dmar_domain->pgd;
4750                 if (dma_pte_present(pte)) {
4751                         dmar_domain->pgd = (struct dma_pte *)
4752                                 phys_to_virt(dma_pte_addr(pte));
4753                         free_pgtable_page(pte);
4754                 }
4755                 dmar_domain->agaw--;
4756         }
4757
4758         return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4759 }
4760
4761 static void intel_iommu_detach_device(struct iommu_domain *domain,
4762                                       struct device *dev)
4763 {
4764         domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4765 }
4766
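/*
 * Map [iova, iova + size) to @hpa in an IOMMU API domain.  The IOMMU_*
 * protection flags are translated into VT-d PTE bits, and the domain's
 * max_addr is grown (or the request rejected) so that a later attach can
 * verify the hardware address width still covers every mapping.
 */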
4767 static int intel_iommu_map(struct iommu_domain *domain,
4768                            unsigned long iova, phys_addr_t hpa,
4769                            size_t size, int iommu_prot)
4770 {
4771         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4772         u64 max_addr;
4773         int prot = 0;
4774         int ret;
4775
4776         if (iommu_prot & IOMMU_READ)
4777                 prot |= DMA_PTE_READ;
4778         if (iommu_prot & IOMMU_WRITE)
4779                 prot |= DMA_PTE_WRITE;
4780         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4781                 prot |= DMA_PTE_SNP;
4782
4783         max_addr = iova + size;
4784         if (dmar_domain->max_addr < max_addr) {
4785                 u64 end;
4786
4787                 /* check if minimum agaw is sufficient for mapped address */
4788                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4789                 if (end < max_addr) {
4790                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4791                                __func__, dmar_domain->gaw,
4792                                max_addr);
4793                         return -EFAULT;
4794                 }
4795                 dmar_domain->max_addr = max_addr;
4796         }
4797         /* Round up the size to the next multiple of VTD_PAGE_SIZE, if it and
4798            the low bits of hpa would take us onto the next page. */
4799         size = aligned_nrpages(hpa, size);
4800         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4801                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4802         return ret;
4803 }
4804
4805 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4806                                 unsigned long iova, size_t size)
4807 {
4808         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4809         struct page *freelist = NULL;
4810         struct intel_iommu *iommu;
4811         unsigned long start_pfn, last_pfn;
4812         unsigned int npages;
4813         int iommu_id, num, ndomains, level = 0;
4814
4815         /* Cope with horrid API which requires us to unmap more than the
4816            size argument if it happens to be a large-page mapping. */
4817         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4818                 BUG();
4819
4820         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4821                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4822
4823         start_pfn = iova >> VTD_PAGE_SHIFT;
4824         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4825
4826         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4827
4828         npages = last_pfn - start_pfn + 1;
4829
4830         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4831                 iommu = g_iommus[iommu_id];
4832
4833                 /*
4834                  * Find the domain id (bit position) of dmar_domain on this iommu.
4835                  */
4836                 ndomains = cap_ndoms(iommu->cap);
4837                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4838                         if (iommu->domains[num] == dmar_domain)
4839                                 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4840                                                       npages, !freelist, 0);
4841                 }
4842
4843         }
4844
4845         dma_free_pagelist(freelist);
4846
4847         if (dmar_domain->max_addr == iova + size)
4848                 dmar_domain->max_addr = iova;
4849
4850         return size;
4851 }
4852
4853 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4854                                             dma_addr_t iova)
4855 {
4856         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4857         struct dma_pte *pte;
4858         int level = 0;
4859         u64 phys = 0;
4860
4861         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4862         if (pte)
4863                 phys = dma_pte_addr(pte);
4864
4865         return phys;
4866 }
4867
4868 static bool intel_iommu_capable(enum iommu_cap cap)
4869 {
4870         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4871                 return domain_update_iommu_snooping(NULL) == 1;
4872         if (cap == IOMMU_CAP_INTR_REMAP)
4873                 return irq_remapping_enabled == 1;
4874
4875         return false;
4876 }
4877
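/*
 * add_device callback: link the device to its IOMMU's sysfs node and put
 * it into (or create) the IOMMU group derived from its DMA aliases.
 */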
4878 static int intel_iommu_add_device(struct device *dev)
4879 {
4880         struct intel_iommu *iommu;
4881         struct iommu_group *group;
4882         u8 bus, devfn;
4883
4884         iommu = device_to_iommu(dev, &bus, &devfn);
4885         if (!iommu)
4886                 return -ENODEV;
4887
4888         iommu_device_link(iommu->iommu_dev, dev);
4889
4890         group = iommu_group_get_for_dev(dev);
4891
4892         if (IS_ERR(group))
4893                 return PTR_ERR(group);
4894
4895         iommu_group_put(group);
4896         return 0;
4897 }
4898
4899 static void intel_iommu_remove_device(struct device *dev)
4900 {
4901         struct intel_iommu *iommu;
4902         u8 bus, devfn;
4903
4904         iommu = device_to_iommu(dev, &bus, &devfn);
4905         if (!iommu)
4906                 return;
4907
4908         iommu_group_remove_device(dev);
4909
4910         iommu_device_unlink(iommu->iommu_dev, dev);
4911 }
4912
4913 static const struct iommu_ops intel_iommu_ops = {
4914         .capable        = intel_iommu_capable,
4915         .domain_alloc   = intel_iommu_domain_alloc,
4916         .domain_free    = intel_iommu_domain_free,
4917         .attach_dev     = intel_iommu_attach_device,
4918         .detach_dev     = intel_iommu_detach_device,
4919         .map            = intel_iommu_map,
4920         .unmap          = intel_iommu_unmap,
4921         .map_sg         = default_iommu_map_sg,
4922         .iova_to_phys   = intel_iommu_iova_to_phys,
4923         .add_device     = intel_iommu_add_device,
4924         .remove_device  = intel_iommu_remove_device,
4925         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4926 };
4927
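/*
 * The callbacks above are reached through the generic IOMMU API declared
 * in include/linux/iommu.h.  A minimal, uncompiled sketch of how a caller
 * such as VFIO drives them; the local names and the 4K example mapping
 * are illustrative only, the API calls are the standard ones:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (!dom)
 *		return -ENOMEM;
 *	ret = iommu_attach_device(dom, &pdev->dev);
 *	if (!ret)
 *		ret = iommu_map(dom, iova, phys, SZ_4K,
 *				IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, SZ_4K);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */
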
4928 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4929 {
4930         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4931         pr_info("Disabling IOMMU for graphics on this chipset\n");
4932         dmar_map_gfx = 0;
4933 }
4934
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4942
4943 static void quirk_iommu_rwbf(struct pci_dev *dev)
4944 {
4945         /*
4946          * Mobile 4 Series Chipset neglects to set RWBF capability,
4947          * but needs it. Same seems to hold for the desktop versions.
4948          */
4949         pr_info("Forcing write-buffer flush capability\n");
4950         rwbf_quirk = 1;
4951 }
4952
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4960
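/*
 * GGC is the graphics control register in the integrated-graphics host
 * bridge config space (the register layout below is assumed to match the
 * Ironlake-era parts targeted by the Calpella quirk).  The quirk only
 * cares whether the BIOS allocated GTT stolen memory for VT-d use
 * (GGC_MEMORY_VT_ENABLED); without it the IGD cannot run behind the
 * IOMMU and graphics translation is disabled instead.
 */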
4961 #define GGC 0x52
4962 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4963 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4964 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4965 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4966 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4967 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4968 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4969 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4970
4971 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4972 {
4973         unsigned short ggc;
4974
4975         if (pci_read_config_word(dev, GGC, &ggc))
4976                 return;
4977
4978         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4979                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4980                 dmar_map_gfx = 0;
4981         } else if (dmar_map_gfx) {
4982                 /* we have to ensure the gfx device is idle before we flush */
4983                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
4984                 intel_iommu_strict = 1;
4985         }
4986 }
4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4988 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4989 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4991
4992 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4993    ISOCH DMAR unit for the Azalia sound device, but not give it any
4994    TLB entries, which causes it to deadlock. Check for that.  We do
4995    this in a function called from init_dmars(), instead of in a PCI
4996    quirk, because we don't want to print the obnoxious "BIOS broken"
4997    message if VT-d is actually disabled.
4998 */
4999 static void __init check_tylersburg_isoch(void)
5000 {
5001         struct pci_dev *pdev;
5002         uint32_t vtisochctrl;
5003
5004         /* If there's no Azalia in the system anyway, forget it. */
5005         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5006         if (!pdev)
5007                 return;
5008         pci_dev_put(pdev);
5009
5010         /* System Management Registers. Might be hidden, in which case
5011            we can't do the sanity check. But that's OK, because the
5012            known-broken BIOSes _don't_ actually hide it, so far. */
5013         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5014         if (!pdev)
5015                 return;
5016
5017         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5018                 pci_dev_put(pdev);
5019                 return;
5020         }
5021
5022         pci_dev_put(pdev);
5023
5024         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5025         if (vtisochctrl & 1)
5026                 return;
5027
5028         /* Drop all bits other than the number of TLB entries */
5029         vtisochctrl &= 0x1c;
5030
5031         /* If we have the recommended number of TLB entries (16), fine. */
5032         if (vtisochctrl == 0x10)
5033                 return;
5034
5035         /* Zero TLB entries? You get to ride the short bus to school. */
5036         if (!vtisochctrl) {
5037                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5038                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5039                      dmi_get_system_info(DMI_BIOS_VENDOR),
5040                      dmi_get_system_info(DMI_BIOS_VERSION),
5041                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5042                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5043                 return;
5044         }
5045
5046         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5047                vtisochctrl);
5048 }