Revert "Merge remote-tracking branch 'linux-2.6.32.y/master' into develop"
[firefly-linux-kernel-4.4.55.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74 /* page table handling */
75 #define LEVEL_STRIDE            (9)
76 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
77
78 static inline int agaw_to_level(int agaw)
79 {
80         return agaw + 2;
81 }
82
83 static inline int agaw_to_width(int agaw)
84 {
85         return 30 + agaw * LEVEL_STRIDE;
86 }
87
88 static inline int width_to_agaw(int width)
89 {
90         return (width - 30) / LEVEL_STRIDE;
91 }
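/*
 * Illustrative worked example (not in the original source): with the
 * default 48-bit address width, width_to_agaw(48) = (48 - 30) / 9 = 2,
 * and agaw_to_level(2) = 4, i.e. a 4-level page table; each level
 * resolves LEVEL_STRIDE = 9 bits of the DMA pfn.
 */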
92
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95         return (level - 1) * LEVEL_STRIDE;
96 }
97
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102
103 static inline unsigned long level_mask(int level)
104 {
105         return -1UL << level_to_offset_bits(level);
106 }
107
108 static inline unsigned long level_size(int level)
109 {
110         return 1UL << level_to_offset_bits(level);
111 }
112
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115         return (pfn + level_size(level) - 1) & level_mask(level);
116 }
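/*
 * Illustrative example (not in the original source): at level 2,
 * level_size(2) = 1 << 9 = 512 pfns, so align_to_level(0x1001, 2)
 * rounds pfn 0x1001 up to 0x1200, the next 512-pfn boundary.
 */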
117
118 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
119    are never going to work. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131         return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135         return page_to_dma_pfn(virt_to_page(p));
136 }
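/*
 * Illustrative note (not in the original source): on x86 with 4KiB MM
 * pages PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the conversions above are
 * identity shifts; with a larger MM page size (e.g. a hypothetical
 * PAGE_SHIFT of 16) one MM pfn would correspond to 16 consecutive DMA pfns.
 */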
137
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143
144 /*
145  * 0: Present
146  * 1-11: Reserved
147  * 12-63: Context Ptr (12 - (haw-1))
148  * 64-127: Reserved
149  */
150 struct root_entry {
151         u64     val;
152         u64     rsvd1;
153 };
154 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
155 static inline bool root_present(struct root_entry *root)
156 {
157         return (root->val & 1);
158 }
159 static inline void set_root_present(struct root_entry *root)
160 {
161         root->val |= 1;
162 }
163 static inline void set_root_value(struct root_entry *root, unsigned long value)
164 {
165         root->val |= value & VTD_PAGE_MASK;
166 }
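/*
 * Sketch of how a root entry is typically programmed (mirrors
 * device_to_context_entry() below; illustrative only):
 *
 *      set_root_value(root, virt_to_phys(context_table));
 *      set_root_present(root);
 *      __iommu_flush_cache(iommu, root, sizeof(*root));
 */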
167
168 static inline struct context_entry *
169 get_context_addr_from_root(struct root_entry *root)
170 {
171         return (struct context_entry *)
172                 (root_present(root)?phys_to_virt(
173                 root->val & VTD_PAGE_MASK) :
174                 NULL);
175 }
176
177 /*
178  * low 64 bits:
179  * 0: present
180  * 1: fault processing disable
181  * 2-3: translation type
182  * 12-63: address space root
183  * high 64 bits:
184  * 0-2: address width
185  * 3-6: avail
186  * 8-23: domain id
187  */
188 struct context_entry {
189         u64 lo;
190         u64 hi;
191 };
192
193 static inline bool context_present(struct context_entry *context)
194 {
195         return (context->lo & 1);
196 }
197 static inline void context_set_present(struct context_entry *context)
198 {
199         context->lo |= 1;
200 }
201
202 static inline void context_set_fault_enable(struct context_entry *context)
203 {
204         context->lo &= (((u64)-1) << 2) | 1;
205 }
206
207 static inline void context_set_translation_type(struct context_entry *context,
208                                                 unsigned long value)
209 {
210         context->lo &= (((u64)-1) << 4) | 3;
211         context->lo |= (value & 3) << 2;
212 }
213
214 static inline void context_set_address_root(struct context_entry *context,
215                                             unsigned long value)
216 {
217         context->lo |= value & VTD_PAGE_MASK;
218 }
219
220 static inline void context_set_address_width(struct context_entry *context,
221                                              unsigned long value)
222 {
223         context->hi |= value & 7;
224 }
225
226 static inline void context_set_domain_id(struct context_entry *context,
227                                          unsigned long value)
228 {
229         context->hi |= (value & ((1 << 16) - 1)) << 8;
230 }
231
232 static inline void context_clear_entry(struct context_entry *context)
233 {
234         context->lo = 0;
235         context->hi = 0;
236 }
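/*
 * Sketch of the order in which a context entry is typically filled in
 * (mirrors domain_context_mapping_one() below; illustrative only):
 *
 *      context_set_domain_id(context, id);
 *      context_set_address_root(context, virt_to_phys(pgd));
 *      context_set_address_width(context, iommu->agaw);
 *      context_set_translation_type(context, translation);
 *      context_set_fault_enable(context);
 *      context_set_present(context);
 */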
237
238 /*
239  * 0: readable
240  * 1: writable
241  * 2-6: reserved
242  * 7: super page
243  * 8-10: available
244  * 11: snoop behavior
245  * 12-63: Host physical address
246  */
247 struct dma_pte {
248         u64 val;
249 };
250
251 static inline void dma_clear_pte(struct dma_pte *pte)
252 {
253         pte->val = 0;
254 }
255
256 static inline void dma_set_pte_readable(struct dma_pte *pte)
257 {
258         pte->val |= DMA_PTE_READ;
259 }
260
261 static inline void dma_set_pte_writable(struct dma_pte *pte)
262 {
263         pte->val |= DMA_PTE_WRITE;
264 }
265
266 static inline void dma_set_pte_snp(struct dma_pte *pte)
267 {
268         pte->val |= DMA_PTE_SNP;
269 }
270
271 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
272 {
273         pte->val = (pte->val & ~3) | (prot & 3);
274 }
275
276 static inline u64 dma_pte_addr(struct dma_pte *pte)
277 {
278 #ifdef CONFIG_64BIT
279         return pte->val & VTD_PAGE_MASK;
280 #else
281         /* Must have a full atomic 64-bit read */
282         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
283 #endif
284 }
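/*
 * Note (added for clarity): __cmpxchg64(pte, 0ULL, 0ULL) only writes when
 * the PTE is already zero (and then writes zero, a no-op), so it acts as
 * an atomic 64-bit read of pte->val on 32-bit kernels.
 */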
285
286 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
287 {
288         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
289 }
290
291 static inline bool dma_pte_present(struct dma_pte *pte)
292 {
293         return (pte->val & 3) != 0;
294 }
295
296 static inline int first_pte_in_page(struct dma_pte *pte)
297 {
298         return !((unsigned long)pte & ~VTD_PAGE_MASK);
299 }
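/*
 * Illustrative note (not in the original source): PTEs are 8 bytes, so a
 * 4KiB VT-d page holds 512 of them; first_pte_in_page() is true exactly
 * when the pte pointer sits at the start of such a page, which the
 * clear/free loops below use to stop at page-table page boundaries.
 */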
300
301 /*
302  * This domain is a static identity mapping domain.
303  *      1. This domain creates a static 1:1 mapping to all usable memory.
304  *      2. It maps to each iommu if successful.
305  *      3. Each iommu maps to this domain if successful.
306  */
307 static struct dmar_domain *si_domain;
308 static int hw_pass_through = 1;
309
310 /* devices under the same p2p bridge are owned in one domain */
311 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
312
313 /* domain represents a virtual machine; more than one device
314  * across iommus may be owned by one domain, e.g. a kvm guest.
315  */
316 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
317
318 /* si_domain contains multiple devices */
319 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
320
321 struct dmar_domain {
322         int     id;                     /* domain id */
323         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
324
325         struct list_head devices;       /* all devices' list */
326         struct iova_domain iovad;       /* iova's that belong to this domain */
327
328         struct dma_pte  *pgd;           /* virtual address */
329         int             gaw;            /* max guest address width */
330
331         /* adjusted guest address width, 0 is level 2 30-bit */
332         int             agaw;
333
334         int             flags;          /* flags to find out type of domain */
335
336         int             iommu_coherency;/* indicate coherency of iommu access */
337         int             iommu_snooping; /* indicate snooping control feature*/
338         int             iommu_count;    /* reference count of iommu */
339         spinlock_t      iommu_lock;     /* protect iommu set in domain */
340         u64             max_addr;       /* maximum mapped address */
341 };
342
343 /* PCI domain-device relationship */
344 struct device_domain_info {
345         struct list_head link;  /* link to domain siblings */
346         struct list_head global; /* link to global list */
347         int segment;            /* PCI domain */
348         u8 bus;                 /* PCI bus number */
349         u8 devfn;               /* PCI devfn number */
350         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
351         struct intel_iommu *iommu; /* IOMMU used by this device */
352         struct dmar_domain *domain; /* pointer to domain */
353 };
354
355 static void flush_unmaps_timeout(unsigned long data);
356
357 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
358
359 #define HIGH_WATER_MARK 250
360 struct deferred_flush_tables {
361         int next;
362         struct iova *iova[HIGH_WATER_MARK];
363         struct dmar_domain *domain[HIGH_WATER_MARK];
364 };
365
366 static struct deferred_flush_tables *deferred_flush;
367
368 /* bitmap for indexing intel_iommus */
369 static int g_num_of_iommus;
370
371 static DEFINE_SPINLOCK(async_umap_flush_lock);
372 static LIST_HEAD(unmaps_to_do);
373
374 static int timer_on;
375 static long list_size;
376
377 static void domain_remove_dev_info(struct dmar_domain *domain);
378
379 #ifdef CONFIG_DMAR_DEFAULT_ON
380 int dmar_disabled = 0;
381 #else
382 int dmar_disabled = 1;
383 #endif /*CONFIG_DMAR_DEFAULT_ON*/
384
385 static int __initdata dmar_map_gfx = 1;
386 static int dmar_forcedac;
387 static int intel_iommu_strict;
388
389 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
390 static DEFINE_SPINLOCK(device_domain_lock);
391 static LIST_HEAD(device_domain_list);
392
393 static struct iommu_ops intel_iommu_ops;
394
395 static int __init intel_iommu_setup(char *str)
396 {
397         if (!str)
398                 return -EINVAL;
399         while (*str) {
400                 if (!strncmp(str, "on", 2)) {
401                         dmar_disabled = 0;
402                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
403                 } else if (!strncmp(str, "off", 3)) {
404                         dmar_disabled = 1;
405                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
406                 } else if (!strncmp(str, "igfx_off", 8)) {
407                         dmar_map_gfx = 0;
408                         printk(KERN_INFO
409                                 "Intel-IOMMU: disable GFX device mapping\n");
410                 } else if (!strncmp(str, "forcedac", 8)) {
411                         printk(KERN_INFO
412                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
413                         dmar_forcedac = 1;
414                 } else if (!strncmp(str, "strict", 6)) {
415                         printk(KERN_INFO
416                                 "Intel-IOMMU: disable batched IOTLB flush\n");
417                         intel_iommu_strict = 1;
418                 }
419
420                 str += strcspn(str, ",");
421                 while (*str == ',')
422                         str++;
423         }
424         return 0;
425 }
426 __setup("intel_iommu=", intel_iommu_setup);
427
428 static struct kmem_cache *iommu_domain_cache;
429 static struct kmem_cache *iommu_devinfo_cache;
430 static struct kmem_cache *iommu_iova_cache;
431
432 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
433 {
434         unsigned int flags;
435         void *vaddr;
436
437         /* trying to avoid low memory issues */
438         flags = current->flags & PF_MEMALLOC;
439         current->flags |= PF_MEMALLOC;
440         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
441         current->flags &= (~PF_MEMALLOC | flags);
442         return vaddr;
443 }
444
445
446 static inline void *alloc_pgtable_page(void)
447 {
448         unsigned int flags;
449         void *vaddr;
450
451         /* trying to avoid low memory issues */
452         flags = current->flags & PF_MEMALLOC;
453         current->flags |= PF_MEMALLOC;
454         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
455         current->flags &= (~PF_MEMALLOC | flags);
456         return vaddr;
457 }
458
459 static inline void free_pgtable_page(void *vaddr)
460 {
461         free_page((unsigned long)vaddr);
462 }
463
464 static inline void *alloc_domain_mem(void)
465 {
466         return iommu_kmem_cache_alloc(iommu_domain_cache);
467 }
468
469 static void free_domain_mem(void *vaddr)
470 {
471         kmem_cache_free(iommu_domain_cache, vaddr);
472 }
473
474 static inline void * alloc_devinfo_mem(void)
475 {
476         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
477 }
478
479 static inline void free_devinfo_mem(void *vaddr)
480 {
481         kmem_cache_free(iommu_devinfo_cache, vaddr);
482 }
483
484 struct iova *alloc_iova_mem(void)
485 {
486         return iommu_kmem_cache_alloc(iommu_iova_cache);
487 }
488
489 void free_iova_mem(struct iova *iova)
490 {
491         kmem_cache_free(iommu_iova_cache, iova);
492 }
493
494
495 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
496 {
497         unsigned long sagaw;
498         int agaw = -1;
499
500         sagaw = cap_sagaw(iommu->cap);
501         for (agaw = width_to_agaw(max_gaw);
502              agaw >= 0; agaw--) {
503                 if (test_bit(agaw, &sagaw))
504                         break;
505         }
506
507         return agaw;
508 }
509
510 /*
511  * Calculate max SAGAW for each iommu.
512  */
513 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
514 {
515         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
516 }
517
518 /*
519  * calculate agaw for each iommu.
520  * "SAGAW" may be different across iommus, use a default agaw, and
521  * get a supported less agaw for iommus that don't support the default agaw.
522  */
523 int iommu_calculate_agaw(struct intel_iommu *iommu)
524 {
525         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
526 }
527
528 /* This function only returns a single iommu in a domain */
529 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
530 {
531         int iommu_id;
532
533         /* si_domain and vm domain should not get here. */
534         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
535         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
536
537         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
538         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
539                 return NULL;
540
541         return g_iommus[iommu_id];
542 }
543
544 static void domain_update_iommu_coherency(struct dmar_domain *domain)
545 {
546         int i;
547
548         domain->iommu_coherency = 1;
549
550         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
551         for (; i < g_num_of_iommus; ) {
552                 if (!ecap_coherent(g_iommus[i]->ecap)) {
553                         domain->iommu_coherency = 0;
554                         break;
555                 }
556                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
557         }
558 }
559
560 static void domain_update_iommu_snooping(struct dmar_domain *domain)
561 {
562         int i;
563
564         domain->iommu_snooping = 1;
565
566         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
567         for (; i < g_num_of_iommus; ) {
568                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
569                         domain->iommu_snooping = 0;
570                         break;
571                 }
572                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
573         }
574 }
575
576 /* Some capabilities may be different across iommus */
577 static void domain_update_iommu_cap(struct dmar_domain *domain)
578 {
579         domain_update_iommu_coherency(domain);
580         domain_update_iommu_snooping(domain);
581 }
582
583 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
584 {
585         struct dmar_drhd_unit *drhd = NULL;
586         int i;
587
588         for_each_drhd_unit(drhd) {
589                 if (drhd->ignored)
590                         continue;
591                 if (segment != drhd->segment)
592                         continue;
593
594                 for (i = 0; i < drhd->devices_cnt; i++) {
595                         if (drhd->devices[i] &&
596                             drhd->devices[i]->bus->number == bus &&
597                             drhd->devices[i]->devfn == devfn)
598                                 return drhd->iommu;
599                         if (drhd->devices[i] &&
600                             drhd->devices[i]->subordinate &&
601                             drhd->devices[i]->subordinate->number <= bus &&
602                             drhd->devices[i]->subordinate->subordinate >= bus)
603                                 return drhd->iommu;
604                 }
605
606                 if (drhd->include_all)
607                         return drhd->iommu;
608         }
609
610         return NULL;
611 }
612
613 static void domain_flush_cache(struct dmar_domain *domain,
614                                void *addr, int size)
615 {
616         if (!domain->iommu_coherency)
617                 clflush_cache_range(addr, size);
618 }
619
620 /* Gets context entry for a given bus and devfn */
621 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
622                 u8 bus, u8 devfn)
623 {
624         struct root_entry *root;
625         struct context_entry *context;
626         unsigned long phy_addr;
627         unsigned long flags;
628
629         spin_lock_irqsave(&iommu->lock, flags);
630         root = &iommu->root_entry[bus];
631         context = get_context_addr_from_root(root);
632         if (!context) {
633                 context = (struct context_entry *)alloc_pgtable_page();
634                 if (!context) {
635                         spin_unlock_irqrestore(&iommu->lock, flags);
636                         return NULL;
637                 }
638                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
639                 phy_addr = virt_to_phys((void *)context);
640                 set_root_value(root, phy_addr);
641                 set_root_present(root);
642                 __iommu_flush_cache(iommu, root, sizeof(*root));
643         }
644         spin_unlock_irqrestore(&iommu->lock, flags);
645         return &context[devfn];
646 }
647
648 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
649 {
650         struct root_entry *root;
651         struct context_entry *context;
652         int ret;
653         unsigned long flags;
654
655         spin_lock_irqsave(&iommu->lock, flags);
656         root = &iommu->root_entry[bus];
657         context = get_context_addr_from_root(root);
658         if (!context) {
659                 ret = 0;
660                 goto out;
661         }
662         ret = context_present(&context[devfn]);
663 out:
664         spin_unlock_irqrestore(&iommu->lock, flags);
665         return ret;
666 }
667
668 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
669 {
670         struct root_entry *root;
671         struct context_entry *context;
672         unsigned long flags;
673
674         spin_lock_irqsave(&iommu->lock, flags);
675         root = &iommu->root_entry[bus];
676         context = get_context_addr_from_root(root);
677         if (context) {
678                 context_clear_entry(&context[devfn]);
679                 __iommu_flush_cache(iommu, &context[devfn], \
680                         sizeof(*context));
681         }
682         spin_unlock_irqrestore(&iommu->lock, flags);
683 }
684
685 static void free_context_table(struct intel_iommu *iommu)
686 {
687         struct root_entry *root;
688         int i;
689         unsigned long flags;
690         struct context_entry *context;
691
692         spin_lock_irqsave(&iommu->lock, flags);
693         if (!iommu->root_entry) {
694                 goto out;
695         }
696         for (i = 0; i < ROOT_ENTRY_NR; i++) {
697                 root = &iommu->root_entry[i];
698                 context = get_context_addr_from_root(root);
699                 if (context)
700                         free_pgtable_page(context);
701         }
702         free_pgtable_page(iommu->root_entry);
703         iommu->root_entry = NULL;
704 out:
705         spin_unlock_irqrestore(&iommu->lock, flags);
706 }
707
708 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
709                                       unsigned long pfn)
710 {
711         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
712         struct dma_pte *parent, *pte = NULL;
713         int level = agaw_to_level(domain->agaw);
714         int offset;
715
716         BUG_ON(!domain->pgd);
717         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
718         parent = domain->pgd;
719
720         while (level > 0) {
721                 void *tmp_page;
722
723                 offset = pfn_level_offset(pfn, level);
724                 pte = &parent[offset];
725                 if (level == 1)
726                         break;
727
728                 if (!dma_pte_present(pte)) {
729                         uint64_t pteval;
730
731                         tmp_page = alloc_pgtable_page();
732
733                         if (!tmp_page)
734                                 return NULL;
735
736                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
737                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
738                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
739                                 /* Someone else set it while we were thinking; use theirs. */
740                                 free_pgtable_page(tmp_page);
741                         } else {
742                                 dma_pte_addr(pte);
743                                 domain_flush_cache(domain, pte, sizeof(*pte));
744                         }
745                 }
746                 parent = phys_to_virt(dma_pte_addr(pte));
747                 level--;
748         }
749
750         return pte;
751 }
752
753 /* return address's pte at specific level */
754 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
755                                          unsigned long pfn,
756                                          int level)
757 {
758         struct dma_pte *parent, *pte = NULL;
759         int total = agaw_to_level(domain->agaw);
760         int offset;
761
762         parent = domain->pgd;
763         while (level <= total) {
764                 offset = pfn_level_offset(pfn, total);
765                 pte = &parent[offset];
766                 if (level == total)
767                         return pte;
768
769                 if (!dma_pte_present(pte))
770                         break;
771                 parent = phys_to_virt(dma_pte_addr(pte));
772                 total--;
773         }
774         return NULL;
775 }
776
777 /* clear last-level ptes; should be followed by a TLB flush */
778 static void dma_pte_clear_range(struct dmar_domain *domain,
779                                 unsigned long start_pfn,
780                                 unsigned long last_pfn)
781 {
782         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
783         struct dma_pte *first_pte, *pte;
784
785         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
786         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
787         BUG_ON(start_pfn > last_pfn);
788
789         /* we don't need lock here; nobody else touches the iova range */
790         do {
791                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
792                 if (!pte) {
793                         start_pfn = align_to_level(start_pfn + 1, 2);
794                         continue;
795                 }
796                 do { 
797                         dma_clear_pte(pte);
798                         start_pfn++;
799                         pte++;
800                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
801
802                 domain_flush_cache(domain, first_pte,
803                                    (void *)pte - (void *)first_pte);
804
805         } while (start_pfn && start_pfn <= last_pfn);
806 }
807
808 /* free page table pages. last level pte should already be cleared */
809 static void dma_pte_free_pagetable(struct dmar_domain *domain,
810                                    unsigned long start_pfn,
811                                    unsigned long last_pfn)
812 {
813         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
814         struct dma_pte *first_pte, *pte;
815         int total = agaw_to_level(domain->agaw);
816         int level;
817         unsigned long tmp;
818
819         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
820         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
821         BUG_ON(start_pfn > last_pfn);
822
823         /* We don't need lock here; nobody else touches the iova range */
824         level = 2;
825         while (level <= total) {
826                 tmp = align_to_level(start_pfn, level);
827
828                 /* If we can't even clear one PTE at this level, we're done */
829                 if (tmp + level_size(level) - 1 > last_pfn)
830                         return;
831
832                 do {
833                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
834                         if (!pte) {
835                                 tmp = align_to_level(tmp + 1, level + 1);
836                                 continue;
837                         }
838                         do {
839                                 if (dma_pte_present(pte)) {
840                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
841                                         dma_clear_pte(pte);
842                                 }
843                                 pte++;
844                                 tmp += level_size(level);
845                         } while (!first_pte_in_page(pte) &&
846                                  tmp + level_size(level) - 1 <= last_pfn);
847
848                         domain_flush_cache(domain, first_pte,
849                                            (void *)pte - (void *)first_pte);
850                         
851                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
852                 level++;
853         }
854         /* free pgd */
855         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
856                 free_pgtable_page(domain->pgd);
857                 domain->pgd = NULL;
858         }
859 }
860
861 /* iommu handling */
862 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
863 {
864         struct root_entry *root;
865         unsigned long flags;
866
867         root = (struct root_entry *)alloc_pgtable_page();
868         if (!root)
869                 return -ENOMEM;
870
871         __iommu_flush_cache(iommu, root, ROOT_SIZE);
872
873         spin_lock_irqsave(&iommu->lock, flags);
874         iommu->root_entry = root;
875         spin_unlock_irqrestore(&iommu->lock, flags);
876
877         return 0;
878 }
879
880 static void iommu_set_root_entry(struct intel_iommu *iommu)
881 {
882         void *addr;
883         u32 sts;
884         unsigned long flag;
885
886         addr = iommu->root_entry;
887
888         spin_lock_irqsave(&iommu->register_lock, flag);
889         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
890
891         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
892
893         /* Make sure hardware completes it */
894         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895                       readl, (sts & DMA_GSTS_RTPS), sts);
896
897         spin_unlock_irqrestore(&iommu->register_lock, flag);
898 }
899
900 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
901 {
902         u32 val;
903         unsigned long flag;
904
905         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
906                 return;
907
908         spin_lock_irqsave(&iommu->register_lock, flag);
909         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
910
911         /* Make sure hardware completes it */
912         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
913                       readl, (!(val & DMA_GSTS_WBFS)), val);
914
915         spin_unlock_irqrestore(&iommu->register_lock, flag);
916 }
917
918 /* return value determines if we need a write buffer flush */
919 static void __iommu_flush_context(struct intel_iommu *iommu,
920                                   u16 did, u16 source_id, u8 function_mask,
921                                   u64 type)
922 {
923         u64 val = 0;
924         unsigned long flag;
925
926         switch (type) {
927         case DMA_CCMD_GLOBAL_INVL:
928                 val = DMA_CCMD_GLOBAL_INVL;
929                 break;
930         case DMA_CCMD_DOMAIN_INVL:
931                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
932                 break;
933         case DMA_CCMD_DEVICE_INVL:
934                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
935                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
936                 break;
937         default:
938                 BUG();
939         }
940         val |= DMA_CCMD_ICC;
941
942         spin_lock_irqsave(&iommu->register_lock, flag);
943         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
944
945         /* Make sure hardware completes it */
946         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
947                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
948
949         spin_unlock_irqrestore(&iommu->register_lock, flag);
950 }
951
952 /* return value determines if we need a write buffer flush */
953 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
954                                 u64 addr, unsigned int size_order, u64 type)
955 {
956         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
957         u64 val = 0, val_iva = 0;
958         unsigned long flag;
959
960         switch (type) {
961         case DMA_TLB_GLOBAL_FLUSH:
962                 /* global flush doesn't need to set IVA_REG */
963                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
964                 break;
965         case DMA_TLB_DSI_FLUSH:
966                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
967                 break;
968         case DMA_TLB_PSI_FLUSH:
969                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
970                 /* Note: always flush non-leaf currently */
971                 val_iva = size_order | addr;
972                 break;
973         default:
974                 BUG();
975         }
976         /* Note: set drain read/write */
977 #if 0
978         /*
979          * This is probably only there to be extra safe; it looks like we
980          * can ignore it without any impact.
981          */
982         if (cap_read_drain(iommu->cap))
983                 val |= DMA_TLB_READ_DRAIN;
984 #endif
985         if (cap_write_drain(iommu->cap))
986                 val |= DMA_TLB_WRITE_DRAIN;
987
988         spin_lock_irqsave(&iommu->register_lock, flag);
989         /* Note: Only uses first TLB reg currently */
990         if (val_iva)
991                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
992         dmar_writeq(iommu->reg + tlb_offset + 8, val);
993
994         /* Make sure hardware completes it */
995         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
996                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
997
998         spin_unlock_irqrestore(&iommu->register_lock, flag);
999
1000         /* check IOTLB invalidation granularity */
1001         if (DMA_TLB_IAIG(val) == 0)
1002                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1003         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1004                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1005                         (unsigned long long)DMA_TLB_IIRG(type),
1006                         (unsigned long long)DMA_TLB_IAIG(val));
1007 }
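/*
 * Note (added for clarity, based on the VT-d register layout): for a
 * page-selective flush the IVA register combines the page-aligned
 * address (bits 12 and up) with the address-mask order in its low bits,
 * which is why "size_order | addr" above composes val_iva directly.
 */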
1008
1009 static struct device_domain_info *iommu_support_dev_iotlb(
1010         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1011 {
1012         int found = 0;
1013         unsigned long flags;
1014         struct device_domain_info *info;
1015         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1016
1017         if (!ecap_dev_iotlb_support(iommu->ecap))
1018                 return NULL;
1019
1020         if (!iommu->qi)
1021                 return NULL;
1022
1023         spin_lock_irqsave(&device_domain_lock, flags);
1024         list_for_each_entry(info, &domain->devices, link)
1025                 if (info->bus == bus && info->devfn == devfn) {
1026                         found = 1;
1027                         break;
1028                 }
1029         spin_unlock_irqrestore(&device_domain_lock, flags);
1030
1031         if (!found || !info->dev)
1032                 return NULL;
1033
1034         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1035                 return NULL;
1036
1037         if (!dmar_find_matched_atsr_unit(info->dev))
1038                 return NULL;
1039
1040         info->iommu = iommu;
1041
1042         return info;
1043 }
1044
1045 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1046 {
1047         if (!info)
1048                 return;
1049
1050         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1051 }
1052
1053 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1054 {
1055         if (!info->dev || !pci_ats_enabled(info->dev))
1056                 return;
1057
1058         pci_disable_ats(info->dev);
1059 }
1060
1061 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1062                                   u64 addr, unsigned mask)
1063 {
1064         u16 sid, qdep;
1065         unsigned long flags;
1066         struct device_domain_info *info;
1067
1068         spin_lock_irqsave(&device_domain_lock, flags);
1069         list_for_each_entry(info, &domain->devices, link) {
1070                 if (!info->dev || !pci_ats_enabled(info->dev))
1071                         continue;
1072
1073                 sid = info->bus << 8 | info->devfn;
1074                 qdep = pci_ats_queue_depth(info->dev);
1075                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1076         }
1077         spin_unlock_irqrestore(&device_domain_lock, flags);
1078 }
1079
1080 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1081                                   unsigned long pfn, unsigned int pages)
1082 {
1083         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1084         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1085
1086         BUG_ON(pages == 0);
1087
1088         /*
1089          * Fall back to domain selective flush if no PSI support or the size is
1090          * too big.
1091          * PSI requires page size to be 2 ^ x, and the base address is naturally
1092          * aligned to the size
1093          */
1094         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1095                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1096                                                 DMA_TLB_DSI_FLUSH);
1097         else
1098                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1099                                                 DMA_TLB_PSI_FLUSH);
1100
1101         /*
1102          * In caching mode, domain ID 0 is reserved for non-present to present
1103          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1104          */
1105         if (!cap_caching_mode(iommu->cap) || did)
1106                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1107 }
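/*
 * Illustrative example (not in the original source): flushing pages = 3
 * gives mask = ilog2(__roundup_pow_of_two(3)) = ilog2(4) = 2, i.e. a
 * 4-page aligned region; if that mask exceeds cap_max_amask_val() the
 * code above falls back to a domain-selective flush.
 */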
1108
1109 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1110 {
1111         u32 pmen;
1112         unsigned long flags;
1113
1114         spin_lock_irqsave(&iommu->register_lock, flags);
1115         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1116         pmen &= ~DMA_PMEN_EPM;
1117         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1118
1119         /* wait for the protected region status bit to clear */
1120         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1121                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1122
1123         spin_unlock_irqrestore(&iommu->register_lock, flags);
1124 }
1125
1126 static int iommu_enable_translation(struct intel_iommu *iommu)
1127 {
1128         u32 sts;
1129         unsigned long flags;
1130
1131         spin_lock_irqsave(&iommu->register_lock, flags);
1132         iommu->gcmd |= DMA_GCMD_TE;
1133         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1134
1135         /* Make sure hardware completes it */
1136         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1137                       readl, (sts & DMA_GSTS_TES), sts);
1138
1139         spin_unlock_irqrestore(&iommu->register_lock, flags);
1140         return 0;
1141 }
1142
1143 static int iommu_disable_translation(struct intel_iommu *iommu)
1144 {
1145         u32 sts;
1146         unsigned long flag;
1147
1148         spin_lock_irqsave(&iommu->register_lock, flag);
1149         iommu->gcmd &= ~DMA_GCMD_TE;
1150         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1151
1152         /* Make sure hardware completes it */
1153         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1154                       readl, (!(sts & DMA_GSTS_TES)), sts);
1155
1156         spin_unlock_irqrestore(&iommu->register_lock, flag);
1157         return 0;
1158 }
1159
1160
1161 static int iommu_init_domains(struct intel_iommu *iommu)
1162 {
1163         unsigned long ndomains;
1164         unsigned long nlongs;
1165
1166         ndomains = cap_ndoms(iommu->cap);
1167         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1168         nlongs = BITS_TO_LONGS(ndomains);
1169
1170         spin_lock_init(&iommu->lock);
1171
1172         /* TBD: there might be 64K domains,
1173          * consider other allocation for future chip
1174          */
1175         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1176         if (!iommu->domain_ids) {
1177                 printk(KERN_ERR "Allocating domain id array failed\n");
1178                 return -ENOMEM;
1179         }
1180         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1181                         GFP_KERNEL);
1182         if (!iommu->domains) {
1183                 printk(KERN_ERR "Allocating domain array failed\n");
1184                 return -ENOMEM;
1185         }
1186
1187         /*
1188          * if Caching mode is set, then invalid translations are tagged
1189          * with domain id 0. Hence we need to pre-allocate it.
1190          */
1191         if (cap_caching_mode(iommu->cap))
1192                 set_bit(0, iommu->domain_ids);
1193         return 0;
1194 }
1195
1196
1197 static void domain_exit(struct dmar_domain *domain);
1198 static void vm_domain_exit(struct dmar_domain *domain);
1199
1200 void free_dmar_iommu(struct intel_iommu *iommu)
1201 {
1202         struct dmar_domain *domain;
1203         int i;
1204         unsigned long flags;
1205
1206         if ((iommu->domains) && (iommu->domain_ids)) {
1207                 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1208                 for (; i < cap_ndoms(iommu->cap); ) {
1209                         domain = iommu->domains[i];
1210                         clear_bit(i, iommu->domain_ids);
1211
1212                         spin_lock_irqsave(&domain->iommu_lock, flags);
1213                         if (--domain->iommu_count == 0) {
1214                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1215                                         vm_domain_exit(domain);
1216                                 else
1217                                         domain_exit(domain);
1218                         }
1219                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1220
1221                         i = find_next_bit(iommu->domain_ids,
1222                                 cap_ndoms(iommu->cap), i+1);
1223                 }
1224         }
1225
1226         if (iommu->gcmd & DMA_GCMD_TE)
1227                 iommu_disable_translation(iommu);
1228
1229         if (iommu->irq) {
1230                 set_irq_data(iommu->irq, NULL);
1231                 /* This will mask the irq */
1232                 free_irq(iommu->irq, iommu);
1233                 destroy_irq(iommu->irq);
1234         }
1235
1236         kfree(iommu->domains);
1237         kfree(iommu->domain_ids);
1238
1239         g_iommus[iommu->seq_id] = NULL;
1240
1241         /* if all iommus are freed, free g_iommus */
1242         for (i = 0; i < g_num_of_iommus; i++) {
1243                 if (g_iommus[i])
1244                         break;
1245         }
1246
1247         if (i == g_num_of_iommus)
1248                 kfree(g_iommus);
1249
1250         /* free context mapping */
1251         free_context_table(iommu);
1252 }
1253
1254 static struct dmar_domain *alloc_domain(void)
1255 {
1256         struct dmar_domain *domain;
1257
1258         domain = alloc_domain_mem();
1259         if (!domain)
1260                 return NULL;
1261
1262         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1263         domain->flags = 0;
1264
1265         return domain;
1266 }
1267
1268 static int iommu_attach_domain(struct dmar_domain *domain,
1269                                struct intel_iommu *iommu)
1270 {
1271         int num;
1272         unsigned long ndomains;
1273         unsigned long flags;
1274
1275         ndomains = cap_ndoms(iommu->cap);
1276
1277         spin_lock_irqsave(&iommu->lock, flags);
1278
1279         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1280         if (num >= ndomains) {
1281                 spin_unlock_irqrestore(&iommu->lock, flags);
1282                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1283                 return -ENOMEM;
1284         }
1285
1286         domain->id = num;
1287         set_bit(num, iommu->domain_ids);
1288         set_bit(iommu->seq_id, &domain->iommu_bmp);
1289         iommu->domains[num] = domain;
1290         spin_unlock_irqrestore(&iommu->lock, flags);
1291
1292         return 0;
1293 }
1294
1295 static void iommu_detach_domain(struct dmar_domain *domain,
1296                                 struct intel_iommu *iommu)
1297 {
1298         unsigned long flags;
1299         int num, ndomains;
1300         int found = 0;
1301
1302         spin_lock_irqsave(&iommu->lock, flags);
1303         ndomains = cap_ndoms(iommu->cap);
1304         num = find_first_bit(iommu->domain_ids, ndomains);
1305         for (; num < ndomains; ) {
1306                 if (iommu->domains[num] == domain) {
1307                         found = 1;
1308                         break;
1309                 }
1310                 num = find_next_bit(iommu->domain_ids,
1311                                     cap_ndoms(iommu->cap), num+1);
1312         }
1313
1314         if (found) {
1315                 clear_bit(num, iommu->domain_ids);
1316                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1317                 iommu->domains[num] = NULL;
1318         }
1319         spin_unlock_irqrestore(&iommu->lock, flags);
1320 }
1321
1322 static struct iova_domain reserved_iova_list;
1323 static struct lock_class_key reserved_rbtree_key;
1324
1325 static void dmar_init_reserved_ranges(void)
1326 {
1327         struct pci_dev *pdev = NULL;
1328         struct iova *iova;
1329         int i;
1330
1331         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1332
1333         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1334                 &reserved_rbtree_key);
1335
1336         /* IOAPIC ranges shouldn't be accessed by DMA */
1337         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1338                 IOVA_PFN(IOAPIC_RANGE_END));
1339         if (!iova)
1340                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1341
1342         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1343         for_each_pci_dev(pdev) {
1344                 struct resource *r;
1345
1346                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1347                         r = &pdev->resource[i];
1348                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1349                                 continue;
1350                         iova = reserve_iova(&reserved_iova_list,
1351                                             IOVA_PFN(r->start),
1352                                             IOVA_PFN(r->end));
1353                         if (!iova)
1354                                 printk(KERN_ERR "Reserve iova failed\n");
1355                 }
1356         }
1357
1358 }
1359
1360 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1361 {
1362         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1363 }
1364
1365 static inline int guestwidth_to_adjustwidth(int gaw)
1366 {
1367         int agaw;
1368         int r = (gaw - 12) % 9;
1369
1370         if (r == 0)
1371                 agaw = gaw;
1372         else
1373                 agaw = gaw + 9 - r;
1374         if (agaw > 64)
1375                 agaw = 64;
1376         return agaw;
1377 }
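/*
 * Illustrative example (not in the original source): for a guest width
 * of 40 bits, r = (40 - 12) % 9 = 1, so the width is rounded up to
 * 40 + 9 - 1 = 48 bits, the next value representable with whole 9-bit
 * page-table levels above the 12-bit page offset.
 */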
1378
1379 static int domain_init(struct dmar_domain *domain, int guest_width)
1380 {
1381         struct intel_iommu *iommu;
1382         int adjust_width, agaw;
1383         unsigned long sagaw;
1384
1385         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1386         spin_lock_init(&domain->iommu_lock);
1387
1388         domain_reserve_special_ranges(domain);
1389
1390         /* calculate AGAW */
1391         iommu = domain_get_iommu(domain);
1392         if (guest_width > cap_mgaw(iommu->cap))
1393                 guest_width = cap_mgaw(iommu->cap);
1394         domain->gaw = guest_width;
1395         adjust_width = guestwidth_to_adjustwidth(guest_width);
1396         agaw = width_to_agaw(adjust_width);
1397         sagaw = cap_sagaw(iommu->cap);
1398         if (!test_bit(agaw, &sagaw)) {
1399                 /* hardware doesn't support it, choose a bigger one */
1400                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1401                 agaw = find_next_bit(&sagaw, 5, agaw);
1402                 if (agaw >= 5)
1403                         return -ENODEV;
1404         }
1405         domain->agaw = agaw;
1406         INIT_LIST_HEAD(&domain->devices);
1407
1408         if (ecap_coherent(iommu->ecap))
1409                 domain->iommu_coherency = 1;
1410         else
1411                 domain->iommu_coherency = 0;
1412
1413         if (ecap_sc_support(iommu->ecap))
1414                 domain->iommu_snooping = 1;
1415         else
1416                 domain->iommu_snooping = 0;
1417
1418         domain->iommu_count = 1;
1419
1420         /* always allocate the top pgd */
1421         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1422         if (!domain->pgd)
1423                 return -ENOMEM;
1424         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1425         return 0;
1426 }
1427
1428 static void domain_exit(struct dmar_domain *domain)
1429 {
1430         struct dmar_drhd_unit *drhd;
1431         struct intel_iommu *iommu;
1432
1433         /* Domain 0 is reserved, so don't process it */
1434         if (!domain)
1435                 return;
1436
1437         domain_remove_dev_info(domain);
1438         /* destroy iovas */
1439         put_iova_domain(&domain->iovad);
1440
1441         /* clear ptes */
1442         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1443
1444         /* free page tables */
1445         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1446
1447         for_each_active_iommu(iommu, drhd)
1448                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1449                         iommu_detach_domain(domain, iommu);
1450
1451         free_domain_mem(domain);
1452 }
1453
1454 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1455                                  u8 bus, u8 devfn, int translation)
1456 {
1457         struct context_entry *context;
1458         unsigned long flags;
1459         struct intel_iommu *iommu;
1460         struct dma_pte *pgd;
1461         unsigned long num;
1462         unsigned long ndomains;
1463         int id;
1464         int agaw;
1465         struct device_domain_info *info = NULL;
1466
1467         pr_debug("Set context mapping for %02x:%02x.%d\n",
1468                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1469
1470         BUG_ON(!domain->pgd);
1471         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1472                translation != CONTEXT_TT_MULTI_LEVEL);
1473
1474         iommu = device_to_iommu(segment, bus, devfn);
1475         if (!iommu)
1476                 return -ENODEV;
1477
1478         context = device_to_context_entry(iommu, bus, devfn);
1479         if (!context)
1480                 return -ENOMEM;
1481         spin_lock_irqsave(&iommu->lock, flags);
1482         if (context_present(context)) {
1483                 spin_unlock_irqrestore(&iommu->lock, flags);
1484                 return 0;
1485         }
1486
1487         id = domain->id;
1488         pgd = domain->pgd;
1489
1490         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1491             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1492                 int found = 0;
1493
1494                 /* find an available domain id for this device in iommu */
1495                 ndomains = cap_ndoms(iommu->cap);
1496                 num = find_first_bit(iommu->domain_ids, ndomains);
1497                 for (; num < ndomains; ) {
1498                         if (iommu->domains[num] == domain) {
1499                                 id = num;
1500                                 found = 1;
1501                                 break;
1502                         }
1503                         num = find_next_bit(iommu->domain_ids,
1504                                             cap_ndoms(iommu->cap), num+1);
1505                 }
1506
1507                 if (found == 0) {
1508                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1509                         if (num >= ndomains) {
1510                                 spin_unlock_irqrestore(&iommu->lock, flags);
1511                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1512                                 return -EFAULT;
1513                         }
1514
1515                         set_bit(num, iommu->domain_ids);
1516                         iommu->domains[num] = domain;
1517                         id = num;
1518                 }
1519
1520                 /* Skip top levels of page tables for
1521                  * an iommu which has less agaw than the default.
1522                  * Unnecessary for PT mode.
1523                  */
1524                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1525                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1526                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1527                                 if (!dma_pte_present(pgd)) {
1528                                         spin_unlock_irqrestore(&iommu->lock, flags);
1529                                         return -ENOMEM;
1530                                 }
1531                         }
1532                 }
1533         }
1534
1535         context_set_domain_id(context, id);
1536
1537         if (translation != CONTEXT_TT_PASS_THROUGH) {
1538                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1539                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1540                                      CONTEXT_TT_MULTI_LEVEL;
1541         }
1542         /*
1543          * In pass through mode, AW must be programmed to indicate the largest
1544          * AGAW value supported by hardware. And ASR is ignored by hardware.
1545          */
1546         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1547                 context_set_address_width(context, iommu->msagaw);
1548         else {
1549                 context_set_address_root(context, virt_to_phys(pgd));
1550                 context_set_address_width(context, iommu->agaw);
1551         }
1552
1553         context_set_translation_type(context, translation);
1554         context_set_fault_enable(context);
1555         context_set_present(context);
1556         domain_flush_cache(domain, context, sizeof(*context));
1557
1558         /*
1559          * It's a non-present to present mapping. If hardware doesn't cache
1560          * non-present entries we only need to flush the write-buffer. If it
1561          * _does_ cache non-present entries, then it does so in the special
1562          * domain #0, which we have to flush:
1563          */
1564         if (cap_caching_mode(iommu->cap)) {
1565                 iommu->flush.flush_context(iommu, 0,
1566                                            (((u16)bus) << 8) | devfn,
1567                                            DMA_CCMD_MASK_NOBIT,
1568                                            DMA_CCMD_DEVICE_INVL);
1569                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1570         } else {
1571                 iommu_flush_write_buffer(iommu);
1572         }
1573         iommu_enable_dev_iotlb(info);
1574         spin_unlock_irqrestore(&iommu->lock, flags);
1575
1576         spin_lock_irqsave(&domain->iommu_lock, flags);
1577         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1578                 domain->iommu_count++;
1579                 domain_update_iommu_cap(domain);
1580         }
1581         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1582         return 0;
1583 }
1584
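/*
 * Map the context entries for the device itself and, when it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path as well, since DMA from
 * devices behind such a bridge arrives tagged with the bridge's source-id.
 */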
1585 static int
1586 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1587                         int translation)
1588 {
1589         int ret;
1590         struct pci_dev *tmp, *parent;
1591
1592         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1593                                          pdev->bus->number, pdev->devfn,
1594                                          translation);
1595         if (ret)
1596                 return ret;
1597
1598         /* dependent device mapping */
1599         tmp = pci_find_upstream_pcie_bridge(pdev);
1600         if (!tmp)
1601                 return 0;
1602         /* Secondary interface's bus number and devfn 0 */
1603         parent = pdev->bus->self;
1604         while (parent != tmp) {
1605                 ret = domain_context_mapping_one(domain,
1606                                                  pci_domain_nr(parent->bus),
1607                                                  parent->bus->number,
1608                                                  parent->devfn, translation);
1609                 if (ret)
1610                         return ret;
1611                 parent = parent->bus->self;
1612         }
1613         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1614                 return domain_context_mapping_one(domain,
1615                                         pci_domain_nr(tmp->subordinate),
1616                                         tmp->subordinate->number, 0,
1617                                         translation);
1618         else /* this is a legacy PCI bridge */
1619                 return domain_context_mapping_one(domain,
1620                                                   pci_domain_nr(tmp->bus),
1621                                                   tmp->bus->number,
1622                                                   tmp->devfn,
1623                                                   translation);
1624 }
1625
1626 static int domain_context_mapped(struct pci_dev *pdev)
1627 {
1628         int ret;
1629         struct pci_dev *tmp, *parent;
1630         struct intel_iommu *iommu;
1631
1632         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1633                                 pdev->devfn);
1634         if (!iommu)
1635                 return -ENODEV;
1636
1637         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1638         if (!ret)
1639                 return ret;
1640         /* dependent device mapping */
1641         tmp = pci_find_upstream_pcie_bridge(pdev);
1642         if (!tmp)
1643                 return ret;
1644         /* Secondary interface's bus number and devfn 0 */
1645         parent = pdev->bus->self;
1646         while (parent != tmp) {
1647                 ret = device_context_mapped(iommu, parent->bus->number,
1648                                             parent->devfn);
1649                 if (!ret)
1650                         return ret;
1651                 parent = parent->bus->self;
1652         }
1653         if (tmp->is_pcie)
1654                 return device_context_mapped(iommu, tmp->subordinate->number,
1655                                              0);
1656         else
1657                 return device_context_mapped(iommu, tmp->bus->number,
1658                                              tmp->devfn);
1659 }
1660
1661 /* Returns a number of VTD pages, but aligned to MM page size */
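/*
 * Worked example (illustrative, assuming 4KiB MM and VT-D pages):
 * host_addr = 0x1800, size = 0x1000 leaves an in-page offset of 0x800;
 * PAGE_ALIGN(0x800 + 0x1000) = 0x2000, i.e. two VT-D pages are needed
 * even though 'size' alone would fit in one.
 */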
1662 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1663                                             size_t size)
1664 {
1665         host_addr &= ~PAGE_MASK;
1666         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1667 }
1668
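/*
 * Core mapping helper: populate PTEs for 'nr_pages' VT-D pages starting at
 * iov_pfn, taking the backing physical pages either from the scatterlist
 * 'sg' or, when 'sg' is NULL, from the contiguous range at phys_pfn.
 */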
1669 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1670                             struct scatterlist *sg, unsigned long phys_pfn,
1671                             unsigned long nr_pages, int prot)
1672 {
1673         struct dma_pte *first_pte = NULL, *pte = NULL;
1674         phys_addr_t uninitialized_var(pteval);
1675         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1676         unsigned long sg_res;
1677
1678         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1679
1680         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1681                 return -EINVAL;
1682
1683         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1684
1685         if (sg)
1686                 sg_res = 0;
1687         else {
1688                 sg_res = nr_pages + 1;
1689                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1690         }
1691
1692         while (nr_pages--) {
1693                 uint64_t tmp;
1694
1695                 if (!sg_res) {
1696                         sg_res = aligned_nrpages(sg->offset, sg->length);
1697                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1698                         sg->dma_length = sg->length;
1699                         pteval = page_to_phys(sg_page(sg)) | prot;
1700                 }
1701                 if (!pte) {
1702                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1703                         if (!pte)
1704                                 return -ENOMEM;
1705                 }
1706                 /* We don't need a lock here; nobody else
1707                  * touches this iova range
1708                  */
1709                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1710                 if (tmp) {
1711                         static int dumps = 5;
1712                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1713                                iov_pfn, tmp, (unsigned long long)pteval);
1714                         if (dumps) {
1715                                 dumps--;
1716                                 debug_dma_dump_mappings(NULL);
1717                         }
1718                         WARN_ON(1);
1719                 }
1720                 pte++;
1721                 if (!nr_pages || first_pte_in_page(pte)) {
1722                         domain_flush_cache(domain, first_pte,
1723                                            (void *)pte - (void *)first_pte);
1724                         pte = NULL;
1725                 }
1726                 iov_pfn++;
1727                 pteval += VTD_PAGE_SIZE;
1728                 sg_res--;
1729                 if (!sg_res)
1730                         sg = sg_next(sg);
1731         }
1732         return 0;
1733 }
1734
1735 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1736                                     struct scatterlist *sg, unsigned long nr_pages,
1737                                     int prot)
1738 {
1739         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1740 }
1741
1742 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1743                                      unsigned long phys_pfn, unsigned long nr_pages,
1744                                      int prot)
1745 {
1746         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1747 }
1748
1749 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1750 {
1751         if (!iommu)
1752                 return;
1753
1754         clear_context_table(iommu, bus, devfn);
1755         iommu->flush.flush_context(iommu, 0, 0, 0,
1756                                            DMA_CCMD_GLOBAL_INVL);
1757         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1758 }
1759
1760 static void domain_remove_dev_info(struct dmar_domain *domain)
1761 {
1762         struct device_domain_info *info;
1763         unsigned long flags;
1764         struct intel_iommu *iommu;
1765
1766         spin_lock_irqsave(&device_domain_lock, flags);
1767         while (!list_empty(&domain->devices)) {
1768                 info = list_entry(domain->devices.next,
1769                         struct device_domain_info, link);
1770                 list_del(&info->link);
1771                 list_del(&info->global);
1772                 if (info->dev)
1773                         info->dev->dev.archdata.iommu = NULL;
1774                 spin_unlock_irqrestore(&device_domain_lock, flags);
1775
1776                 iommu_disable_dev_iotlb(info);
1777                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1778                 iommu_detach_dev(iommu, info->bus, info->devfn);
1779                 free_devinfo_mem(info);
1780
1781                 spin_lock_irqsave(&device_domain_lock, flags);
1782         }
1783         spin_unlock_irqrestore(&device_domain_lock, flags);
1784 }
1785
1786 /*
1787  * find_domain
1788  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1789  */
1790 static struct dmar_domain *
1791 find_domain(struct pci_dev *pdev)
1792 {
1793         struct device_domain_info *info;
1794
1795         /* No lock here; we assume no domain exits in the normal case */
1796         info = pdev->dev.archdata.iommu;
1797         if (info)
1798                 return info->domain;
1799         return NULL;
1800 }
1801
1802 /* domain is initialized */
1803 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1804 {
1805         struct dmar_domain *domain, *found = NULL;
1806         struct intel_iommu *iommu;
1807         struct dmar_drhd_unit *drhd;
1808         struct device_domain_info *info, *tmp;
1809         struct pci_dev *dev_tmp;
1810         unsigned long flags;
1811         int bus = 0, devfn = 0;
1812         int segment;
1813         int ret;
1814
1815         domain = find_domain(pdev);
1816         if (domain)
1817                 return domain;
1818
1819         segment = pci_domain_nr(pdev->bus);
1820
1821         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1822         if (dev_tmp) {
1823                 if (dev_tmp->is_pcie) {
1824                         bus = dev_tmp->subordinate->number;
1825                         devfn = 0;
1826                 } else {
1827                         bus = dev_tmp->bus->number;
1828                         devfn = dev_tmp->devfn;
1829                 }
1830                 spin_lock_irqsave(&device_domain_lock, flags);
1831                 list_for_each_entry(info, &device_domain_list, global) {
1832                         if (info->segment == segment &&
1833                             info->bus == bus && info->devfn == devfn) {
1834                                 found = info->domain;
1835                                 break;
1836                         }
1837                 }
1838                 spin_unlock_irqrestore(&device_domain_lock, flags);
1839                 /* pcie-pci bridge already has a domain, use it */
1840                 if (found) {
1841                         domain = found;
1842                         goto found_domain;
1843                 }
1844         }
1845
1846         domain = alloc_domain();
1847         if (!domain)
1848                 goto error;
1849
1850         /* Allocate new domain for the device */
1851         drhd = dmar_find_matched_drhd_unit(pdev);
1852         if (!drhd) {
1853                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1854                         pci_name(pdev));
1855                 return NULL;
1856         }
1857         iommu = drhd->iommu;
1858
1859         ret = iommu_attach_domain(domain, iommu);
1860         if (ret) {
1861                 domain_exit(domain);
1862                 goto error;
1863         }
1864
1865         if (domain_init(domain, gaw)) {
1866                 domain_exit(domain);
1867                 goto error;
1868         }
1869
1870         /* register pcie-to-pci device */
1871         if (dev_tmp) {
1872                 info = alloc_devinfo_mem();
1873                 if (!info) {
1874                         domain_exit(domain);
1875                         goto error;
1876                 }
1877                 info->segment = segment;
1878                 info->bus = bus;
1879                 info->devfn = devfn;
1880                 info->dev = NULL;
1881                 info->domain = domain;
1882                 /* This domain is shared by devices under p2p bridge */
1883                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1884
1885                 /* pcie-to-pci bridge already has a domain, use it */
1886                 found = NULL;
1887                 spin_lock_irqsave(&device_domain_lock, flags);
1888                 list_for_each_entry(tmp, &device_domain_list, global) {
1889                         if (tmp->segment == segment &&
1890                             tmp->bus == bus && tmp->devfn == devfn) {
1891                                 found = tmp->domain;
1892                                 break;
1893                         }
1894                 }
1895                 if (found) {
1896                         free_devinfo_mem(info);
1897                         domain_exit(domain);
1898                         domain = found;
1899                 } else {
1900                         list_add(&info->link, &domain->devices);
1901                         list_add(&info->global, &device_domain_list);
1902                 }
1903                 spin_unlock_irqrestore(&device_domain_lock, flags);
1904         }
1905
1906 found_domain:
1907         info = alloc_devinfo_mem();
1908         if (!info)
1909                 goto error;
1910         info->segment = segment;
1911         info->bus = pdev->bus->number;
1912         info->devfn = pdev->devfn;
1913         info->dev = pdev;
1914         info->domain = domain;
1915         spin_lock_irqsave(&device_domain_lock, flags);
1916         /* somebody else beat us to it */
1917         found = find_domain(pdev);
1918         if (found != NULL) {
1919                 spin_unlock_irqrestore(&device_domain_lock, flags);
1920                 if (found != domain) {
1921                         domain_exit(domain);
1922                         domain = found;
1923                 }
1924                 free_devinfo_mem(info);
1925                 return domain;
1926         }
1927         list_add(&info->link, &domain->devices);
1928         list_add(&info->global, &device_domain_list);
1929         pdev->dev.archdata.iommu = info;
1930         spin_unlock_irqrestore(&device_domain_lock, flags);
1931         return domain;
1932 error:
1933         /* recheck here; somebody else may have set it up meanwhile */
1934         return find_domain(pdev);
1935 }
1936
1937 static int iommu_identity_mapping;
1938 #define IDENTMAP_ALL            1
1939 #define IDENTMAP_GFX            2
1940 #define IDENTMAP_AZALIA         4
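/*
 * iommu_identity_mapping is a bitmask: IDENTMAP_ALL identity-maps every
 * suitable device into the static 1:1 (si) domain, IDENTMAP_GFX does so for
 * graphics devices only (CONFIG_DMAR_BROKEN_GFX_WA), and IDENTMAP_AZALIA
 * covers the integrated HD-audio device for the Tylersburg isoch quirk
 * (see check_tylersburg_isoch()).
 */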
1941
1942 static int iommu_domain_identity_map(struct dmar_domain *domain,
1943                                      unsigned long long start,
1944                                      unsigned long long end)
1945 {
1946         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1947         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1948
1949         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1950                           dma_to_mm_pfn(last_vpfn))) {
1951                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1952                 return -ENOMEM;
1953         }
1954
1955         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1956                  start, end, domain->id);
1957         /*
1958          * RMRR range might have overlap with physical memory range,
1959          * clear it first
1960          */
1961         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1962
1963         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1964                                   last_vpfn - first_vpfn + 1,
1965                                   DMA_PTE_READ|DMA_PTE_WRITE);
1966 }
1967
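/*
 * Find (or create) the device's domain and identity-map the physical range
 * from 'start' to 'end' into it; used for BIOS-reported RMRR regions and
 * the 0-16MiB ISA/floppy workaround.
 */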
1968 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1969                                       unsigned long long start,
1970                                       unsigned long long end)
1971 {
1972         struct dmar_domain *domain;
1973         int ret;
1974
1975         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1976         if (!domain)
1977                 return -ENOMEM;
1978
1979         /* For _hardware_ passthrough, don't bother. But for software
1980            passthrough, we do it anyway -- it may indicate a memory
1981            range which is reserved in E820, and so didn't get set
1982            up to start with in si_domain */
1983         if (domain == si_domain && hw_pass_through) {
1984                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1985                        pci_name(pdev), start, end);
1986                 return 0;
1987         }
1988
1989         printk(KERN_INFO
1990                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1991                pci_name(pdev), start, end);
1992
1993         if (end < start) {
1994                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1995                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1996                         dmi_get_system_info(DMI_BIOS_VENDOR),
1997                         dmi_get_system_info(DMI_BIOS_VERSION),
1998                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1999                 ret = -EIO;
2000                 goto error;
2001         }
2002
2003         if (end >> agaw_to_width(domain->agaw)) {
2004                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2005                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2006                      agaw_to_width(domain->agaw),
2007                      dmi_get_system_info(DMI_BIOS_VENDOR),
2008                      dmi_get_system_info(DMI_BIOS_VERSION),
2009                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2010                 ret = -EIO;
2011                 goto error;
2012         }
2013
2014         ret = iommu_domain_identity_map(domain, start, end);
2015         if (ret)
2016                 goto error;
2017
2018         /* context entry init */
2019         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2020         if (ret)
2021                 goto error;
2022
2023         return 0;
2024
2025  error:
2026         domain_exit(domain);
2027         return ret;
2028 }
2029
2030 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2031         struct pci_dev *pdev)
2032 {
2033         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2034                 return 0;
2035         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2036                 rmrr->end_address + 1);
2037 }
2038
2039 #ifdef CONFIG_DMAR_FLOPPY_WA
2040 static inline void iommu_prepare_isa(void)
2041 {
2042         struct pci_dev *pdev;
2043         int ret;
2044
2045         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2046         if (!pdev)
2047                 return;
2048
2049         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2050         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2051
2052         if (ret)
2053                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2054                        "floppy might not work\n");
2055
2056 }
2057 #else
2058 static inline void iommu_prepare_isa(void)
2059 {
2060         return;
2061 }
2062 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2063
2064 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2065
2066 static int __init si_domain_work_fn(unsigned long start_pfn,
2067                                     unsigned long end_pfn, void *datax)
2068 {
2069         int *ret = datax;
2070
2071         *ret = iommu_domain_identity_map(si_domain,
2072                                          (uint64_t)start_pfn << PAGE_SHIFT,
2073                                          (uint64_t)end_pfn << PAGE_SHIFT);
2074         return *ret;
2075
2076 }
2077
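/*
 * Build the static identity (si) domain: attach it to every active iommu
 * and, unless hardware pass-through is in use ('hw'), 1:1 map all usable
 * RAM reported by the active memory regions of each online node.
 */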
2078 static int __init si_domain_init(int hw)
2079 {
2080         struct dmar_drhd_unit *drhd;
2081         struct intel_iommu *iommu;
2082         int nid, ret = 0;
2083
2084         si_domain = alloc_domain();
2085         if (!si_domain)
2086                 return -EFAULT;
2087
2088         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2089
2090         for_each_active_iommu(iommu, drhd) {
2091                 ret = iommu_attach_domain(si_domain, iommu);
2092                 if (ret) {
2093                         domain_exit(si_domain);
2094                         return -EFAULT;
2095                 }
2096         }
2097
2098         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2099                 domain_exit(si_domain);
2100                 return -EFAULT;
2101         }
2102
2103         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2104
2105         if (hw)
2106                 return 0;
2107
2108         for_each_online_node(nid) {
2109                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2110                 if (ret)
2111                         return ret;
2112         }
2113
2114         return 0;
2115 }
2116
2117 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2118                                           struct pci_dev *pdev);
2119 static int identity_mapping(struct pci_dev *pdev)
2120 {
2121         struct device_domain_info *info;
2122
2123         if (likely(!iommu_identity_mapping))
2124                 return 0;
2125
2126
2127         list_for_each_entry(info, &si_domain->devices, link)
2128                 if (info->dev == pdev)
2129                         return 1;
2130         return 0;
2131 }
2132
2133 static int domain_add_dev_info(struct dmar_domain *domain,
2134                                struct pci_dev *pdev,
2135                                int translation)
2136 {
2137         struct device_domain_info *info;
2138         unsigned long flags;
2139         int ret;
2140
2141         info = alloc_devinfo_mem();
2142         if (!info)
2143                 return -ENOMEM;
2144
2145         ret = domain_context_mapping(domain, pdev, translation);
2146         if (ret) {
2147                 free_devinfo_mem(info);
2148                 return ret;
2149         }
2150
2151         info->segment = pci_domain_nr(pdev->bus);
2152         info->bus = pdev->bus->number;
2153         info->devfn = pdev->devfn;
2154         info->dev = pdev;
2155         info->domain = domain;
2156
2157         spin_lock_irqsave(&device_domain_lock, flags);
2158         list_add(&info->link, &domain->devices);
2159         list_add(&info->global, &device_domain_list);
2160         pdev->dev.archdata.iommu = info;
2161         spin_unlock_irqrestore(&device_domain_lock, flags);
2162
2163         return 0;
2164 }
2165
2166 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2167 {
2168         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2169                 return 1;
2170
2171         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2172                 return 1;
2173
2174         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2175                 return 0;
2176
2177         /*
2178          * We want to start off with all devices in the 1:1 domain, and
2179          * take them out later if we find they can't access all of memory.
2180          *
2181          * However, we can't do this for PCI devices behind bridges,
2182          * because all PCI devices behind the same bridge will end up
2183          * with the same source-id on their transactions.
2184          *
2185          * Practically speaking, we can't change things around for these
2186          * devices at run-time, because we can't be sure there'll be no
2187          * DMA transactions in flight for any of their siblings.
2188          * 
2189          * So PCI devices (unless they're on the root bus) as well as
2190          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2191          * the 1:1 domain, just in _case_ one of their siblings turns out
2192          * not to be able to map all of memory.
2193          */
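        /*
         * Illustrative example (hypothetical topology): 02:00.0 and 02:01.0
         * behind a PCIe-to-PCI bridge can both reach the IOMMU carrying the
         * bridge's source-id, so the IOMMU cannot tell them apart and they
         * must share whatever domain the bridge path ends up in.
         */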
2194         if (!pdev->is_pcie) {
2195                 if (!pci_is_root_bus(pdev->bus))
2196                         return 0;
2197                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2198                         return 0;
2199         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2200                 return 0;
2201
2202         /* 
2203          * At boot time, we don't yet know if devices will be 64-bit capable.
2204          * Assume that they will -- if they turn out not to be, then we can 
2205          * take them out of the 1:1 domain later.
2206          */
2207         if (!startup)
2208                 return pdev->dma_mask > DMA_BIT_MASK(32);
2209
2210         return 1;
2211 }
2212
2213 static int __init iommu_prepare_static_identity_mapping(int hw)
2214 {
2215         struct pci_dev *pdev = NULL;
2216         int ret;
2217
2218         ret = si_domain_init(hw);
2219         if (ret)
2220                 return -EFAULT;
2221
2222         for_each_pci_dev(pdev) {
2223                 if (iommu_should_identity_map(pdev, 1)) {
2224                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2225                                hw ? "hardware" : "software", pci_name(pdev));
2226
2227                         ret = domain_add_dev_info(si_domain, pdev,
2228                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2229                                                      CONTEXT_TT_MULTI_LEVEL);
2230                         if (ret)
2231                                 return ret;
2232                 }
2233         }
2234
2235         return 0;
2236 }
2237
2238 int __init init_dmars(void)
2239 {
2240         struct dmar_drhd_unit *drhd;
2241         struct dmar_rmrr_unit *rmrr;
2242         struct pci_dev *pdev;
2243         struct intel_iommu *iommu;
2244         int i, ret;
2245
2246         /*
2247          * for each drhd
2248          *    allocate root
2249          *    initialize and program root entry to not present
2250          * endfor
2251          */
2252         for_each_drhd_unit(drhd) {
2253                 g_num_of_iommus++;
2254                 /*
2255                  * lock not needed as this is only incremented in the single
2256                  * threaded kernel __init code path all other access are read
2257                  * only
2258                  */
2259         }
2260
2261         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2262                         GFP_KERNEL);
2263         if (!g_iommus) {
2264                 printk(KERN_ERR "Allocating global iommu array failed\n");
2265                 ret = -ENOMEM;
2266                 goto error;
2267         }
2268
2269         deferred_flush = kzalloc(g_num_of_iommus *
2270                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2271         if (!deferred_flush) {
2272                 ret = -ENOMEM;
2273                 goto error;
2274         }
2275
2276         for_each_drhd_unit(drhd) {
2277                 if (drhd->ignored)
2278                         continue;
2279
2280                 iommu = drhd->iommu;
2281                 g_iommus[iommu->seq_id] = iommu;
2282
2283                 ret = iommu_init_domains(iommu);
2284                 if (ret)
2285                         goto error;
2286
2287                 /*
2288                  * TBD:
2289                  * we could share the same root & context tables
2290                  * among all IOMMUs. Need to split it later.
2291                  */
2292                 ret = iommu_alloc_root_entry(iommu);
2293                 if (ret) {
2294                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2295                         goto error;
2296                 }
2297                 if (!ecap_pass_through(iommu->ecap))
2298                         hw_pass_through = 0;
2299         }
2300
2301         /*
2302          * Start from a sane iommu hardware state.
2303          */
2304         for_each_drhd_unit(drhd) {
2305                 if (drhd->ignored)
2306                         continue;
2307
2308                 iommu = drhd->iommu;
2309
2310                 /*
2311                  * If queued invalidation was already initialized by us
2312                  * (for example, while enabling interrupt-remapping) then
2313                  * we already have things rolling from a sane state.
2314                  */
2315                 if (iommu->qi)
2316                         continue;
2317
2318                 /*
2319                  * Clear any previous faults.
2320                  */
2321                 dmar_fault(-1, iommu);
2322                 /*
2323                  * Disable queued invalidation if supported and already enabled
2324                  * before OS handover.
2325                  */
2326                 dmar_disable_qi(iommu);
2327         }
2328
2329         for_each_drhd_unit(drhd) {
2330                 if (drhd->ignored)
2331                         continue;
2332
2333                 iommu = drhd->iommu;
2334
2335                 if (dmar_enable_qi(iommu)) {
2336                         /*
2337                          * Queued Invalidate not enabled, use Register Based
2338                          * Invalidate
2339                          */
2340                         iommu->flush.flush_context = __iommu_flush_context;
2341                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2342                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2343                                "invalidation\n",
2344                                (unsigned long long)drhd->reg_base_addr);
2345                 } else {
2346                         iommu->flush.flush_context = qi_flush_context;
2347                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2348                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2349                                "invalidation\n",
2350                                (unsigned long long)drhd->reg_base_addr);
2351                 }
2352         }
2353
2354         if (iommu_pass_through)
2355                 iommu_identity_mapping |= IDENTMAP_ALL;
2356
2357 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2358         iommu_identity_mapping |= IDENTMAP_GFX;
2359 #endif
2360
2361         check_tylersburg_isoch();
2362
2363         /*
2364          * If identity mapping was requested (pass-through, the gfx/azalia
2365          * quirks, or identity-map-all), set up the static identity mappings
2366          * now, before the per-device rmrr and isa mappings below.
2367          */
2368         if (iommu_identity_mapping) {
2369                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2370                 if (ret) {
2371                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2372                         goto error;
2373                 }
2374         }
2375         /*
2376          * For each rmrr
2377          *   for each dev attached to rmrr
2378          *   do
2379          *     locate drhd for dev, alloc domain for dev
2380          *     allocate free domain
2381          *     allocate page table entries for rmrr
2382          *     if context not allocated for bus
2383          *           allocate and init context
2384          *           set present in root table for this bus
2385          *     init context with domain, translation etc
2386          *    endfor
2387          * endfor
2388          */
2389         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2390         for_each_rmrr_units(rmrr) {
2391                 for (i = 0; i < rmrr->devices_cnt; i++) {
2392                         pdev = rmrr->devices[i];
2393                         /*
2394                          * some BIOSes list non-existent devices in the
2395                          * DMAR table.
2396                          */
2397                         if (!pdev)
2398                                 continue;
2399                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2400                         if (ret)
2401                                 printk(KERN_ERR
2402                                        "IOMMU: mapping reserved region failed\n");
2403                 }
2404         }
2405
2406         iommu_prepare_isa();
2407
2408         /*
2409          * for each drhd
2410          *   enable fault log
2411          *   global invalidate context cache
2412          *   global invalidate iotlb
2413          *   enable translation
2414          */
2415         for_each_drhd_unit(drhd) {
2416                 if (drhd->ignored)
2417                         continue;
2418                 iommu = drhd->iommu;
2419
2420                 iommu_flush_write_buffer(iommu);
2421
2422                 ret = dmar_set_interrupt(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 iommu_set_root_entry(iommu);
2427
2428                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2429                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2430
2431                 ret = iommu_enable_translation(iommu);
2432                 if (ret)
2433                         goto error;
2434
2435                 iommu_disable_protect_mem_regions(iommu);
2436         }
2437
2438         return 0;
2439 error:
2440         for_each_drhd_unit(drhd) {
2441                 if (drhd->ignored)
2442                         continue;
2443                 iommu = drhd->iommu;
2444                 free_iommu(iommu);
2445         }
2446         kfree(g_iommus);
2447         return ret;
2448 }
2449
2450 /* This takes a number of _MM_ pages, not VTD pages */
2451 static struct iova *intel_alloc_iova(struct device *dev,
2452                                      struct dmar_domain *domain,
2453                                      unsigned long nrpages, uint64_t dma_mask)
2454 {
2455         struct pci_dev *pdev = to_pci_dev(dev);
2456         struct iova *iova = NULL;
2457
2458         /* Restrict dma_mask to the width that the iommu can handle */
2459         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2460
2461         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2462                 /*
2463                  * First try to allocate an io virtual address in
2464                  * DMA_BIT_MASK(32) and if that fails then try allocating
2465                  * from higher range
2466                  */
2467                 iova = alloc_iova(&domain->iovad, nrpages,
2468                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2469                 if (iova)
2470                         return iova;
2471         }
2472         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2473         if (unlikely(!iova)) {
2474                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2475                        nrpages, pci_name(pdev));
2476                 return NULL;
2477         }
2478
2479         return iova;
2480 }
2481
2482 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2483 {
2484         struct dmar_domain *domain;
2485         int ret;
2486
2487         domain = get_domain_for_dev(pdev,
2488                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2489         if (!domain) {
2490                 printk(KERN_ERR
2491                         "Allocating domain for %s failed", pci_name(pdev));
2492                 return NULL;
2493         }
2494
2495         /* make sure context mapping is ok */
2496         if (unlikely(!domain_context_mapped(pdev))) {
2497                 ret = domain_context_mapping(domain, pdev,
2498                                              CONTEXT_TT_MULTI_LEVEL);
2499                 if (ret) {
2500                         printk(KERN_ERR
2501                                 "Domain context map for %s failed",
2502                                 pci_name(pdev));
2503                         return NULL;
2504                 }
2505         }
2506
2507         return domain;
2508 }
2509
2510 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2511 {
2512         struct device_domain_info *info;
2513
2514         /* No lock here; we assume no domain exits in the normal case */
2515         info = dev->dev.archdata.iommu;
2516         if (likely(info))
2517                 return info->domain;
2518
2519         return __get_valid_domain_for_dev(dev);
2520 }
2521
2522 static int iommu_dummy(struct pci_dev *pdev)
2523 {
2524         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2525 }
2526
2527 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2528 static int iommu_no_mapping(struct device *dev)
2529 {
2530         struct pci_dev *pdev;
2531         int found;
2532
2533         if (unlikely(dev->bus != &pci_bus_type))
2534                 return 1;
2535
2536         pdev = to_pci_dev(dev);
2537         if (iommu_dummy(pdev))
2538                 return 1;
2539
2540         if (!iommu_identity_mapping)
2541                 return 0;
2542
2543         found = identity_mapping(pdev);
2544         if (found) {
2545                 if (iommu_should_identity_map(pdev, 0))
2546                         return 1;
2547                 else {
2548                         /*
2549                          * Device is only 32 bit DMA capable: remove it from
2550                          * si_domain and fall back to non-identity mapping.
2551                          */
2552                         domain_remove_one_dev_info(si_domain, pdev);
2553                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2554                                pci_name(pdev));
2555                         return 0;
2556                 }
2557         } else {
2558                 /*
2559                  * If a 64 bit DMA capable device has been detached from a VM,
2560                  * put it back into si_domain for identity mapping.
2561                  */
2562                 if (iommu_should_identity_map(pdev, 0)) {
2563                         int ret;
2564                         ret = domain_add_dev_info(si_domain, pdev,
2565                                                   hw_pass_through ?
2566                                                   CONTEXT_TT_PASS_THROUGH :
2567                                                   CONTEXT_TT_MULTI_LEVEL);
2568                         if (!ret) {
2569                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2570                                        pci_name(pdev));
2571                                 return 1;
2572                         }
2573                 }
2574         }
2575
2576         return 0;
2577 }
2578
2579 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2580                                      size_t size, int dir, u64 dma_mask)
2581 {
2582         struct pci_dev *pdev = to_pci_dev(hwdev);
2583         struct dmar_domain *domain;
2584         phys_addr_t start_paddr;
2585         struct iova *iova;
2586         int prot = 0;
2587         int ret;
2588         struct intel_iommu *iommu;
2589         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2590
2591         BUG_ON(dir == DMA_NONE);
2592
2593         if (iommu_no_mapping(hwdev))
2594                 return paddr;
2595
2596         domain = get_valid_domain_for_dev(pdev);
2597         if (!domain)
2598                 return 0;
2599
2600         iommu = domain_get_iommu(domain);
2601         size = aligned_nrpages(paddr, size);
2602
2603         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2604                                 pdev->dma_mask);
2605         if (!iova)
2606                 goto error;
2607
2608         /*
2609          * Check if DMAR supports zero-length reads on write only
2610          * mappings..
2611          */
2612         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2613                         !cap_zlr(iommu->cap))
2614                 prot |= DMA_PTE_READ;
2615         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2616                 prot |= DMA_PTE_WRITE;
2617         /*
2618          * The range [paddr, paddr + size) might cover only part of a page, but
2619          * we must map whole pages.  Note: if two parts of one page are mapped
2620          * separately, we might end up with two guest addresses mapping to the
2621          * same host paddr, but this is not a big problem.
2622          */
2623         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2624                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2625         if (ret)
2626                 goto error;
2627
2628         /* it's a non-present to present mapping. Only flush if caching mode */
2629         if (cap_caching_mode(iommu->cap))
2630                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2631         else
2632                 iommu_flush_write_buffer(iommu);
2633
2634         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2635         start_paddr += paddr & ~PAGE_MASK;
2636         return start_paddr;
2637
2638 error:
2639         if (iova)
2640                 __free_iova(&domain->iovad, iova);
2641         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2642                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2643         return 0;
2644 }
2645
2646 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2647                                  unsigned long offset, size_t size,
2648                                  enum dma_data_direction dir,
2649                                  struct dma_attrs *attrs)
2650 {
2651         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2652                                   dir, to_pci_dev(dev)->dma_mask);
2653 }
2654
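/*
 * Deferred unmap handling (non-strict mode): freed IOVAs are queued per
 * iommu in deferred_flush[] and released in batches, either when the queue
 * reaches HIGH_WATER_MARK entries or when the 10ms unmap_timer fires, so
 * one global IOTLB flush can cover many individual unmaps.
 */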
2655 static void flush_unmaps(void)
2656 {
2657         int i, j;
2658
2659         timer_on = 0;
2660
2661         /* just flush them all */
2662         for (i = 0; i < g_num_of_iommus; i++) {
2663                 struct intel_iommu *iommu = g_iommus[i];
2664                 if (!iommu)
2665                         continue;
2666
2667                 if (!deferred_flush[i].next)
2668                         continue;
2669
2670                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2671                                          DMA_TLB_GLOBAL_FLUSH);
2672                 for (j = 0; j < deferred_flush[i].next; j++) {
2673                         unsigned long mask;
2674                         struct iova *iova = deferred_flush[i].iova[j];
2675
2676                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2677                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2678                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2679                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2680                 }
2681                 deferred_flush[i].next = 0;
2682         }
2683
2684         list_size = 0;
2685 }
2686
2687 static void flush_unmaps_timeout(unsigned long data)
2688 {
2689         unsigned long flags;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         flush_unmaps();
2693         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2694 }
2695
2696 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2697 {
2698         unsigned long flags;
2699         int next, iommu_id;
2700         struct intel_iommu *iommu;
2701
2702         spin_lock_irqsave(&async_umap_flush_lock, flags);
2703         if (list_size == HIGH_WATER_MARK)
2704                 flush_unmaps();
2705
2706         iommu = domain_get_iommu(dom);
2707         iommu_id = iommu->seq_id;
2708
2709         next = deferred_flush[iommu_id].next;
2710         deferred_flush[iommu_id].domain[next] = dom;
2711         deferred_flush[iommu_id].iova[next] = iova;
2712         deferred_flush[iommu_id].next++;
2713
2714         if (!timer_on) {
2715                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2716                 timer_on = 1;
2717         }
2718         list_size++;
2719         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2720 }
2721
2722 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2723                              size_t size, enum dma_data_direction dir,
2724                              struct dma_attrs *attrs)
2725 {
2726         struct pci_dev *pdev = to_pci_dev(dev);
2727         struct dmar_domain *domain;
2728         unsigned long start_pfn, last_pfn;
2729         struct iova *iova;
2730         struct intel_iommu *iommu;
2731
2732         if (iommu_no_mapping(dev))
2733                 return;
2734
2735         domain = find_domain(pdev);
2736         BUG_ON(!domain);
2737
2738         iommu = domain_get_iommu(domain);
2739
2740         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2741         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2742                       (unsigned long long)dev_addr))
2743                 return;
2744
2745         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2746         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2747
2748         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2749                  pci_name(pdev), start_pfn, last_pfn);
2750
2751         /*  clear the whole page */
2752         dma_pte_clear_range(domain, start_pfn, last_pfn);
2753
2754         /* free page tables */
2755         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2756
2757         if (intel_iommu_strict) {
2758                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2759                                       last_pfn - start_pfn + 1);
2760                 /* free iova */
2761                 __free_iova(&domain->iovad, iova);
2762         } else {
2763                 add_unmap(domain, iova);
2764                 /*
2765                  * queue up the release of the unmap to save roughly 1/6th of
2766                  * the cpu time used up by the iotlb flush operation...
2767                  */
2768         }
2769 }
2770
2771 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2772                                   dma_addr_t *dma_handle, gfp_t flags)
2773 {
2774         void *vaddr;
2775         int order;
2776
2777         size = PAGE_ALIGN(size);
2778         order = get_order(size);
2779
2780         if (!iommu_no_mapping(hwdev))
2781                 flags &= ~(GFP_DMA | GFP_DMA32);
2782         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2783                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2784                         flags |= GFP_DMA;
2785                 else
2786                         flags |= GFP_DMA32;
2787         }
2788
2789         vaddr = (void *)__get_free_pages(flags, order);
2790         if (!vaddr)
2791                 return NULL;
2792         memset(vaddr, 0, size);
2793
2794         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2795                                          DMA_BIDIRECTIONAL,
2796                                          hwdev->coherent_dma_mask);
2797         if (*dma_handle)
2798                 return vaddr;
2799         free_pages((unsigned long)vaddr, order);
2800         return NULL;
2801 }
2802
2803 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2804                                 dma_addr_t dma_handle)
2805 {
2806         int order;
2807
2808         size = PAGE_ALIGN(size);
2809         order = get_order(size);
2810
2811         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2812         free_pages((unsigned long)vaddr, order);
2813 }
2814
2815 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2816                            int nelems, enum dma_data_direction dir,
2817                            struct dma_attrs *attrs)
2818 {
2819         struct pci_dev *pdev = to_pci_dev(hwdev);
2820         struct dmar_domain *domain;
2821         unsigned long start_pfn, last_pfn;
2822         struct iova *iova;
2823         struct intel_iommu *iommu;
2824
2825         if (iommu_no_mapping(hwdev))
2826                 return;
2827
2828         domain = find_domain(pdev);
2829         BUG_ON(!domain);
2830
2831         iommu = domain_get_iommu(domain);
2832
2833         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2834         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2835                       (unsigned long long)sglist[0].dma_address))
2836                 return;
2837
2838         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2839         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2840
2841         /*  clear the whole page */
2842         dma_pte_clear_range(domain, start_pfn, last_pfn);
2843
2844         /* free page tables */
2845         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2846
2847         if (intel_iommu_strict) {
2848                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2849                                       last_pfn - start_pfn + 1);
2850                 /* free iova */
2851                 __free_iova(&domain->iovad, iova);
2852         } else {
2853                 add_unmap(domain, iova);
2854                 /*
2855                  * queue up the release of the unmap to save roughly 1/6th of
2856                  * the cpu time used up by the iotlb flush operation...
2857                  */
2858         }
2859 }
2860
2861 static int intel_nontranslate_map_sg(struct device *hddev,
2862         struct scatterlist *sglist, int nelems, int dir)
2863 {
2864         int i;
2865         struct scatterlist *sg;
2866
2867         for_each_sg(sglist, sg, nelems, i) {
2868                 BUG_ON(!sg_page(sg));
2869                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2870                 sg->dma_length = sg->length;
2871         }
2872         return nelems;
2873 }
2874
2875 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2876                         enum dma_data_direction dir, struct dma_attrs *attrs)
2877 {
2878         int i;
2879         struct pci_dev *pdev = to_pci_dev(hwdev);
2880         struct dmar_domain *domain;
2881         size_t size = 0;
2882         int prot = 0;
2884         struct iova *iova = NULL;
2885         int ret;
2886         struct scatterlist *sg;
2887         unsigned long start_vpfn;
2888         struct intel_iommu *iommu;
2889
2890         BUG_ON(dir == DMA_NONE);
2891         if (iommu_no_mapping(hwdev))
2892                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2893
2894         domain = get_valid_domain_for_dev(pdev);
2895         if (!domain)
2896                 return 0;
2897
2898         iommu = domain_get_iommu(domain);
2899
2900         for_each_sg(sglist, sg, nelems, i)
2901                 size += aligned_nrpages(sg->offset, sg->length);
2902
2903         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2904                                 pdev->dma_mask);
2905         if (!iova) {
2906                 sglist->dma_length = 0;
2907                 return 0;
2908         }
2909
2910         /*
2911          * Check if DMAR supports zero-length reads on write only
2912          * mappings..
2913          */
2914         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2915                         !cap_zlr(iommu->cap))
2916                 prot |= DMA_PTE_READ;
2917         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2918                 prot |= DMA_PTE_WRITE;
2919
2920         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2921
2922         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2923         if (unlikely(ret)) {
2924                 /*  clear the page */
2925                 dma_pte_clear_range(domain, start_vpfn,
2926                                     start_vpfn + size - 1);
2927                 /* free page tables */
2928                 dma_pte_free_pagetable(domain, start_vpfn,
2929                                        start_vpfn + size - 1);
2930                 /* free iova */
2931                 __free_iova(&domain->iovad, iova);
2932                 return 0;
2933         }
2934
2935         /* it's a non-present to present mapping. Only flush if caching mode */
2936         if (cap_caching_mode(iommu->cap))
2937                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, size);
2938         else
2939                 iommu_flush_write_buffer(iommu);
2940
2941         return nelems;
2942 }
2943
2944 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2945 {
2946         return !dma_addr;
2947 }
2948
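/*
 * DMA API entry points for devices translated by the IOMMU; presumably
 * installed as the platform's dma_ops once DMAR initialisation succeeds
 * (handled by init code elsewhere in this file).
 */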
2949 struct dma_map_ops intel_dma_ops = {
2950         .alloc_coherent = intel_alloc_coherent,
2951         .free_coherent = intel_free_coherent,
2952         .map_sg = intel_map_sg,
2953         .unmap_sg = intel_unmap_sg,
2954         .map_page = intel_map_page,
2955         .unmap_page = intel_unmap_page,
2956         .mapping_error = intel_mapping_error,
2957 };
2958
2959 static inline int iommu_domain_cache_init(void)
2960 {
2961         int ret = 0;
2962
2963         iommu_domain_cache = kmem_cache_create("iommu_domain",
2964                                          sizeof(struct dmar_domain),
2965                                          0,
2966                                          SLAB_HWCACHE_ALIGN,
2968                                          NULL);
2969         if (!iommu_domain_cache) {
2970                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2971                 ret = -ENOMEM;
2972         }
2973
2974         return ret;
2975 }
2976
2977 static inline int iommu_devinfo_cache_init(void)
2978 {
2979         int ret = 0;
2980
2981         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2982                                          sizeof(struct device_domain_info),
2983                                          0,
2984                                          SLAB_HWCACHE_ALIGN,
2985                                          NULL);
2986         if (!iommu_devinfo_cache) {
2987                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2988                 ret = -ENOMEM;
2989         }
2990
2991         return ret;
2992 }
2993
2994 static inline int iommu_iova_cache_init(void)
2995 {
2996         int ret = 0;
2997
2998         iommu_iova_cache = kmem_cache_create("iommu_iova",
2999                                          sizeof(struct iova),
3000                                          0,
3001                                          SLAB_HWCACHE_ALIGN,
3002                                          NULL);
3003         if (!iommu_iova_cache) {
3004                 printk(KERN_ERR "Couldn't create iova cache\n");
3005                 ret = -ENOMEM;
3006         }
3007
3008         return ret;
3009 }
3010
3011 static int __init iommu_init_mempool(void)
3012 {
3013         int ret;
3014         ret = iommu_iova_cache_init();
3015         if (ret)
3016                 return ret;
3017
3018         ret = iommu_domain_cache_init();
3019         if (ret)
3020                 goto domain_error;
3021
3022         ret = iommu_devinfo_cache_init();
3023         if (!ret)
3024                 return ret;
3025
3026         kmem_cache_destroy(iommu_domain_cache);
3027 domain_error:
3028         kmem_cache_destroy(iommu_iova_cache);
3029
3030         return -ENOMEM;
3031 }
3032
3033 static void __init iommu_exit_mempool(void)
3034 {
3035         kmem_cache_destroy(iommu_devinfo_cache);
3036         kmem_cache_destroy(iommu_domain_cache);
3037         kmem_cache_destroy(iommu_iova_cache);
3038
3039 }
3040
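/*
 * Mark DRHD units that can never be used as ignored: units whose device
 * scope contains no PCI devices and, unless dmar_map_gfx is set, units
 * that cover only graphics devices.  Devices under an ignored
 * graphics-only unit have their archdata set to DUMMY_DEVICE_DOMAIN_INFO
 * so the DMA mapping path treats them as not IOMMU-managed.
 */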
3041 static void __init init_no_remapping_devices(void)
3042 {
3043         struct dmar_drhd_unit *drhd;
3044
3045         for_each_drhd_unit(drhd) {
3046                 if (!drhd->include_all) {
3047                         int i;
3048                         for (i = 0; i < drhd->devices_cnt; i++)
3049                                 if (drhd->devices[i] != NULL)
3050                                         break;
3051                         /* ignore DMAR unit if no pci devices exist */
3052                         if (i == drhd->devices_cnt)
3053                                 drhd->ignored = 1;
3054                 }
3055         }
3056
3057         if (dmar_map_gfx)
3058                 return;
3059
3060         for_each_drhd_unit(drhd) {
3061                 int i;
3062                 if (drhd->ignored || drhd->include_all)
3063                         continue;
3064
3065                 for (i = 0; i < drhd->devices_cnt; i++)
3066                         if (drhd->devices[i] &&
3067                                 !IS_GFX_DEVICE(drhd->devices[i]))
3068                                 break;
3069
3070                 if (i < drhd->devices_cnt)
3071                         continue;
3072
3073                 /* bypass IOMMU if it is just for gfx devices */
3074                 drhd->ignored = 1;
3075                 for (i = 0; i < drhd->devices_cnt; i++) {
3076                         if (!drhd->devices[i])
3077                                 continue;
3078                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3079                 }
3080         }
3081 }
3082
3083 #ifdef CONFIG_SUSPEND
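/*
 * Reprogram the hardware after a suspend/resume cycle: re-enable queued
 * invalidation where it was in use, reload the root entry, perform global
 * context-cache and IOTLB invalidations and re-enable translation on
 * every active IOMMU.
 */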
3084 static int init_iommu_hw(void)
3085 {
3086         struct dmar_drhd_unit *drhd;
3087         struct intel_iommu *iommu = NULL;
3088
3089         for_each_active_iommu(iommu, drhd)
3090                 if (iommu->qi)
3091                         dmar_reenable_qi(iommu);
3092
3093         for_each_active_iommu(iommu, drhd) {
3094                 iommu_flush_write_buffer(iommu);
3095
3096                 iommu_set_root_entry(iommu);
3097
3098                 iommu->flush.flush_context(iommu, 0, 0, 0,
3099                                            DMA_CCMD_GLOBAL_INVL);
3100                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3101                                          DMA_TLB_GLOBAL_FLUSH);
3102                 iommu_enable_translation(iommu);
3103                 iommu_disable_protect_mem_regions(iommu);
3104         }
3105
3106         return 0;
3107 }
3108
3109 static void iommu_flush_all(void)
3110 {
3111         struct dmar_drhd_unit *drhd;
3112         struct intel_iommu *iommu;
3113
3114         for_each_active_iommu(iommu, drhd) {
3115                 iommu->flush.flush_context(iommu, 0, 0, 0,
3116                                            DMA_CCMD_GLOBAL_INVL);
3117                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3118                                          DMA_TLB_GLOBAL_FLUSH);
3119         }
3120 }
3121
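/*
 * Suspend handler: disable translation and save the fault-event
 * control/data/address registers of every active IOMMU so they can be
 * restored by iommu_resume().
 */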
3122 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3123 {
3124         struct dmar_drhd_unit *drhd;
3125         struct intel_iommu *iommu = NULL;
3126         unsigned long flag;
3127
3128         for_each_active_iommu(iommu, drhd) {
3129                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3130                                                  GFP_ATOMIC);
3131                 if (!iommu->iommu_state)
3132                         goto nomem;
3133         }
3134
3135         iommu_flush_all();
3136
3137         for_each_active_iommu(iommu, drhd) {
3138                 iommu_disable_translation(iommu);
3139
3140                 spin_lock_irqsave(&iommu->register_lock, flag);
3141
3142                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3143                         readl(iommu->reg + DMAR_FECTL_REG);
3144                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3145                         readl(iommu->reg + DMAR_FEDATA_REG);
3146                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3147                         readl(iommu->reg + DMAR_FEADDR_REG);
3148                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3149                         readl(iommu->reg + DMAR_FEUADDR_REG);
3150
3151                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3152         }
3153         return 0;
3154
3155 nomem:
3156         for_each_active_iommu(iommu, drhd)
3157                 kfree(iommu->iommu_state);
3158
3159         return -ENOMEM;
3160 }
3161
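/*
 * Resume handler: bring the hardware back up via init_iommu_hw() and
 * restore the fault-event registers saved by iommu_suspend().
 */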
3162 static int iommu_resume(struct sys_device *dev)
3163 {
3164         struct dmar_drhd_unit *drhd;
3165         struct intel_iommu *iommu = NULL;
3166         unsigned long flag;
3167
3168         if (init_iommu_hw()) {
3169                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3170                 return -EIO;
3171         }
3172
3173         for_each_active_iommu(iommu, drhd) {
3174
3175                 spin_lock_irqsave(&iommu->register_lock, flag);
3176
3177                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3178                         iommu->reg + DMAR_FECTL_REG);
3179                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3180                         iommu->reg + DMAR_FEDATA_REG);
3181                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3182                         iommu->reg + DMAR_FEADDR_REG);
3183                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3184                         iommu->reg + DMAR_FEUADDR_REG);
3185
3186                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3187         }
3188
3189         for_each_active_iommu(iommu, drhd)
3190                 kfree(iommu->iommu_state);
3191
3192         return 0;
3193 }
3194
3195 static struct sysdev_class iommu_sysclass = {
3196         .name           = "iommu",
3197         .resume         = iommu_resume,
3198         .suspend        = iommu_suspend,
3199 };
3200
3201 static struct sys_device device_iommu = {
3202         .cls    = &iommu_sysclass,
3203 };
3204
3205 static int __init init_iommu_sysfs(void)
3206 {
3207         int error;
3208
3209         error = sysdev_class_register(&iommu_sysclass);
3210         if (error)
3211                 return error;
3212
3213         error = sysdev_register(&device_iommu);
3214         if (error)
3215                 sysdev_class_unregister(&iommu_sysclass);
3216
3217         return error;
3218 }
3219
3220 #else
3221 static int __init init_iommu_sysfs(void)
3222 {
3223         return 0;
3224 }
3225 #endif  /* CONFIG_SUSPEND */
3226
3227 /*
3228  * Here we only respond to a device being unbound from its driver.
3229  *
3230  * A newly added device is not attached to its DMAR domain here yet;
3231  * that happens when the device is first mapped to an iova.
3232  */
3233 static int device_notifier(struct notifier_block *nb,
3234                                   unsigned long action, void *data)
3235 {
3236         struct device *dev = data;
3237         struct pci_dev *pdev = to_pci_dev(dev);
3238         struct dmar_domain *domain;
3239
3240         if (iommu_no_mapping(dev))
3241                 return 0;
3242
3243         domain = find_domain(pdev);
3244         if (!domain)
3245                 return 0;
3246
3247         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3248                 domain_remove_one_dev_info(domain, pdev);
3249
3250         return 0;
3251 }
3252
3253 static struct notifier_block device_nb = {
3254         .notifier_call = device_notifier,
3255 };
3256
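/*
 * Main entry point for DMA remapping: parse the DMAR table and device
 * scopes, set up the slab caches and reserved iova ranges, initialize
 * the DMAR units, and switch the kernel's dma_ops to intel_dma_ops.
 * Also registers the suspend/resume sysdev, the generic IOMMU ops and
 * a PCI bus notifier for driver unbind events.
 */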
3257 int __init intel_iommu_init(void)
3258 {
3259         int ret = 0;
3260         int force_on = 0;
3261
3262         /* VT-d is required for a TXT/tboot launch, so enforce that */
3263         force_on = tboot_force_iommu();
3264
3265         if (dmar_table_init()) {
3266                 if (force_on)
3267                         panic("tboot: Failed to initialize DMAR table\n");
3268                 return  -ENODEV;
3269         }
3270
3271         if (dmar_dev_scope_init()) {
3272                 if (force_on)
3273                         panic("tboot: Failed to initialize DMAR device scope\n");
3274                 return  -ENODEV;
3275         }
3276
3277         /*
3278          * Check the need for DMA-remapping initialization now.
3279          * The initialization above is also used by interrupt remapping.
3280          */
3281         if (no_iommu || swiotlb || dmar_disabled)
3282                 return -ENODEV;
3283
3284         iommu_init_mempool();
3285         dmar_init_reserved_ranges();
3286
3287         init_no_remapping_devices();
3288
3289         ret = init_dmars();
3290         if (ret) {
3291                 if (force_on)
3292                         panic("tboot: Failed to initialize DMARs\n");
3293                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3294                 put_iova_domain(&reserved_iova_list);
3295                 iommu_exit_mempool();
3296                 return ret;
3297         }
3298         printk(KERN_INFO
3299         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3300
3301         init_timer(&unmap_timer);
3302         force_iommu = 1;
3303         dma_ops = &intel_dma_ops;
3304
3305         init_iommu_sysfs();
3306
3307         register_iommu(&intel_iommu_ops);
3308
3309         bus_register_notifier(&pci_bus_type, &device_nb);
3310
3311         return 0;
3312 }
3313
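/*
 * Clear the context entries of the bridges between @pdev and its
 * upstream PCIe bridge.  This mirrors the extra context mappings set up
 * when the device was attached, since DMA from a device behind a
 * PCIe-to-PCI bridge may carry the bridge's source-id.
 */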
3314 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3315                                            struct pci_dev *pdev)
3316 {
3317         struct pci_dev *tmp, *parent;
3318
3319         if (!iommu || !pdev)
3320                 return;
3321
3322         /* dependent device detach */
3323         tmp = pci_find_upstream_pcie_bridge(pdev);
3324         /* Secondary interface's bus number and devfn 0 */
3325         if (tmp) {
3326                 parent = pdev->bus->self;
3327                 while (parent != tmp) {
3328                         iommu_detach_dev(iommu, parent->bus->number,
3329                                          parent->devfn);
3330                         parent = parent->bus->self;
3331                 }
3332                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
3333                         iommu_detach_dev(iommu,
3334                                 tmp->subordinate->number, 0);
3335                 else /* this is a legacy PCI bridge */
3336                         iommu_detach_dev(iommu, tmp->bus->number,
3337                                          tmp->devfn);
3338         }
3339 }
3340
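/*
 * Detach @pdev from @domain: remove its device_domain_info, clear its
 * context entry (and those of any dependent bridges) and, if no other
 * device on the same IOMMU remains in the domain, drop that IOMMU from
 * the domain's bitmap and recompute the domain capabilities.
 */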
3341 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3342                                           struct pci_dev *pdev)
3343 {
3344         struct device_domain_info *info;
3345         struct intel_iommu *iommu;
3346         unsigned long flags;
3347         int found = 0;
3348         struct list_head *entry, *tmp;
3349
3350         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3351                                 pdev->devfn);
3352         if (!iommu)
3353                 return;
3354
3355         spin_lock_irqsave(&device_domain_lock, flags);
3356         list_for_each_safe(entry, tmp, &domain->devices) {
3357                 info = list_entry(entry, struct device_domain_info, link);
3358                 /* No need to compare PCI domain; it has to be the same */
3359                 if (info->bus == pdev->bus->number &&
3360                     info->devfn == pdev->devfn) {
3361                         list_del(&info->link);
3362                         list_del(&info->global);
3363                         if (info->dev)
3364                                 info->dev->dev.archdata.iommu = NULL;
3365                         spin_unlock_irqrestore(&device_domain_lock, flags);
3366
3367                         iommu_disable_dev_iotlb(info);
3368                         iommu_detach_dev(iommu, info->bus, info->devfn);
3369                         iommu_detach_dependent_devices(iommu, pdev);
3370                         free_devinfo_mem(info);
3371
3372                         spin_lock_irqsave(&device_domain_lock, flags);
3373
3374                         if (found)
3375                                 break;
3376                         else
3377                                 continue;
3378                 }
3379
3380                 /* if there are no other devices under the same iommu
3381                  * owned by this domain, clear this iommu in iommu_bmp,
3382                  * and update the iommu count and coherency
3383                  */
3384                 if (iommu == device_to_iommu(info->segment, info->bus,
3385                                             info->devfn))
3386                         found = 1;
3387         }
3388
3389         if (found == 0) {
3390                 unsigned long tmp_flags;
3391                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3392                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3393                 domain->iommu_count--;
3394                 domain_update_iommu_cap(domain);
3395                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3396         }
3397
3398         spin_unlock_irqrestore(&device_domain_lock, flags);
3399 }
3400
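/*
 * Detach every device from a virtual machine domain, clearing context
 * entries and updating the per-IOMMU bookkeeping as each one goes.
 */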
3401 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3402 {
3403         struct device_domain_info *info;
3404         struct intel_iommu *iommu;
3405         unsigned long flags1, flags2;
3406
3407         spin_lock_irqsave(&device_domain_lock, flags1);
3408         while (!list_empty(&domain->devices)) {
3409                 info = list_entry(domain->devices.next,
3410                         struct device_domain_info, link);
3411                 list_del(&info->link);
3412                 list_del(&info->global);
3413                 if (info->dev)
3414                         info->dev->dev.archdata.iommu = NULL;
3415
3416                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3417
3418                 iommu_disable_dev_iotlb(info);
3419                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3420                 iommu_detach_dev(iommu, info->bus, info->devfn);
3421                 iommu_detach_dependent_devices(iommu, info->dev);
3422
3423                 /* clear this iommu in iommu_bmp, update iommu count
3424                  * and capabilities
3425                  */
3426                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3427                 if (test_and_clear_bit(iommu->seq_id,
3428                                        &domain->iommu_bmp)) {
3429                         domain->iommu_count--;
3430                         domain_update_iommu_cap(domain);
3431                 }
3432                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3433
3434                 free_devinfo_mem(info);
3435                 spin_lock_irqsave(&device_domain_lock, flags1);
3436         }
3437         spin_unlock_irqrestore(&device_domain_lock, flags1);
3438 }
3439
3440 /* domain id for virtual machine domains; it is never written into a context entry */
3441 static unsigned long vm_domid;
3442
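/*
 * Return the smallest adjusted guest address width among the IOMMUs
 * this domain currently spans; used to check whether a requested
 * mapping still fits every unit's page-table depth.
 */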
3443 static int vm_domain_min_agaw(struct dmar_domain *domain)
3444 {
3445         int i;
3446         int min_agaw = domain->agaw;
3447
3448         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3449         for (; i < g_num_of_iommus; ) {
3450                 if (min_agaw > g_iommus[i]->agaw)
3451                         min_agaw = g_iommus[i]->agaw;
3452
3453                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3454         }
3455
3456         return min_agaw;
3457 }
3458
3459 static struct dmar_domain *iommu_alloc_vm_domain(void)
3460 {
3461         struct dmar_domain *domain;
3462
3463         domain = alloc_domain_mem();
3464         if (!domain)
3465                 return NULL;
3466
3467         domain->id = vm_domid++;
3468         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3469         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3470
3471         return domain;
3472 }
3473
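/*
 * Initialize a freshly allocated virtual machine domain for the given
 * guest address width: set up its iova allocator, reserve the special
 * ranges, compute the agaw and allocate the top-level page directory.
 */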
3474 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3475 {
3476         int adjust_width;
3477
3478         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3479         spin_lock_init(&domain->iommu_lock);
3480
3481         domain_reserve_special_ranges(domain);
3482
3483         /* calculate AGAW */
3484         domain->gaw = guest_width;
3485         adjust_width = guestwidth_to_adjustwidth(guest_width);
3486         domain->agaw = width_to_agaw(adjust_width);
3487
3488         INIT_LIST_HEAD(&domain->devices);
3489
3490         domain->iommu_count = 0;
3491         domain->iommu_coherency = 0;
3492         domain->iommu_snooping = 0;
3493         domain->max_addr = 0;
3494
3495         /* always allocate the top pgd */
3496         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3497         if (!domain->pgd)
3498                 return -ENOMEM;
3499         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3500         return 0;
3501 }
3502
3503 static void iommu_free_vm_domain(struct dmar_domain *domain)
3504 {
3505         unsigned long flags;
3506         struct dmar_drhd_unit *drhd;
3507         struct intel_iommu *iommu;
3508         unsigned long i;
3509         unsigned long ndomains;
3510
3511         for_each_drhd_unit(drhd) {
3512                 if (drhd->ignored)
3513                         continue;
3514                 iommu = drhd->iommu;
3515
3516                 ndomains = cap_ndoms(iommu->cap);
3517                 i = find_first_bit(iommu->domain_ids, ndomains);
3518                 for (; i < ndomains; ) {
3519                         if (iommu->domains[i] == domain) {
3520                                 spin_lock_irqsave(&iommu->lock, flags);
3521                                 clear_bit(i, iommu->domain_ids);
3522                                 iommu->domains[i] = NULL;
3523                                 spin_unlock_irqrestore(&iommu->lock, flags);
3524                                 break;
3525                         }
3526                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3527                 }
3528         }
3529 }
3530
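/*
 * Tear down a virtual machine domain: detach all devices, free its iova
 * space and page tables, release its domain ids on every IOMMU and free
 * the domain memory itself.
 */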
3531 static void vm_domain_exit(struct dmar_domain *domain)
3532 {
3533         /* Domain 0 is reserved, so don't process it */
3534         if (!domain)
3535                 return;
3536
3537         vm_domain_remove_all_dev_info(domain);
3538         /* destroy iovas */
3539         put_iova_domain(&domain->iovad);
3540
3541         /* clear ptes */
3542         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3543
3544         /* free page tables */
3545         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3546
3547         iommu_free_vm_domain(domain);
3548         free_domain_mem(domain);
3549 }
3550
3551 static int intel_iommu_domain_init(struct iommu_domain *domain)
3552 {
3553         struct dmar_domain *dmar_domain;
3554
3555         dmar_domain = iommu_alloc_vm_domain();
3556         if (!dmar_domain) {
3557                 printk(KERN_ERR
3558                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3559                 return -ENOMEM;
3560         }
3561         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3562                 printk(KERN_ERR
3563                         "intel_iommu_domain_init() failed\n");
3564                 vm_domain_exit(dmar_domain);
3565                 return -ENOMEM;
3566         }
3567         domain->priv = dmar_domain;
3568
3569         return 0;
3570 }
3571
3572 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3573 {
3574         struct dmar_domain *dmar_domain = domain->priv;
3575
3576         domain->priv = NULL;
3577         vm_domain_exit(dmar_domain);
3578 }
3579
3580 static int intel_iommu_attach_device(struct iommu_domain *domain,
3581                                      struct device *dev)
3582 {
3583         struct dmar_domain *dmar_domain = domain->priv;
3584         struct pci_dev *pdev = to_pci_dev(dev);
3585         struct intel_iommu *iommu;
3586         int addr_width;
3587         u64 end;
3588
3589         /* normally pdev is not mapped */
3590         if (unlikely(domain_context_mapped(pdev))) {
3591                 struct dmar_domain *old_domain;
3592
3593                 old_domain = find_domain(pdev);
3594                 if (old_domain) {
3595                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3596                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3597                                 domain_remove_one_dev_info(old_domain, pdev);
3598                         else
3599                                 domain_remove_dev_info(old_domain);
3600                 }
3601         }
3602
3603         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3604                                 pdev->devfn);
3605         if (!iommu)
3606                 return -ENODEV;
3607
3608         /* check if this iommu agaw is sufficient for max mapped address */
3609         addr_width = agaw_to_width(iommu->agaw);
3610         end = DOMAIN_MAX_ADDR(addr_width);
3611         end = end & VTD_PAGE_MASK;
3612         if (end < dmar_domain->max_addr) {
3613                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3614                        "sufficient for the mapped address (%llx)\n",
3615                        __func__, iommu->agaw, dmar_domain->max_addr);
3616                 return -EFAULT;
3617         }
3618
3619         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3620 }
3621
3622 static void intel_iommu_detach_device(struct iommu_domain *domain,
3623                                       struct device *dev)
3624 {
3625         struct dmar_domain *dmar_domain = domain->priv;
3626         struct pci_dev *pdev = to_pci_dev(dev);
3627
3628         domain_remove_one_dev_info(dmar_domain, pdev);
3629 }
3630
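/*
 * Map the physical range [hpa, hpa + size) at @iova in the domain's
 * page tables on behalf of the generic IOMMU API, after checking that
 * the new end address still fits the weakest IOMMU's address width.
 */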
3631 static int intel_iommu_map_range(struct iommu_domain *domain,
3632                                  unsigned long iova, phys_addr_t hpa,
3633                                  size_t size, int iommu_prot)
3634 {
3635         struct dmar_domain *dmar_domain = domain->priv;
3636         u64 max_addr;
3637         int addr_width;
3638         int prot = 0;
3639         int ret;
3640
3641         if (iommu_prot & IOMMU_READ)
3642                 prot |= DMA_PTE_READ;
3643         if (iommu_prot & IOMMU_WRITE)
3644                 prot |= DMA_PTE_WRITE;
3645         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3646                 prot |= DMA_PTE_SNP;
3647
3648         max_addr = iova + size;
3649         if (dmar_domain->max_addr < max_addr) {
3650                 int min_agaw;
3651                 u64 end;
3652
3653                 /* check if minimum agaw is sufficient for mapped address */
3654                 min_agaw = vm_domain_min_agaw(dmar_domain);
3655                 addr_width = agaw_to_width(min_agaw);
3656                 end = DOMAIN_MAX_ADDR(addr_width);
3657                 end = end & VTD_PAGE_MASK;
3658                 if (end < max_addr) {
3659                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3660                                "sufficient for the mapped address (%llx)\n",
3661                                __func__, min_agaw, max_addr);
3662                         return -EFAULT;
3663                 }
3664                 dmar_domain->max_addr = max_addr;
3665         }
3666         /* Round up size to next multiple of PAGE_SIZE, if it and
3667            the low bits of hpa would take us onto the next page */
3668         size = aligned_nrpages(hpa, size);
3669         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3670                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3671         return ret;
3672 }
3673
3674 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3675                                     unsigned long iova, size_t size)
3676 {
3677         struct dmar_domain *dmar_domain = domain->priv;
3678
3679         if (!size)
3680                 return;
3681
3682         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3683                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3684
3685         if (dmar_domain->max_addr == iova + size)
3686                 dmar_domain->max_addr = iova;
3687 }
3688
3689 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3690                                             unsigned long iova)
3691 {
3692         struct dmar_domain *dmar_domain = domain->priv;
3693         struct dma_pte *pte;
3694         u64 phys = 0;
3695
3696         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3697         if (pte)
3698                 phys = dma_pte_addr(pte);
3699
3700         return phys;
3701 }
3702
3703 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3704                                       unsigned long cap)
3705 {
3706         struct dmar_domain *dmar_domain = domain->priv;
3707
3708         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3709                 return dmar_domain->iommu_snooping;
3710
3711         return 0;
3712 }
3713
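/*
 * Callbacks behind the generic IOMMU API.  A user such as KVM device
 * assignment drives them roughly like:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map_range(dom, iova, hpa, size, IOMMU_READ | IOMMU_WRITE);
 */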
3714 static struct iommu_ops intel_iommu_ops = {
3715         .domain_init    = intel_iommu_domain_init,
3716         .domain_destroy = intel_iommu_domain_destroy,
3717         .attach_dev     = intel_iommu_attach_device,
3718         .detach_dev     = intel_iommu_detach_device,
3719         .map            = intel_iommu_map_range,
3720         .unmap          = intel_iommu_unmap_range,
3721         .iova_to_phys   = intel_iommu_iova_to_phys,
3722         .domain_has_cap = intel_iommu_domain_has_cap,
3723 };
3724
3725 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3726 {
3727         /*
3728          * Mobile 4 Series Chipset neglects to set RWBF capability,
3729          * but needs it:
3730          */
3731         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3732         rwbf_quirk = 1;
3733 }
3734
3735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3736
3737 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3738    ISOCH DMAR unit for the Azalia sound device, but not give it any
3739    TLB entries, which causes it to deadlock. Check for that.  We do
3740    this in a function called from init_dmars(), instead of in a PCI
3741    quirk, because we don't want to print the obnoxious "BIOS broken"
3742    message if VT-d is actually disabled.
3743 */
3744 static void __init check_tylersburg_isoch(void)
3745 {
3746         struct pci_dev *pdev;
3747         uint32_t vtisochctrl;
3748
3749         /* If there's no Azalia in the system anyway, forget it. */
3750         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3751         if (!pdev)
3752                 return;
3753         pci_dev_put(pdev);
3754
3755         /* System Management Registers. Might be hidden, in which case
3756            we can't do the sanity check. But that's OK, because the
3757            known-broken BIOSes _don't_ actually hide it, so far. */
3758         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3759         if (!pdev)
3760                 return;
3761
3762         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3763                 pci_dev_put(pdev);
3764                 return;
3765         }
3766
3767         pci_dev_put(pdev);
3768
3769         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3770         if (vtisochctrl & 1)
3771                 return;
3772
3773         /* Drop all bits other than the number of TLB entries */
3774         vtisochctrl &= 0x1c;
3775
3776         /* If we have the recommended number of TLB entries (16), fine. */
3777         if (vtisochctrl == 0x10)
3778                 return;
3779
3780         /* Zero TLB entries? You get to ride the short bus to school. */
3781         if (!vtisochctrl) {
3782                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3783                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3784                      dmi_get_system_info(DMI_BIOS_VENDOR),
3785                      dmi_get_system_info(DMI_BIOS_VERSION),
3786                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3787                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3788                 return;
3789         }
3790
3791         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3792                vtisochctrl);
3793 }