drivers/pci/intel-iommu.c (firefly-linux-kernel-4.4.55.git @ e541c3bdbf0d9ce1a00e1aca7c357151e38e4ad0)
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67  * 0: Present
68  * 1-11: Reserved
69  * 12-63: Context Ptr (12 - (haw-1))
70  * 64-127: Reserved
71  */
72 struct root_entry {
73         u64     val;
74         u64     rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
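/* With 4KiB pages this yields 256 root entries, one per PCI bus number. */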
77 static inline bool root_present(struct root_entry *root)
78 {
79         return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83         root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87         root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93         return (struct context_entry *)
94                 (root_present(root)?phys_to_virt(
95                 root->val & VTD_PAGE_MASK) :
96                 NULL);
97 }
98
99 /*
100  * low 64 bits:
101  * 0: present
102  * 1: fault processing disable
103  * 2-3: translation type
104  * 12-63: address space root
105  * high 64 bits:
106  * 0-2: address width
107  * 3-6: available
108  * 8-23: domain id
109  */
110 struct context_entry {
111         u64 lo;
112         u64 hi;
113 };
114
115 static inline bool context_present(struct context_entry *context)
116 {
117         return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121         context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126         context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132                                                 unsigned long value)
133 {
134         context->lo &= (((u64)-1) << 4) | 3;
135         context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139                                             unsigned long value)
140 {
141         context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145                                              unsigned long value)
146 {
147         context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151                                          unsigned long value)
152 {
153         context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158         context->lo = 0;
159         context->hi = 0;
160 }
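
/*
 * Illustrative sketch only (not driver code, never compiled): how the
 * helpers above pack a context entry.  The domain id, address-width
 * encoding and page-table root used here are made-up example values.
 */
#if 0
static void context_entry_example(void)
{
	struct context_entry ce = { 0, 0 };

	context_set_domain_id(&ce, 5);			/* hi[23:8] = 5  */
	context_set_address_width(&ce, 2);		/* hi[2:0]  = 2  */
	context_set_address_root(&ce, 0x12345000UL);	/* lo[63:12]     */
	context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_present(&ce);			/* lo[0]    = 1  */
	/* result: ce.lo == 0x12345001, ce.hi == 0x502 */
}
#endif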
161
162 /*
163  * 0: readable
164  * 1: writable
165  * 2-6: reserved
166  * 7: super page
167  * 8-10: available
168  * 11: snoop behavior
169  * 12-63: Host physical address
170  */
171 struct dma_pte {
172         u64 val;
173 };
174
175 static inline void dma_clear_pte(struct dma_pte *pte)
176 {
177         pte->val = 0;
178 }
179
180 static inline void dma_set_pte_readable(struct dma_pte *pte)
181 {
182         pte->val |= DMA_PTE_READ;
183 }
184
185 static inline void dma_set_pte_writable(struct dma_pte *pte)
186 {
187         pte->val |= DMA_PTE_WRITE;
188 }
189
190 static inline void dma_set_pte_snp(struct dma_pte *pte)
191 {
192         pte->val |= DMA_PTE_SNP;
193 }
194
195 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
196 {
197         pte->val = (pte->val & ~3) | (prot & 3);
198 }
199
200 static inline u64 dma_pte_addr(struct dma_pte *pte)
201 {
202         return (pte->val & VTD_PAGE_MASK);
203 }
204
205 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
206 {
207         pte->val |= (addr & VTD_PAGE_MASK);
208 }
209
210 static inline bool dma_pte_present(struct dma_pte *pte)
211 {
212         return (pte->val & 3) != 0;
213 }
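
/*
 * Illustrative sketch only (never compiled): a leaf pte mapping the
 * made-up host physical page 0xabcd000 with read/write permission.
 */
#if 0
static void dma_pte_example(void)
{
	struct dma_pte pte = { 0 };

	dma_set_pte_addr(&pte, 0xabcd000ULL);	/* bits 63:12, page frame */
	dma_set_pte_readable(&pte);		/* bit 0 */
	dma_set_pte_writable(&pte);		/* bit 1 */
	/* result: pte.val == 0xabcd003 */
}
#endif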
214
215 /* devices under the same p2p bridge belong to one domain */
216 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
217
218 /* domain represents a virtual machine; more than one device
219  * across iommus may be owned by one domain, e.g. a kvm guest.
220  */
221 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
222
223 struct dmar_domain {
224         int     id;                     /* domain id */
225         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
226
227         struct list_head devices;       /* all devices' list */
228         struct iova_domain iovad;       /* iova's that belong to this domain */
229
230         struct dma_pte  *pgd;           /* virtual address */
231         spinlock_t      mapping_lock;   /* page table lock */
232         int             gaw;            /* max guest address width */
233
234         /* adjusted guest address width, 0 is level 2 30-bit */
235         int             agaw;
236
237         int             flags;          /* flags to find out type of domain */
238
239         int             iommu_coherency;/* indicate coherency of iommu access */
240         int             iommu_snooping; /* indicate snooping control feature*/
241         int             iommu_count;    /* reference count of iommu */
242         spinlock_t      iommu_lock;     /* protect iommu set in domain */
243         u64             max_addr;       /* maximum mapped address */
244 };
245
246 /* PCI domain-device relationship */
247 struct device_domain_info {
248         struct list_head link;  /* link to domain siblings */
249         struct list_head global; /* link to global list */
250         u8 bus;                 /* PCI bus number */
251         u8 devfn;               /* PCI devfn number */
252         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
253         struct dmar_domain *domain; /* pointer to domain */
254 };
255
256 static void flush_unmaps_timeout(unsigned long data);
257
258 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
259
260 #define HIGH_WATER_MARK 250
261 struct deferred_flush_tables {
262         int next;
263         struct iova *iova[HIGH_WATER_MARK];
264         struct dmar_domain *domain[HIGH_WATER_MARK];
265 };
266
267 static struct deferred_flush_tables *deferred_flush;
268
269 /* number of iommus, used to size g_iommus and per-domain iommu bitmaps */
270 static int g_num_of_iommus;
271
272 static DEFINE_SPINLOCK(async_umap_flush_lock);
273 static LIST_HEAD(unmaps_to_do);
274
275 static int timer_on;
276 static long list_size;
277
278 static void domain_remove_dev_info(struct dmar_domain *domain);
279
280 #ifdef CONFIG_DMAR_DEFAULT_ON
281 int dmar_disabled = 0;
282 #else
283 int dmar_disabled = 1;
284 #endif /*CONFIG_DMAR_DEFAULT_ON*/
285
286 static int __initdata dmar_map_gfx = 1;
287 static int dmar_forcedac;
288 static int intel_iommu_strict;
289
290 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
291 static DEFINE_SPINLOCK(device_domain_lock);
292 static LIST_HEAD(device_domain_list);
293
294 static struct iommu_ops intel_iommu_ops;
295
296 static int __init intel_iommu_setup(char *str)
297 {
298         if (!str)
299                 return -EINVAL;
300         while (*str) {
301                 if (!strncmp(str, "on", 2)) {
302                         dmar_disabled = 0;
303                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
304                 } else if (!strncmp(str, "off", 3)) {
305                         dmar_disabled = 1;
306                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
307                 } else if (!strncmp(str, "igfx_off", 8)) {
308                         dmar_map_gfx = 0;
309                         printk(KERN_INFO
310                                 "Intel-IOMMU: disable GFX device mapping\n");
311                 } else if (!strncmp(str, "forcedac", 8)) {
312                         printk(KERN_INFO
313                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
314                         dmar_forcedac = 1;
315                 } else if (!strncmp(str, "strict", 6)) {
316                         printk(KERN_INFO
317                                 "Intel-IOMMU: disable batched IOTLB flush\n");
318                         intel_iommu_strict = 1;
319                 }
320
321                 str += strcspn(str, ",");
322                 while (*str == ',')
323                         str++;
324         }
325         return 0;
326 }
327 __setup("intel_iommu=", intel_iommu_setup);
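
/*
 * Example kernel command line for the parser above; options are
 * comma-separated:
 *
 *	intel_iommu=on,strict,igfx_off
 */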
328
329 static struct kmem_cache *iommu_domain_cache;
330 static struct kmem_cache *iommu_devinfo_cache;
331 static struct kmem_cache *iommu_iova_cache;
332
333 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
334 {
335         unsigned int flags;
336         void *vaddr;
337
338         /* trying to avoid low memory issues */
339         flags = current->flags & PF_MEMALLOC;
340         current->flags |= PF_MEMALLOC;
341         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
342         current->flags &= (~PF_MEMALLOC | flags);
343         return vaddr;
344 }
345
346
347 static inline void *alloc_pgtable_page(void)
348 {
349         unsigned int flags;
350         void *vaddr;
351
352         /* trying to avoid low memory issues */
353         flags = current->flags & PF_MEMALLOC;
354         current->flags |= PF_MEMALLOC;
355         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
356         current->flags &= (~PF_MEMALLOC | flags);
357         return vaddr;
358 }
359
360 static inline void free_pgtable_page(void *vaddr)
361 {
362         free_page((unsigned long)vaddr);
363 }
364
365 static inline void *alloc_domain_mem(void)
366 {
367         return iommu_kmem_cache_alloc(iommu_domain_cache);
368 }
369
370 static void free_domain_mem(void *vaddr)
371 {
372         kmem_cache_free(iommu_domain_cache, vaddr);
373 }
374
375 static inline void * alloc_devinfo_mem(void)
376 {
377         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
378 }
379
380 static inline void free_devinfo_mem(void *vaddr)
381 {
382         kmem_cache_free(iommu_devinfo_cache, vaddr);
383 }
384
385 struct iova *alloc_iova_mem(void)
386 {
387         return iommu_kmem_cache_alloc(iommu_iova_cache);
388 }
389
390 void free_iova_mem(struct iova *iova)
391 {
392         kmem_cache_free(iommu_iova_cache, iova);
393 }
394
395
396 static inline int width_to_agaw(int width);
397
398 /* calculate agaw for each iommu.
399  * "SAGAW" may be different across iommus, use a default agaw, and
400  * fall back to a smaller supported agaw for iommus that don't support the default agaw.
401  */
402 int iommu_calculate_agaw(struct intel_iommu *iommu)
403 {
404         unsigned long sagaw;
405         int agaw = -1;
406
407         sagaw = cap_sagaw(iommu->cap);
408         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
409              agaw >= 0; agaw--) {
410                 if (test_bit(agaw, &sagaw))
411                         break;
412         }
413
414         return agaw;
415 }
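
/*
 * Example: the default 48-bit width corresponds to agaw 2 (4-level page
 * tables).  If an iommu's SAGAW field only advertises bit 1 (39-bit,
 * 3-level), the loop above falls back to agaw 1.
 */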
416
417 /* in the native case, each domain is attached to only one iommu */
418 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
419 {
420         int iommu_id;
421
422         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
423
424         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
425         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
426                 return NULL;
427
428         return g_iommus[iommu_id];
429 }
430
431 static void domain_update_iommu_coherency(struct dmar_domain *domain)
432 {
433         int i;
434
435         domain->iommu_coherency = 1;
436
437         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
438         for (; i < g_num_of_iommus; ) {
439                 if (!ecap_coherent(g_iommus[i]->ecap)) {
440                         domain->iommu_coherency = 0;
441                         break;
442                 }
443                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
444         }
445 }
446
447 static void domain_update_iommu_snooping(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_snooping = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
456                         domain->iommu_snooping = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 /* Some capabilities may be different across iommus */
464 static void domain_update_iommu_cap(struct dmar_domain *domain)
465 {
466         domain_update_iommu_coherency(domain);
467         domain_update_iommu_snooping(domain);
468 }
469
470 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
471 {
472         struct dmar_drhd_unit *drhd = NULL;
473         int i;
474
475         for_each_drhd_unit(drhd) {
476                 if (drhd->ignored)
477                         continue;
478
479                 for (i = 0; i < drhd->devices_cnt; i++)
480                         if (drhd->devices[i] &&
481                             drhd->devices[i]->bus->number == bus &&
482                             drhd->devices[i]->devfn == devfn)
483                                 return drhd->iommu;
484
485                 if (drhd->include_all)
486                         return drhd->iommu;
487         }
488
489         return NULL;
490 }
491
492 static void domain_flush_cache(struct dmar_domain *domain,
493                                void *addr, int size)
494 {
495         if (!domain->iommu_coherency)
496                 clflush_cache_range(addr, size);
497 }
498
499 /* Gets context entry for a given bus and devfn */
500 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
501                 u8 bus, u8 devfn)
502 {
503         struct root_entry *root;
504         struct context_entry *context;
505         unsigned long phy_addr;
506         unsigned long flags;
507
508         spin_lock_irqsave(&iommu->lock, flags);
509         root = &iommu->root_entry[bus];
510         context = get_context_addr_from_root(root);
511         if (!context) {
512                 context = (struct context_entry *)alloc_pgtable_page();
513                 if (!context) {
514                         spin_unlock_irqrestore(&iommu->lock, flags);
515                         return NULL;
516                 }
517                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
518                 phy_addr = virt_to_phys((void *)context);
519                 set_root_value(root, phy_addr);
520                 set_root_present(root);
521                 __iommu_flush_cache(iommu, root, sizeof(*root));
522         }
523         spin_unlock_irqrestore(&iommu->lock, flags);
524         return &context[devfn];
525 }
526
527 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
528 {
529         struct root_entry *root;
530         struct context_entry *context;
531         int ret;
532         unsigned long flags;
533
534         spin_lock_irqsave(&iommu->lock, flags);
535         root = &iommu->root_entry[bus];
536         context = get_context_addr_from_root(root);
537         if (!context) {
538                 ret = 0;
539                 goto out;
540         }
541         ret = context_present(&context[devfn]);
542 out:
543         spin_unlock_irqrestore(&iommu->lock, flags);
544         return ret;
545 }
546
547 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
548 {
549         struct root_entry *root;
550         struct context_entry *context;
551         unsigned long flags;
552
553         spin_lock_irqsave(&iommu->lock, flags);
554         root = &iommu->root_entry[bus];
555         context = get_context_addr_from_root(root);
556         if (context) {
557                 context_clear_entry(&context[devfn]);
558                 __iommu_flush_cache(iommu, &context[devfn], \
559                         sizeof(*context));
560         }
561         spin_unlock_irqrestore(&iommu->lock, flags);
562 }
563
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566         struct root_entry *root;
567         int i;
568         unsigned long flags;
569         struct context_entry *context;
570
571         spin_lock_irqsave(&iommu->lock, flags);
572         if (!iommu->root_entry) {
573                 goto out;
574         }
575         for (i = 0; i < ROOT_ENTRY_NR; i++) {
576                 root = &iommu->root_entry[i];
577                 context = get_context_addr_from_root(root);
578                 if (context)
579                         free_pgtable_page(context);
580         }
581         free_pgtable_page(iommu->root_entry);
582         iommu->root_entry = NULL;
583 out:
584         spin_unlock_irqrestore(&iommu->lock, flags);
585 }
586
587 /* page table handling */
588 #define LEVEL_STRIDE            (9)
589 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
590
591 static inline int agaw_to_level(int agaw)
592 {
593         return agaw + 2;
594 }
595
596 static inline int agaw_to_width(int agaw)
597 {
598         return 30 + agaw * LEVEL_STRIDE;
599
600 }
601
602 static inline int width_to_agaw(int width)
603 {
604         return (width - 30) / LEVEL_STRIDE;
605 }
606
607 static inline unsigned int level_to_offset_bits(int level)
608 {
609         return (12 + (level - 1) * LEVEL_STRIDE);
610 }
611
612 static inline int address_level_offset(u64 addr, int level)
613 {
614         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
615 }
616
617 static inline u64 level_mask(int level)
618 {
619         return ((u64)-1 << level_to_offset_bits(level));
620 }
621
622 static inline u64 level_size(int level)
623 {
624         return ((u64)1 << level_to_offset_bits(level));
625 }
626
627 static inline u64 align_to_level(u64 addr, int level)
628 {
629         return ((addr + level_size(level) - 1) & level_mask(level));
630 }
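
/*
 * Worked example of the helpers above for a 48-bit domain (agaw == 2,
 * so agaw_to_level() == 4), using the made-up address 0x40201000:
 *
 *	level 4: bits 47..39 -> offset 0
 *	level 3: bits 38..30 -> offset 1
 *	level 2: bits 29..21 -> offset 1
 *	level 1: bits 20..12 -> offset 1	(leaf, 4KiB page)
 */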
631
632 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
633 {
634         int addr_width = agaw_to_width(domain->agaw);
635         struct dma_pte *parent, *pte = NULL;
636         int level = agaw_to_level(domain->agaw);
637         int offset;
638         unsigned long flags;
639
640         BUG_ON(!domain->pgd);
641
642         addr &= (((u64)1) << addr_width) - 1;
643         parent = domain->pgd;
644
645         spin_lock_irqsave(&domain->mapping_lock, flags);
646         while (level > 0) {
647                 void *tmp_page;
648
649                 offset = address_level_offset(addr, level);
650                 pte = &parent[offset];
651                 if (level == 1)
652                         break;
653
654                 if (!dma_pte_present(pte)) {
655                         tmp_page = alloc_pgtable_page();
656
657                         if (!tmp_page) {
658                                 spin_unlock_irqrestore(&domain->mapping_lock,
659                                         flags);
660                                 return NULL;
661                         }
662                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
663                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
664                         /*
665                          * higher-level tables always set r/w; the last-level
666                          * page table controls read/write
667                          */
668                         dma_set_pte_readable(pte);
669                         dma_set_pte_writable(pte);
670                         domain_flush_cache(domain, pte, sizeof(*pte));
671                 }
672                 parent = phys_to_virt(dma_pte_addr(pte));
673                 level--;
674         }
675
676         spin_unlock_irqrestore(&domain->mapping_lock, flags);
677         return pte;
678 }
679
680 /* return the address's pte at a specific level */
681 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
682                 int level)
683 {
684         struct dma_pte *parent, *pte = NULL;
685         int total = agaw_to_level(domain->agaw);
686         int offset;
687
688         parent = domain->pgd;
689         while (level <= total) {
690                 offset = address_level_offset(addr, total);
691                 pte = &parent[offset];
692                 if (level == total)
693                         return pte;
694
695                 if (!dma_pte_present(pte))
696                         break;
697                 parent = phys_to_virt(dma_pte_addr(pte));
698                 total--;
699         }
700         return NULL;
701 }
702
703 /* clear one page's last level pte */
704 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
705 {
706         struct dma_pte *pte = NULL;
707
708         /* get last level pte */
709         pte = dma_addr_level_pte(domain, addr, 1);
710
711         if (pte) {
712                 dma_clear_pte(pte);
713                 domain_flush_cache(domain, pte, sizeof(*pte));
714         }
715 }
716
717 /* clear last level ptes; a tlb flush should follow */
718 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
719 {
720         int addr_width = agaw_to_width(domain->agaw);
721
722         start &= (((u64)1) << addr_width) - 1;
723         end &= (((u64)1) << addr_width) - 1;
724         /* in case it's a partial page */
725         start = PAGE_ALIGN(start);
726         end &= PAGE_MASK;
727
728         /* we don't need a lock here; nobody else touches this iova range */
729         while (start < end) {
730                 dma_pte_clear_one(domain, start);
731                 start += VTD_PAGE_SIZE;
732         }
733 }
734
735 /* free page table pages. last level pte should already be cleared */
736 static void dma_pte_free_pagetable(struct dmar_domain *domain,
737         u64 start, u64 end)
738 {
739         int addr_width = agaw_to_width(domain->agaw);
740         struct dma_pte *pte;
741         int total = agaw_to_level(domain->agaw);
742         int level;
743         u64 tmp;
744
745         start &= (((u64)1) << addr_width) - 1;
746         end &= (((u64)1) << addr_width) - 1;
747
748         /* we don't need a lock here; nobody else touches this iova range */
749         level = 2;
750         while (level <= total) {
751                 tmp = align_to_level(start, level);
752                 if (tmp >= end || (tmp + level_size(level) > end))
753                         return;
754
755                 while (tmp < end) {
756                         pte = dma_addr_level_pte(domain, tmp, level);
757                         if (pte) {
758                                 free_pgtable_page(
759                                         phys_to_virt(dma_pte_addr(pte)));
760                                 dma_clear_pte(pte);
761                                 domain_flush_cache(domain, pte, sizeof(*pte));
762                         }
763                         tmp += level_size(level);
764                 }
765                 level++;
766         }
767         /* free pgd */
768         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
769                 free_pgtable_page(domain->pgd);
770                 domain->pgd = NULL;
771         }
772 }
773
774 /* iommu handling */
775 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
776 {
777         struct root_entry *root;
778         unsigned long flags;
779
780         root = (struct root_entry *)alloc_pgtable_page();
781         if (!root)
782                 return -ENOMEM;
783
784         __iommu_flush_cache(iommu, root, ROOT_SIZE);
785
786         spin_lock_irqsave(&iommu->lock, flags);
787         iommu->root_entry = root;
788         spin_unlock_irqrestore(&iommu->lock, flags);
789
790         return 0;
791 }
792
793 static void iommu_set_root_entry(struct intel_iommu *iommu)
794 {
795         void *addr;
796         u32 cmd, sts;
797         unsigned long flag;
798
799         addr = iommu->root_entry;
800
801         spin_lock_irqsave(&iommu->register_lock, flag);
802         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
803
804         cmd = iommu->gcmd | DMA_GCMD_SRTP;
805         writel(cmd, iommu->reg + DMAR_GCMD_REG);
806
807         /* Make sure hardware completes it */
808         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
809                 readl, (sts & DMA_GSTS_RTPS), sts);
810
811         spin_unlock_irqrestore(&iommu->register_lock, flag);
812 }
813
814 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
815 {
816         u32 val;
817         unsigned long flag;
818
819         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
820                 return;
821         val = iommu->gcmd | DMA_GCMD_WBF;
822
823         spin_lock_irqsave(&iommu->register_lock, flag);
824         writel(val, iommu->reg + DMAR_GCMD_REG);
825
826         /* Make sure hardware completes it */
827         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
828                         readl, (!(val & DMA_GSTS_WBFS)), val);
829
830         spin_unlock_irqrestore(&iommu->register_lock, flag);
831 }
832
833 /* return value determines whether we need a write buffer flush */
834 static int __iommu_flush_context(struct intel_iommu *iommu,
835         u16 did, u16 source_id, u8 function_mask, u64 type,
836         int non_present_entry_flush)
837 {
838         u64 val = 0;
839         unsigned long flag;
840
841         /*
842          * In the non-present entry flush case: if the hardware doesn't cache
843          * non-present entries we do nothing; if it does cache them, we flush
844          * the entries of domain 0 (the domain id used to cache any
845          * non-present entries).
846          */
847         if (non_present_entry_flush) {
848                 if (!cap_caching_mode(iommu->cap))
849                         return 1;
850                 else
851                         did = 0;
852         }
853
854         switch (type) {
855         case DMA_CCMD_GLOBAL_INVL:
856                 val = DMA_CCMD_GLOBAL_INVL;
857                 break;
858         case DMA_CCMD_DOMAIN_INVL:
859                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
860                 break;
861         case DMA_CCMD_DEVICE_INVL:
862                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
863                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
864                 break;
865         default:
866                 BUG();
867         }
868         val |= DMA_CCMD_ICC;
869
870         spin_lock_irqsave(&iommu->register_lock, flag);
871         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
872
873         /* Make sure hardware completes it */
874         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
875                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
876
877         spin_unlock_irqrestore(&iommu->register_lock, flag);
878
879         /* flushing a context entry implicitly flushes the write buffer */
880         return 0;
881 }
882
883 /* return value determines whether we need a write buffer flush */
884 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
885         u64 addr, unsigned int size_order, u64 type,
886         int non_present_entry_flush)
887 {
888         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
889         u64 val = 0, val_iva = 0;
890         unsigned long flag;
891
892         /*
893          * In the non-present entry flush case: if the hardware doesn't cache
894          * non-present entries we do nothing; if it does cache them, we flush
895          * the entries of domain 0 (the domain id used to cache any
896          * non-present entries).
897          */
898         if (non_present_entry_flush) {
899                 if (!cap_caching_mode(iommu->cap))
900                         return 1;
901                 else
902                         did = 0;
903         }
904
905         switch (type) {
906         case DMA_TLB_GLOBAL_FLUSH:
907                 /* global flush doesn't need to set IVA_REG */
908                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
909                 break;
910         case DMA_TLB_DSI_FLUSH:
911                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
912                 break;
913         case DMA_TLB_PSI_FLUSH:
914                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
915                 /* Note: always flush non-leaf currently */
916                 val_iva = size_order | addr;
917                 break;
918         default:
919                 BUG();
920         }
921         /* Note: set drain read/write */
922 #if 0
923         /*
924          * This is probably only there to be extra safe; it looks like
925          * we can ignore it without any impact.
926          */
927         if (cap_read_drain(iommu->cap))
928                 val |= DMA_TLB_READ_DRAIN;
929 #endif
930         if (cap_write_drain(iommu->cap))
931                 val |= DMA_TLB_WRITE_DRAIN;
932
933         spin_lock_irqsave(&iommu->register_lock, flag);
934         /* Note: Only uses first TLB reg currently */
935         if (val_iva)
936                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
937         dmar_writeq(iommu->reg + tlb_offset + 8, val);
938
939         /* Make sure hardware completes it */
940         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
941                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
942
943         spin_unlock_irqrestore(&iommu->register_lock, flag);
944
945         /* check IOTLB invalidation granularity */
946         if (DMA_TLB_IAIG(val) == 0)
947                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
948         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
949                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
950                         (unsigned long long)DMA_TLB_IIRG(type),
951                         (unsigned long long)DMA_TLB_IAIG(val));
952         /* flushing an iotlb entry implicitly flushes the write buffer */
953         return 0;
954 }
955
956 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
957         u64 addr, unsigned int pages, int non_present_entry_flush)
958 {
959         unsigned int mask;
960
961         BUG_ON(addr & (~VTD_PAGE_MASK));
962         BUG_ON(pages == 0);
963
964         /* Fallback to domain selective flush if no PSI support */
965         if (!cap_pgsel_inv(iommu->cap))
966                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
967                                                 DMA_TLB_DSI_FLUSH,
968                                                 non_present_entry_flush);
969
970         /*
971          * PSI requires page size to be 2 ^ x, and the base address is naturally
972          * aligned to the size
973          */
974         mask = ilog2(__roundup_pow_of_two(pages));
975         /* Fallback to domain selective flush if size is too big */
976         if (mask > cap_max_amask_val(iommu->cap))
977                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
978                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
979
980         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
981                                         DMA_TLB_PSI_FLUSH,
982                                         non_present_entry_flush);
983 }
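
/*
 * Example of the PSI mask computed above: a request covering 10 pages is
 * rounded up to 16 == 2^4, so mask == 4 and the hardware invalidates a
 * naturally aligned 16-page region containing addr.
 */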
984
985 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
986 {
987         u32 pmen;
988         unsigned long flags;
989
990         spin_lock_irqsave(&iommu->register_lock, flags);
991         pmen = readl(iommu->reg + DMAR_PMEN_REG);
992         pmen &= ~DMA_PMEN_EPM;
993         writel(pmen, iommu->reg + DMAR_PMEN_REG);
994
995         /* wait for the protected region status bit to clear */
996         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
997                 readl, !(pmen & DMA_PMEN_PRS), pmen);
998
999         spin_unlock_irqrestore(&iommu->register_lock, flags);
1000 }
1001
1002 static int iommu_enable_translation(struct intel_iommu *iommu)
1003 {
1004         u32 sts;
1005         unsigned long flags;
1006
1007         spin_lock_irqsave(&iommu->register_lock, flags);
1008         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1009
1010         /* Make sure hardware completes it */
1011         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1012                 readl, (sts & DMA_GSTS_TES), sts);
1013
1014         iommu->gcmd |= DMA_GCMD_TE;
1015         spin_unlock_irqrestore(&iommu->register_lock, flags);
1016         return 0;
1017 }
1018
1019 static int iommu_disable_translation(struct intel_iommu *iommu)
1020 {
1021         u32 sts;
1022         unsigned long flag;
1023
1024         spin_lock_irqsave(&iommu->register_lock, flag);
1025         iommu->gcmd &= ~DMA_GCMD_TE;
1026         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1027
1028         /* Make sure hardware completes it */
1029         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1030                 readl, (!(sts & DMA_GSTS_TES)), sts);
1031
1032         spin_unlock_irqrestore(&iommu->register_lock, flag);
1033         return 0;
1034 }
1035
1036 /* iommu interrupt handling. Most of it is MSI-like. */
1037
1038 static const char *fault_reason_strings[] =
1039 {
1040         "Software",
1041         "Present bit in root entry is clear",
1042         "Present bit in context entry is clear",
1043         "Invalid context entry",
1044         "Access beyond MGAW",
1045         "PTE Write access is not set",
1046         "PTE Read access is not set",
1047         "Next page table ptr is invalid",
1048         "Root table address invalid",
1049         "Context table ptr is invalid",
1050         "non-zero reserved fields in RTP",
1051         "non-zero reserved fields in CTP",
1052         "non-zero reserved fields in PTE",
1053 };
1054 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1055
1056 const char *dmar_get_fault_reason(u8 fault_reason)
1057 {
1058         if (fault_reason > MAX_FAULT_REASON_IDX)
1059                 return "Unknown";
1060         else
1061                 return fault_reason_strings[fault_reason];
1062 }
1063
1064 void dmar_msi_unmask(unsigned int irq)
1065 {
1066         struct intel_iommu *iommu = get_irq_data(irq);
1067         unsigned long flag;
1068
1069         /* unmask it */
1070         spin_lock_irqsave(&iommu->register_lock, flag);
1071         writel(0, iommu->reg + DMAR_FECTL_REG);
1072         /* Read back the register to flush the posted write */
1073         readl(iommu->reg + DMAR_FECTL_REG);
1074         spin_unlock_irqrestore(&iommu->register_lock, flag);
1075 }
1076
1077 void dmar_msi_mask(unsigned int irq)
1078 {
1079         unsigned long flag;
1080         struct intel_iommu *iommu = get_irq_data(irq);
1081
1082         /* mask it */
1083         spin_lock_irqsave(&iommu->register_lock, flag);
1084         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1085         /* Read back the register to flush the posted write */
1086         readl(iommu->reg + DMAR_FECTL_REG);
1087         spin_unlock_irqrestore(&iommu->register_lock, flag);
1088 }
1089
1090 void dmar_msi_write(int irq, struct msi_msg *msg)
1091 {
1092         struct intel_iommu *iommu = get_irq_data(irq);
1093         unsigned long flag;
1094
1095         spin_lock_irqsave(&iommu->register_lock, flag);
1096         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1097         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1098         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1099         spin_unlock_irqrestore(&iommu->register_lock, flag);
1100 }
1101
1102 void dmar_msi_read(int irq, struct msi_msg *msg)
1103 {
1104         struct intel_iommu *iommu = get_irq_data(irq);
1105         unsigned long flag;
1106
1107         spin_lock_irqsave(&iommu->register_lock, flag);
1108         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1109         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1110         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1111         spin_unlock_irqrestore(&iommu->register_lock, flag);
1112 }
1113
1114 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1115                 u8 fault_reason, u16 source_id, unsigned long long addr)
1116 {
1117         const char *reason;
1118
1119         reason = dmar_get_fault_reason(fault_reason);
1120
1121         printk(KERN_ERR
1122                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1123                 "fault addr %llx \n"
1124                 "DMAR:[fault reason %02d] %s\n",
1125                 (type ? "DMA Read" : "DMA Write"),
1126                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1127                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1128         return 0;
1129 }
1130
1131 #define PRIMARY_FAULT_REG_LEN (16)
1132 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1133 {
1134         struct intel_iommu *iommu = dev_id;
1135         int reg, fault_index;
1136         u32 fault_status;
1137         unsigned long flag;
1138
1139         spin_lock_irqsave(&iommu->register_lock, flag);
1140         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1141
1142         /* TBD: ignore advanced fault log currently */
1143         if (!(fault_status & DMA_FSTS_PPF))
1144                 goto clear_overflow;
1145
1146         fault_index = dma_fsts_fault_record_index(fault_status);
1147         reg = cap_fault_reg_offset(iommu->cap);
1148         while (1) {
1149                 u8 fault_reason;
1150                 u16 source_id;
1151                 u64 guest_addr;
1152                 int type;
1153                 u32 data;
1154
1155                 /* highest 32 bits */
1156                 data = readl(iommu->reg + reg +
1157                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1158                 if (!(data & DMA_FRCD_F))
1159                         break;
1160
1161                 fault_reason = dma_frcd_fault_reason(data);
1162                 type = dma_frcd_type(data);
1163
1164                 data = readl(iommu->reg + reg +
1165                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1166                 source_id = dma_frcd_source_id(data);
1167
1168                 guest_addr = dmar_readq(iommu->reg + reg +
1169                                 fault_index * PRIMARY_FAULT_REG_LEN);
1170                 guest_addr = dma_frcd_page_addr(guest_addr);
1171                 /* clear the fault */
1172                 writel(DMA_FRCD_F, iommu->reg + reg +
1173                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1174
1175                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1176
1177                 iommu_page_fault_do_one(iommu, type, fault_reason,
1178                                 source_id, guest_addr);
1179
1180                 fault_index++;
1181                 if (fault_index > cap_num_fault_regs(iommu->cap))
1182                         fault_index = 0;
1183                 spin_lock_irqsave(&iommu->register_lock, flag);
1184         }
1185 clear_overflow:
1186         /* clear primary fault overflow */
1187         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1188         if (fault_status & DMA_FSTS_PFO)
1189                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1190
1191         spin_unlock_irqrestore(&iommu->register_lock, flag);
1192         return IRQ_HANDLED;
1193 }
1194
1195 int dmar_set_interrupt(struct intel_iommu *iommu)
1196 {
1197         int irq, ret;
1198
1199         irq = create_irq();
1200         if (!irq) {
1201                 printk(KERN_ERR "IOMMU: no free vectors\n");
1202                 return -EINVAL;
1203         }
1204
1205         set_irq_data(irq, iommu);
1206         iommu->irq = irq;
1207
1208         ret = arch_setup_dmar_msi(irq);
1209         if (ret) {
1210                 set_irq_data(irq, NULL);
1211                 iommu->irq = 0;
1212                 destroy_irq(irq);
1213                 return 0;
1214         }
1215
1216         /* Make sure any pending faults are cleared */
1217         iommu_page_fault(irq, iommu);
1218
1219         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1220         if (ret)
1221                 printk(KERN_ERR "IOMMU: can't request irq\n");
1222         return ret;
1223 }
1224
1225 static int iommu_init_domains(struct intel_iommu *iommu)
1226 {
1227         unsigned long ndomains;
1228         unsigned long nlongs;
1229
1230         ndomains = cap_ndoms(iommu->cap);
1231         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1232         nlongs = BITS_TO_LONGS(ndomains);
1233
1234         /* TBD: there might be 64K domains,
1235          * consider a different allocation scheme for future chips
1236          */
1237         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1238         if (!iommu->domain_ids) {
1239                 printk(KERN_ERR "Allocating domain id array failed\n");
1240                 return -ENOMEM;
1241         }
1242         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1243                         GFP_KERNEL);
1244         if (!iommu->domains) {
1245                 printk(KERN_ERR "Allocating domain array failed\n");
1246                 kfree(iommu->domain_ids);
1247                 return -ENOMEM;
1248         }
1249
1250         spin_lock_init(&iommu->lock);
1251
1252         /*
1253          * if Caching mode is set, then invalid translations are tagged
1254          * with domainid 0. Hence we need to pre-allocate it.
1255          */
1256         if (cap_caching_mode(iommu->cap))
1257                 set_bit(0, iommu->domain_ids);
1258         return 0;
1259 }
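
/*
 * Sizing example for the allocations above: an iommu reporting 256 domain
 * ids needs BITS_TO_LONGS(256) == 4 longs for domain_ids (on 64-bit) plus
 * a 256-entry pointer array for domains.
 */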
1260
1261
1262 static void domain_exit(struct dmar_domain *domain);
1263 static void vm_domain_exit(struct dmar_domain *domain);
1264
1265 void free_dmar_iommu(struct intel_iommu *iommu)
1266 {
1267         struct dmar_domain *domain;
1268         int i;
1269         unsigned long flags;
1270
1271         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1272         for (; i < cap_ndoms(iommu->cap); ) {
1273                 domain = iommu->domains[i];
1274                 clear_bit(i, iommu->domain_ids);
1275
1276                 spin_lock_irqsave(&domain->iommu_lock, flags);
1277                 if (--domain->iommu_count == 0) {
1278                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1279                                 vm_domain_exit(domain);
1280                         else
1281                                 domain_exit(domain);
1282                 }
1283                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1284
1285                 i = find_next_bit(iommu->domain_ids,
1286                         cap_ndoms(iommu->cap), i+1);
1287         }
1288
1289         if (iommu->gcmd & DMA_GCMD_TE)
1290                 iommu_disable_translation(iommu);
1291
1292         if (iommu->irq) {
1293                 set_irq_data(iommu->irq, NULL);
1294                 /* This will mask the irq */
1295                 free_irq(iommu->irq, iommu);
1296                 destroy_irq(iommu->irq);
1297         }
1298
1299         kfree(iommu->domains);
1300         kfree(iommu->domain_ids);
1301
1302         g_iommus[iommu->seq_id] = NULL;
1303
1304         /* if all iommus are freed, free g_iommus */
1305         for (i = 0; i < g_num_of_iommus; i++) {
1306                 if (g_iommus[i])
1307                         break;
1308         }
1309
1310         if (i == g_num_of_iommus)
1311                 kfree(g_iommus);
1312
1313         /* free context mapping */
1314         free_context_table(iommu);
1315 }
1316
1317 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1318 {
1319         unsigned long num;
1320         unsigned long ndomains;
1321         struct dmar_domain *domain;
1322         unsigned long flags;
1323
1324         domain = alloc_domain_mem();
1325         if (!domain)
1326                 return NULL;
1327
1328         ndomains = cap_ndoms(iommu->cap);
1329
1330         spin_lock_irqsave(&iommu->lock, flags);
1331         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1332         if (num >= ndomains) {
1333                 spin_unlock_irqrestore(&iommu->lock, flags);
1334                 free_domain_mem(domain);
1335                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1336                 return NULL;
1337         }
1338
1339         set_bit(num, iommu->domain_ids);
1340         domain->id = num;
1341         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1342         set_bit(iommu->seq_id, &domain->iommu_bmp);
1343         domain->flags = 0;
1344         iommu->domains[num] = domain;
1345         spin_unlock_irqrestore(&iommu->lock, flags);
1346
1347         return domain;
1348 }
1349
1350 static void iommu_free_domain(struct dmar_domain *domain)
1351 {
1352         unsigned long flags;
1353         struct intel_iommu *iommu;
1354
1355         iommu = domain_get_iommu(domain);
1356
1357         spin_lock_irqsave(&iommu->lock, flags);
1358         clear_bit(domain->id, iommu->domain_ids);
1359         spin_unlock_irqrestore(&iommu->lock, flags);
1360 }
1361
1362 static struct iova_domain reserved_iova_list;
1363 static struct lock_class_key reserved_alloc_key;
1364 static struct lock_class_key reserved_rbtree_key;
1365
1366 static void dmar_init_reserved_ranges(void)
1367 {
1368         struct pci_dev *pdev = NULL;
1369         struct iova *iova;
1370         int i;
1371         u64 addr, size;
1372
1373         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1374
1375         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1376                 &reserved_alloc_key);
1377         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1378                 &reserved_rbtree_key);
1379
1380         /* IOAPIC ranges shouldn't be accessed by DMA */
1381         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1382                 IOVA_PFN(IOAPIC_RANGE_END));
1383         if (!iova)
1384                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1385
1386         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1387         for_each_pci_dev(pdev) {
1388                 struct resource *r;
1389
1390                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1391                         r = &pdev->resource[i];
1392                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1393                                 continue;
1394                         addr = r->start;
1395                         addr &= PAGE_MASK;
1396                         size = r->end - addr;
1397                         size = PAGE_ALIGN(size);
1398                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1399                                 IOVA_PFN(size + addr) - 1);
1400                         if (!iova)
1401                                 printk(KERN_ERR "Reserve iova failed\n");
1402                 }
1403         }
1404
1405 }
1406
1407 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1408 {
1409         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1410 }
1411
1412 static inline int guestwidth_to_adjustwidth(int gaw)
1413 {
1414         int agaw;
1415         int r = (gaw - 12) % 9;
1416
1417         if (r == 0)
1418                 agaw = gaw;
1419         else
1420                 agaw = gaw + 9 - r;
1421         if (agaw > 64)
1422                 agaw = 64;
1423         return agaw;
1424 }
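
/*
 * Worked example: each page-table level covers 9 bits on top of the
 * 12-bit page offset, so the guest width is rounded up to 12 + n*9.
 * E.g. gaw 36 -> r == (36 - 12) % 9 == 6 -> agaw == 36 + 9 - 6 == 39,
 * while gaw 48 is already a multiple and stays 48.
 */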
1425
1426 static int domain_init(struct dmar_domain *domain, int guest_width)
1427 {
1428         struct intel_iommu *iommu;
1429         int adjust_width, agaw;
1430         unsigned long sagaw;
1431
1432         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1433         spin_lock_init(&domain->mapping_lock);
1434         spin_lock_init(&domain->iommu_lock);
1435
1436         domain_reserve_special_ranges(domain);
1437
1438         /* calculate AGAW */
1439         iommu = domain_get_iommu(domain);
1440         if (guest_width > cap_mgaw(iommu->cap))
1441                 guest_width = cap_mgaw(iommu->cap);
1442         domain->gaw = guest_width;
1443         adjust_width = guestwidth_to_adjustwidth(guest_width);
1444         agaw = width_to_agaw(adjust_width);
1445         sagaw = cap_sagaw(iommu->cap);
1446         if (!test_bit(agaw, &sagaw)) {
1447                 /* hardware doesn't support it, choose a bigger one */
1448                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1449                 agaw = find_next_bit(&sagaw, 5, agaw);
1450                 if (agaw >= 5)
1451                         return -ENODEV;
1452         }
1453         domain->agaw = agaw;
1454         INIT_LIST_HEAD(&domain->devices);
1455
1456         if (ecap_coherent(iommu->ecap))
1457                 domain->iommu_coherency = 1;
1458         else
1459                 domain->iommu_coherency = 0;
1460
1461         if (ecap_sc_support(iommu->ecap))
1462                 domain->iommu_snooping = 1;
1463         else
1464                 domain->iommu_snooping = 0;
1465
1466         domain->iommu_count = 1;
1467
1468         /* always allocate the top pgd */
1469         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1470         if (!domain->pgd)
1471                 return -ENOMEM;
1472         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1473         return 0;
1474 }
1475
1476 static void domain_exit(struct dmar_domain *domain)
1477 {
1478         u64 end;
1479
1480         /* Domain 0 is reserved, so don't process it */
1481         if (!domain)
1482                 return;
1483
1484         domain_remove_dev_info(domain);
1485         /* destroy iovas */
1486         put_iova_domain(&domain->iovad);
1487         end = DOMAIN_MAX_ADDR(domain->gaw);
1488         end = end & (~PAGE_MASK);
1489
1490         /* clear ptes */
1491         dma_pte_clear_range(domain, 0, end);
1492
1493         /* free page tables */
1494         dma_pte_free_pagetable(domain, 0, end);
1495
1496         iommu_free_domain(domain);
1497         free_domain_mem(domain);
1498 }
1499
1500 static int domain_context_mapping_one(struct dmar_domain *domain,
1501                 u8 bus, u8 devfn)
1502 {
1503         struct context_entry *context;
1504         unsigned long flags;
1505         struct intel_iommu *iommu;
1506         struct dma_pte *pgd;
1507         unsigned long num;
1508         unsigned long ndomains;
1509         int id;
1510         int agaw;
1511
1512         pr_debug("Set context mapping for %02x:%02x.%d\n",
1513                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1514         BUG_ON(!domain->pgd);
1515
1516         iommu = device_to_iommu(bus, devfn);
1517         if (!iommu)
1518                 return -ENODEV;
1519
1520         context = device_to_context_entry(iommu, bus, devfn);
1521         if (!context)
1522                 return -ENOMEM;
1523         spin_lock_irqsave(&iommu->lock, flags);
1524         if (context_present(context)) {
1525                 spin_unlock_irqrestore(&iommu->lock, flags);
1526                 return 0;
1527         }
1528
1529         id = domain->id;
1530         pgd = domain->pgd;
1531
1532         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1533                 int found = 0;
1534
1535                 /* find an available domain id for this device in iommu */
1536                 ndomains = cap_ndoms(iommu->cap);
1537                 num = find_first_bit(iommu->domain_ids, ndomains);
1538                 for (; num < ndomains; ) {
1539                         if (iommu->domains[num] == domain) {
1540                                 id = num;
1541                                 found = 1;
1542                                 break;
1543                         }
1544                         num = find_next_bit(iommu->domain_ids,
1545                                             cap_ndoms(iommu->cap), num+1);
1546                 }
1547
1548                 if (found == 0) {
1549                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1550                         if (num >= ndomains) {
1551                                 spin_unlock_irqrestore(&iommu->lock, flags);
1552                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1553                                 return -EFAULT;
1554                         }
1555
1556                         set_bit(num, iommu->domain_ids);
1557                         iommu->domains[num] = domain;
1558                         id = num;
1559                 }
1560
1561                 /* Skip top levels of page tables for an iommu
1562                  * whose agaw is smaller than the default agaw.
1563                  */
1564                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1565                         pgd = phys_to_virt(dma_pte_addr(pgd));
1566                         if (!dma_pte_present(pgd)) {
1567                                 spin_unlock_irqrestore(&iommu->lock, flags);
1568                                 return -ENOMEM;
1569                         }
1570                 }
1571         }
1572
1573         context_set_domain_id(context, id);
1574         context_set_address_width(context, iommu->agaw);
1575         context_set_address_root(context, virt_to_phys(pgd));
1576         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1577         context_set_fault_enable(context);
1578         context_set_present(context);
1579         domain_flush_cache(domain, context, sizeof(*context));
1580
1581         /* it's a non-present to present mapping */
1582         if (iommu->flush.flush_context(iommu, domain->id,
1583                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1584                 DMA_CCMD_DEVICE_INVL, 1))
1585                 iommu_flush_write_buffer(iommu);
1586         else
1587                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1588
1589         spin_unlock_irqrestore(&iommu->lock, flags);
1590
1591         spin_lock_irqsave(&domain->iommu_lock, flags);
1592         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1593                 domain->iommu_count++;
1594                 domain_update_iommu_cap(domain);
1595         }
1596         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1597         return 0;
1598 }
1599
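/*
 * Set up context entries for the device itself and, when it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path and for the bridge's
 * secondary bus, so that requests carrying the bridge's source-id are
 * translated by the same domain.
 */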
1600 static int
1601 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1602 {
1603         int ret;
1604         struct pci_dev *tmp, *parent;
1605
1606         ret = domain_context_mapping_one(domain, pdev->bus->number,
1607                 pdev->devfn);
1608         if (ret)
1609                 return ret;
1610
1611         /* dependent device mapping */
1612         tmp = pci_find_upstream_pcie_bridge(pdev);
1613         if (!tmp)
1614                 return 0;
1615         /* Secondary interface's bus number and devfn 0 */
1616         parent = pdev->bus->self;
1617         while (parent != tmp) {
1618                 ret = domain_context_mapping_one(domain, parent->bus->number,
1619                         parent->devfn);
1620                 if (ret)
1621                         return ret;
1622                 parent = parent->bus->self;
1623         }
1624         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1625                 return domain_context_mapping_one(domain,
1626                         tmp->subordinate->number, 0);
1627         else /* this is a legacy PCI bridge */
1628                 return domain_context_mapping_one(domain,
1629                         tmp->bus->number, tmp->devfn);
1630 }
1631
1632 static int domain_context_mapped(struct pci_dev *pdev)
1633 {
1634         int ret;
1635         struct pci_dev *tmp, *parent;
1636         struct intel_iommu *iommu;
1637
1638         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1639         if (!iommu)
1640                 return -ENODEV;
1641
1642         ret = device_context_mapped(iommu,
1643                 pdev->bus->number, pdev->devfn);
1644         if (!ret)
1645                 return ret;
1646         /* dependent device mapping */
1647         tmp = pci_find_upstream_pcie_bridge(pdev);
1648         if (!tmp)
1649                 return ret;
1650         /* Secondary interface's bus number and devfn 0 */
1651         parent = pdev->bus->self;
1652         while (parent != tmp) {
1653                 ret = device_context_mapped(iommu, parent->bus->number,
1654                         parent->devfn);
1655                 if (!ret)
1656                         return ret;
1657                 parent = parent->bus->self;
1658         }
1659         if (tmp->is_pcie)
1660                 return device_context_mapped(iommu,
1661                         tmp->subordinate->number, 0);
1662         else
1663                 return device_context_mapped(iommu,
1664                         tmp->bus->number, tmp->devfn);
1665 }
1666
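/*
 * Map the physical range [hpa, hpa + size) at the given iova, one VT-d
 * page at a time.  The caller owns the iova range, so the page tables are
 * filled in without taking a lock.
 */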
1667 static int
1668 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1669                         u64 hpa, size_t size, int prot)
1670 {
1671         u64 start_pfn, end_pfn;
1672         struct dma_pte *pte;
1673         int index;
1674         int addr_width = agaw_to_width(domain->agaw);
1675
1676         hpa &= (((u64)1) << addr_width) - 1;
1677
1678         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1679                 return -EINVAL;
1680         iova &= PAGE_MASK;
1681         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1682         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1683         index = 0;
1684         while (start_pfn < end_pfn) {
1685                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1686                 if (!pte)
1687                         return -ENOMEM;
1688                 /* We don't need a lock here; nobody else
1689                  * touches this iova range.
1690                  */
1691                 BUG_ON(dma_pte_addr(pte));
1692                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1693                 dma_set_pte_prot(pte, prot);
1694                 if (prot & DMA_PTE_SNP)
1695                         dma_set_pte_snp(pte);
1696                 domain_flush_cache(domain, pte, sizeof(*pte));
1697                 start_pfn++;
1698                 index++;
1699         }
1700         return 0;
1701 }
1702
1703 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1704 {
1705         if (!iommu)
1706                 return;
1707
1708         clear_context_table(iommu, bus, devfn);
1709         iommu->flush.flush_context(iommu, 0, 0, 0,
1710                                            DMA_CCMD_GLOBAL_INVL, 0);
1711         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1712                                          DMA_TLB_GLOBAL_FLUSH, 0);
1713 }
1714
1715 static void domain_remove_dev_info(struct dmar_domain *domain)
1716 {
1717         struct device_domain_info *info;
1718         unsigned long flags;
1719         struct intel_iommu *iommu;
1720
1721         spin_lock_irqsave(&device_domain_lock, flags);
1722         while (!list_empty(&domain->devices)) {
1723                 info = list_entry(domain->devices.next,
1724                         struct device_domain_info, link);
1725                 list_del(&info->link);
1726                 list_del(&info->global);
1727                 if (info->dev)
1728                         info->dev->dev.archdata.iommu = NULL;
1729                 spin_unlock_irqrestore(&device_domain_lock, flags);
1730
1731                 iommu = device_to_iommu(info->bus, info->devfn);
1732                 iommu_detach_dev(iommu, info->bus, info->devfn);
1733                 free_devinfo_mem(info);
1734
1735                 spin_lock_irqsave(&device_domain_lock, flags);
1736         }
1737         spin_unlock_irqrestore(&device_domain_lock, flags);
1738 }
1739
1740 /*
1741  * find_domain
1742  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1743  */
1744 static struct dmar_domain *
1745 find_domain(struct pci_dev *pdev)
1746 {
1747         struct device_domain_info *info;
1748
1749         /* No lock here, assumes no domain exit in normal case */
1750         info = pdev->dev.archdata.iommu;
1751         if (info)
1752                 return info->domain;
1753         return NULL;
1754 }
1755
1756 /* return an initialized domain for the device, allocating one if needed */
1757 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1758 {
1759         struct dmar_domain *domain, *found = NULL;
1760         struct intel_iommu *iommu;
1761         struct dmar_drhd_unit *drhd;
1762         struct device_domain_info *info, *tmp;
1763         struct pci_dev *dev_tmp;
1764         unsigned long flags;
1765         int bus = 0, devfn = 0;
1766
1767         domain = find_domain(pdev);
1768         if (domain)
1769                 return domain;
1770
1771         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1772         if (dev_tmp) {
1773                 if (dev_tmp->is_pcie) {
1774                         bus = dev_tmp->subordinate->number;
1775                         devfn = 0;
1776                 } else {
1777                         bus = dev_tmp->bus->number;
1778                         devfn = dev_tmp->devfn;
1779                 }
1780                 spin_lock_irqsave(&device_domain_lock, flags);
1781                 list_for_each_entry(info, &device_domain_list, global) {
1782                         if (info->bus == bus && info->devfn == devfn) {
1783                                 found = info->domain;
1784                                 break;
1785                         }
1786                 }
1787                 spin_unlock_irqrestore(&device_domain_lock, flags);
1788                 /* pcie-pci bridge already has a domain, use it */
1789                 if (found) {
1790                         domain = found;
1791                         goto found_domain;
1792                 }
1793         }
1794
1795         /* Allocate new domain for the device */
1796         drhd = dmar_find_matched_drhd_unit(pdev);
1797         if (!drhd) {
1798                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1799                         pci_name(pdev));
1800                 return NULL;
1801         }
1802         iommu = drhd->iommu;
1803
1804         domain = iommu_alloc_domain(iommu);
1805         if (!domain)
1806                 goto error;
1807
1808         if (domain_init(domain, gaw)) {
1809                 domain_exit(domain);
1810                 goto error;
1811         }
1812
1813         /* register pcie-to-pci device */
1814         if (dev_tmp) {
1815                 info = alloc_devinfo_mem();
1816                 if (!info) {
1817                         domain_exit(domain);
1818                         goto error;
1819                 }
1820                 info->bus = bus;
1821                 info->devfn = devfn;
1822                 info->dev = NULL;
1823                 info->domain = domain;
1824                 /* This domain is shared by devices under p2p bridge */
1825                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1826
1827                 /* pcie-to-pci bridge already has a domain, use it */
1828                 found = NULL;
1829                 spin_lock_irqsave(&device_domain_lock, flags);
1830                 list_for_each_entry(tmp, &device_domain_list, global) {
1831                         if (tmp->bus == bus && tmp->devfn == devfn) {
1832                                 found = tmp->domain;
1833                                 break;
1834                         }
1835                 }
1836                 if (found) {
1837                         free_devinfo_mem(info);
1838                         domain_exit(domain);
1839                         domain = found;
1840                 } else {
1841                         list_add(&info->link, &domain->devices);
1842                         list_add(&info->global, &device_domain_list);
1843                 }
1844                 spin_unlock_irqrestore(&device_domain_lock, flags);
1845         }
1846
1847 found_domain:
1848         info = alloc_devinfo_mem();
1849         if (!info)
1850                 goto error;
1851         info->bus = pdev->bus->number;
1852         info->devfn = pdev->devfn;
1853         info->dev = pdev;
1854         info->domain = domain;
1855         spin_lock_irqsave(&device_domain_lock, flags);
1856         /* somebody else was faster and already set it up */
1857         found = find_domain(pdev);
1858         if (found != NULL) {
1859                 spin_unlock_irqrestore(&device_domain_lock, flags);
1860                 if (found != domain) {
1861                         domain_exit(domain);
1862                         domain = found;
1863                 }
1864                 free_devinfo_mem(info);
1865                 return domain;
1866         }
1867         list_add(&info->link, &domain->devices);
1868         list_add(&info->global, &device_domain_list);
1869         pdev->dev.archdata.iommu = info;
1870         spin_unlock_irqrestore(&device_domain_lock, flags);
1871         return domain;
1872 error:
1873         /* recheck here; somebody else may have set it up meanwhile */
1874         return find_domain(pdev);
1875 }
1876
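/*
 * Build a 1:1 (identity) mapping for [start, end) in the device's domain:
 * reserve the iova range, clear any stale PTEs, install read/write
 * mappings and wire up the context entry.  Used for RMRR regions and for
 * the graphics/ISA work-arounds below.
 */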
1877 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1878                                       unsigned long long start,
1879                                       unsigned long long end)
1880 {
1881         struct dmar_domain *domain;
1882         unsigned long size;
1883         unsigned long long base;
1884         int ret;
1885
1886         printk(KERN_INFO
1887                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1888                 pci_name(pdev), start, end);
1889         /* page table init */
1890         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1891         if (!domain)
1892                 return -ENOMEM;
1893
1894         /* The address might not be aligned */
1895         base = start & PAGE_MASK;
1896         size = end - base;
1897         size = PAGE_ALIGN(size);
1898         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1899                         IOVA_PFN(base + size) - 1)) {
1900                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1901                 ret = -ENOMEM;
1902                 goto error;
1903         }
1904
1905         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1906                 size, base, pci_name(pdev));
1907         /*
1908          * The RMRR range might overlap with the physical memory range;
1909          * clear it first
1910          */
1911         dma_pte_clear_range(domain, base, base + size);
1912
1913         ret = domain_page_mapping(domain, base, base, size,
1914                 DMA_PTE_READ|DMA_PTE_WRITE);
1915         if (ret)
1916                 goto error;
1917
1918         /* context entry init */
1919         ret = domain_context_mapping(domain, pdev);
1920         if (!ret)
1921                 return 0;
1922 error:
1923         domain_exit(domain);
1924         return ret;
1925
1926 }
1927
1928 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1929         struct pci_dev *pdev)
1930 {
1931         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1932                 return 0;
1933         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1934                 rmrr->end_address + 1);
1935 }
1936
1937 #ifdef CONFIG_DMAR_GFX_WA
1938 struct iommu_prepare_data {
1939         struct pci_dev *pdev;
1940         int ret;
1941 };
1942
1943 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1944                                          unsigned long end_pfn, void *datax)
1945 {
1946         struct iommu_prepare_data *data;
1947
1948         data = (struct iommu_prepare_data *)datax;
1949
1950         data->ret = iommu_prepare_identity_map(data->pdev,
1951                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1952         return data->ret;
1953
1954 }
1955
1956 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1957 {
1958         int nid;
1959         struct iommu_prepare_data data;
1960
1961         data.pdev = pdev;
1962         data.ret = 0;
1963
1964         for_each_online_node(nid) {
1965                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1966                 if (data.ret)
1967                         return data.ret;
1968         }
1969         return data.ret;
1970 }
1971
1972 static void __init iommu_prepare_gfx_mapping(void)
1973 {
1974         struct pci_dev *pdev = NULL;
1975         int ret;
1976
1977         for_each_pci_dev(pdev) {
1978                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1979                                 !IS_GFX_DEVICE(pdev))
1980                         continue;
1981                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1982                         pci_name(pdev));
1983                 ret = iommu_prepare_with_active_regions(pdev);
1984                 if (ret)
1985                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1986         }
1987 }
1988 #else /* !CONFIG_DMAR_GFX_WA */
1989 static inline void iommu_prepare_gfx_mapping(void)
1990 {
1991         return;
1992 }
1993 #endif
1994
1995 #ifdef CONFIG_DMAR_FLOPPY_WA
1996 static inline void iommu_prepare_isa(void)
1997 {
1998         struct pci_dev *pdev;
1999         int ret;
2000
2001         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2002         if (!pdev)
2003                 return;
2004
2005         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
2006         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2007
2008         if (ret)
2009                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
2010                         "floppy might not work\n");
2011
2012 }
2013 #else
2014 static inline void iommu_prepare_isa(void)
2015 {
2016         return;
2017 }
2018 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2019
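/*
 * One-time DMA-remapping bring-up: allocate per-IOMMU state and root
 * entries, choose queued vs. register-based invalidation, set up the RMRR
 * and work-around identity maps, then enable fault reporting and
 * translation on every unit.
 */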
2020 static int __init init_dmars(void)
2021 {
2022         struct dmar_drhd_unit *drhd;
2023         struct dmar_rmrr_unit *rmrr;
2024         struct pci_dev *pdev;
2025         struct intel_iommu *iommu;
2026         int i, ret, unit = 0;
2027
2028         /*
2029          * for each drhd
2030          *    allocate root
2031          *    initialize and program root entry to not present
2032          * endfor
2033          */
2034         for_each_drhd_unit(drhd) {
2035                 g_num_of_iommus++;
2036                 /*
2037                  * lock not needed as this is only incremented in the
2038                  * single-threaded kernel __init code path; all other
2039                  * accesses are read only
2040                  */
2041         }
2042
2043         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2044                         GFP_KERNEL);
2045         if (!g_iommus) {
2046                 printk(KERN_ERR "Allocating global iommu array failed\n");
2047                 ret = -ENOMEM;
2048                 goto error;
2049         }
2050
2051         deferred_flush = kzalloc(g_num_of_iommus *
2052                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2053         if (!deferred_flush) {
2054                 kfree(g_iommus);
2055                 ret = -ENOMEM;
2056                 goto error;
2057         }
2058
2059         for_each_drhd_unit(drhd) {
2060                 if (drhd->ignored)
2061                         continue;
2062
2063                 iommu = drhd->iommu;
2064                 g_iommus[iommu->seq_id] = iommu;
2065
2066                 ret = iommu_init_domains(iommu);
2067                 if (ret)
2068                         goto error;
2069
2070                 /*
2071                  * TBD:
2072                  * we could share the same root & context tables
2073                  * among all IOMMUs. Needs to be split out later.
2074                  */
2075                 ret = iommu_alloc_root_entry(iommu);
2076                 if (ret) {
2077                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2078                         goto error;
2079                 }
2080         }
2081
2082         for_each_drhd_unit(drhd) {
2083                 if (drhd->ignored)
2084                         continue;
2085
2086                 iommu = drhd->iommu;
2087                 if (dmar_enable_qi(iommu)) {
2088                         /*
2089                          * Queued Invalidate not enabled, use Register Based
2090                          * Invalidate
2091                          */
2092                         iommu->flush.flush_context = __iommu_flush_context;
2093                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2094                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2095                                "invalidation\n",
2096                                (unsigned long long)drhd->reg_base_addr);
2097                 } else {
2098                         iommu->flush.flush_context = qi_flush_context;
2099                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2100                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2101                                "invalidation\n",
2102                                (unsigned long long)drhd->reg_base_addr);
2103                 }
2104         }
2105
2106         /*
2107          * For each rmrr
2108          *   for each dev attached to rmrr
2109          *   do
2110          *     locate drhd for dev, alloc domain for dev
2111          *     allocate free domain
2112          *     allocate page table entries for rmrr
2113          *     if context not allocated for bus
2114          *           allocate and init context
2115          *           set present in root table for this bus
2116          *     init context with domain, translation etc
2117          *    endfor
2118          * endfor
2119          */
2120         for_each_rmrr_units(rmrr) {
2121                 for (i = 0; i < rmrr->devices_cnt; i++) {
2122                         pdev = rmrr->devices[i];
2123                         /* some BIOSes list non-existent devices in the DMAR table */
2124                         if (!pdev)
2125                                 continue;
2126                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2127                         if (ret)
2128                                 printk(KERN_ERR
2129                                  "IOMMU: mapping reserved region failed\n");
2130                 }
2131         }
2132
2133         iommu_prepare_gfx_mapping();
2134
2135         iommu_prepare_isa();
2136
2137         /*
2138          * for each drhd
2139          *   enable fault log
2140          *   global invalidate context cache
2141          *   global invalidate iotlb
2142          *   enable translation
2143          */
2144         for_each_drhd_unit(drhd) {
2145                 if (drhd->ignored)
2146                         continue;
2147                 iommu = drhd->iommu;
2148                 sprintf(iommu->name, "dmar%d", unit++);
2149
2150                 iommu_flush_write_buffer(iommu);
2151
2152                 ret = dmar_set_interrupt(iommu);
2153                 if (ret)
2154                         goto error;
2155
2156                 iommu_set_root_entry(iommu);
2157
2158                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2159                                            0);
2160                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2161                                          0);
2162                 iommu_disable_protect_mem_regions(iommu);
2163
2164                 ret = iommu_enable_translation(iommu);
2165                 if (ret)
2166                         goto error;
2167         }
2168
2169         return 0;
2170 error:
2171         for_each_drhd_unit(drhd) {
2172                 if (drhd->ignored)
2173                         continue;
2174                 iommu = drhd->iommu;
2175                 free_iommu(iommu);
2176         }
2177         kfree(g_iommus);
2178         return ret;
2179 }
2180
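/* Size of the mapping rounded up to full pages, including the offset of
 * host_addr within its first page. */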
2181 static inline u64 aligned_size(u64 host_addr, size_t size)
2182 {
2183         u64 addr;
2184         addr = (host_addr & (~PAGE_MASK)) + size;
2185         return PAGE_ALIGN(addr);
2186 }
2187
2188 struct iova *
2189 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2190 {
2191         struct iova *piova;
2192
2193         /* Make sure it's in range */
2194         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2195         if (!size || (IOVA_START_ADDR + size > end))
2196                 return NULL;
2197
2198         piova = alloc_iova(&domain->iovad,
2199                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2200         return piova;
2201 }
2202
2203 static struct iova *
2204 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2205                    size_t size, u64 dma_mask)
2206 {
2207         struct pci_dev *pdev = to_pci_dev(dev);
2208         struct iova *iova = NULL;
2209
2210         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2211                 iova = iommu_alloc_iova(domain, size, dma_mask);
2212         else {
2213                 /*
2214                  * First try to allocate an io virtual address in
2215                  * the DMA_32BIT_MASK range and, if that fails, try
2216                  * allocating from the higher range
2217                  */
2218                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2219                 if (!iova)
2220                         iova = iommu_alloc_iova(domain, size, dma_mask);
2221         }
2222
2223         if (!iova) {
2224                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2225                 return NULL;
2226         }
2227
2228         return iova;
2229 }
2230
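/*
 * Helper for the DMA map paths: return the device's domain, creating the
 * domain and its context mapping on first use.  Returns NULL on failure.
 */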
2231 static struct dmar_domain *
2232 get_valid_domain_for_dev(struct pci_dev *pdev)
2233 {
2234         struct dmar_domain *domain;
2235         int ret;
2236
2237         domain = get_domain_for_dev(pdev,
2238                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2239         if (!domain) {
2240                 printk(KERN_ERR
2241                         "Allocating domain for %s failed\n", pci_name(pdev));
2242                 return NULL;
2243         }
2244
2245         /* make sure context mapping is ok */
2246         if (unlikely(!domain_context_mapped(pdev))) {
2247                 ret = domain_context_mapping(domain, pdev);
2248                 if (ret) {
2249                         printk(KERN_ERR
2250                                 "Domain context map for %s failed\n",
2251                                 pci_name(pdev));
2252                         return NULL;
2253                 }
2254         }
2255
2256         return domain;
2257 }
2258
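/*
 * Core of the streaming DMA map path: look up (or lazily create) the
 * device's domain, allocate an iova covering the rounded-up size, install
 * the page-table entries and flush the IOTLB for the new mapping.  Returns
 * the bus address, or 0 on failure.
 */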
2259 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2260                                      size_t size, int dir, u64 dma_mask)
2261 {
2262         struct pci_dev *pdev = to_pci_dev(hwdev);
2263         struct dmar_domain *domain;
2264         phys_addr_t start_paddr;
2265         struct iova *iova;
2266         int prot = 0;
2267         int ret;
2268         struct intel_iommu *iommu;
2269
2270         BUG_ON(dir == DMA_NONE);
2271         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2272                 return paddr;
2273
2274         domain = get_valid_domain_for_dev(pdev);
2275         if (!domain)
2276                 return 0;
2277
2278         iommu = domain_get_iommu(domain);
2279         size = aligned_size((u64)paddr, size);
2280
2281         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2282         if (!iova)
2283                 goto error;
2284
2285         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2286
2287         /*
2288          * Check if DMAR supports zero-length reads on write-only
2289          * mappings.
2290          */
2291         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2292                         !cap_zlr(iommu->cap))
2293                 prot |= DMA_PTE_READ;
2294         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2295                 prot |= DMA_PTE_WRITE;
2296         /*
2297          * paddr to (paddr + size) might span a partial page, so map the
2298          * whole page.  Note: if two parts of one page are mapped separately,
2299          * two guest addresses may map to the same host paddr, but this
2300          * is not a big problem
2301          */
2302         ret = domain_page_mapping(domain, start_paddr,
2303                 ((u64)paddr) & PAGE_MASK, size, prot);
2304         if (ret)
2305                 goto error;
2306
2307         /* it's a non-present to present mapping */
2308         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2309                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2310         if (ret)
2311                 iommu_flush_write_buffer(iommu);
2312
2313         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2314
2315 error:
2316         if (iova)
2317                 __free_iova(&domain->iovad, iova);
2318         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2319                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2320         return 0;
2321 }
2322
2323 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2324                             size_t size, int dir)
2325 {
2326         return __intel_map_single(hwdev, paddr, size, dir,
2327                                   to_pci_dev(hwdev)->dma_mask);
2328 }
2329
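/*
 * Flush all deferred unmaps: issue a global IOTLB flush on each IOMMU with
 * pending entries, then free the queued iovas.  Called with
 * async_umap_flush_lock held, from the unmap timer or when the deferred
 * list reaches its high-water mark.
 */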
2330 static void flush_unmaps(void)
2331 {
2332         int i, j;
2333
2334         timer_on = 0;
2335
2336         /* just flush them all */
2337         for (i = 0; i < g_num_of_iommus; i++) {
2338                 struct intel_iommu *iommu = g_iommus[i];
2339                 if (!iommu)
2340                         continue;
2341
2342                 if (deferred_flush[i].next) {
2343                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2344                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2345                         for (j = 0; j < deferred_flush[i].next; j++) {
2346                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2347                                                 deferred_flush[i].iova[j]);
2348                         }
2349                         deferred_flush[i].next = 0;
2350                 }
2351         }
2352
2353         list_size = 0;
2354 }
2355
2356 static void flush_unmaps_timeout(unsigned long data)
2357 {
2358         unsigned long flags;
2359
2360         spin_lock_irqsave(&async_umap_flush_lock, flags);
2361         flush_unmaps();
2362         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2363 }
2364
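/*
 * Queue an iova for deferred freeing under its IOMMU's deferred_flush[]
 * entry and arm a 10ms timer, so the IOTLB flush and iova free are batched
 * instead of performed synchronously on every unmap.
 */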
2365 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2366 {
2367         unsigned long flags;
2368         int next, iommu_id;
2369         struct intel_iommu *iommu;
2370
2371         spin_lock_irqsave(&async_umap_flush_lock, flags);
2372         if (list_size == HIGH_WATER_MARK)
2373                 flush_unmaps();
2374
2375         iommu = domain_get_iommu(dom);
2376         iommu_id = iommu->seq_id;
2377
2378         next = deferred_flush[iommu_id].next;
2379         deferred_flush[iommu_id].domain[next] = dom;
2380         deferred_flush[iommu_id].iova[next] = iova;
2381         deferred_flush[iommu_id].next++;
2382
2383         if (!timer_on) {
2384                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2385                 timer_on = 1;
2386         }
2387         list_size++;
2388         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2389 }
2390
2391 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2392                         int dir)
2393 {
2394         struct pci_dev *pdev = to_pci_dev(dev);
2395         struct dmar_domain *domain;
2396         unsigned long start_addr;
2397         struct iova *iova;
2398         struct intel_iommu *iommu;
2399
2400         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2401                 return;
2402         domain = find_domain(pdev);
2403         BUG_ON(!domain);
2404
2405         iommu = domain_get_iommu(domain);
2406
2407         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2408         if (!iova)
2409                 return;
2410
2411         start_addr = iova->pfn_lo << PAGE_SHIFT;
2412         size = aligned_size((u64)dev_addr, size);
2413
2414         pr_debug("Device %s unmapping: %lx@%llx\n",
2415                 pci_name(pdev), size, (unsigned long long)start_addr);
2416
2417         /*  clear the whole page */
2418         dma_pte_clear_range(domain, start_addr, start_addr + size);
2419         /* free page tables */
2420         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2421         if (intel_iommu_strict) {
2422                 if (iommu_flush_iotlb_psi(iommu,
2423                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2424                         iommu_flush_write_buffer(iommu);
2425                 /* free iova */
2426                 __free_iova(&domain->iovad, iova);
2427         } else {
2428                 add_unmap(domain, iova);
2429                 /*
2430                  * queue up the release of the unmap to save roughly 1/6th of
2431                  * the cpu time otherwise used up by the iotlb flush operation...
2432                  */
2433         }
2434 }
2435
2436 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2437                            dma_addr_t *dma_handle, gfp_t flags)
2438 {
2439         void *vaddr;
2440         int order;
2441
2442         size = PAGE_ALIGN(size);
2443         order = get_order(size);
2444         flags &= ~(GFP_DMA | GFP_DMA32);
2445
2446         vaddr = (void *)__get_free_pages(flags, order);
2447         if (!vaddr)
2448                 return NULL;
2449         memset(vaddr, 0, size);
2450
2451         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2452                                          DMA_BIDIRECTIONAL,
2453                                          hwdev->coherent_dma_mask);
2454         if (*dma_handle)
2455                 return vaddr;
2456         free_pages((unsigned long)vaddr, order);
2457         return NULL;
2458 }
2459
2460 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2461                          dma_addr_t dma_handle)
2462 {
2463         int order;
2464
2465         size = PAGE_ALIGN(size);
2466         order = get_order(size);
2467
2468         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2469         free_pages((unsigned long)vaddr, order);
2470 }
2471
2472 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2473
2474 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2475                     int nelems, int dir)
2476 {
2477         int i;
2478         struct pci_dev *pdev = to_pci_dev(hwdev);
2479         struct dmar_domain *domain;
2480         unsigned long start_addr;
2481         struct iova *iova;
2482         size_t size = 0;
2483         void *addr;
2484         struct scatterlist *sg;
2485         struct intel_iommu *iommu;
2486
2487         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2488                 return;
2489
2490         domain = find_domain(pdev);
2491         BUG_ON(!domain);
2492
2493         iommu = domain_get_iommu(domain);
2494
2495         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2496         if (!iova)
2497                 return;
2498         for_each_sg(sglist, sg, nelems, i) {
2499                 addr = SG_ENT_VIRT_ADDRESS(sg);
2500                 size += aligned_size((u64)addr, sg->length);
2501         }
2502
2503         start_addr = iova->pfn_lo << PAGE_SHIFT;
2504
2505         /*  clear the whole page */
2506         dma_pte_clear_range(domain, start_addr, start_addr + size);
2507         /* free page tables */
2508         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2509
2510         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2511                         size >> VTD_PAGE_SHIFT, 0))
2512                 iommu_flush_write_buffer(iommu);
2513
2514         /* free iova */
2515         __free_iova(&domain->iovad, iova);
2516 }
2517
2518 static int intel_nontranslate_map_sg(struct device *hddev,
2519         struct scatterlist *sglist, int nelems, int dir)
2520 {
2521         int i;
2522         struct scatterlist *sg;
2523
2524         for_each_sg(sglist, sg, nelems, i) {
2525                 BUG_ON(!sg_page(sg));
2526                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2527                 sg->dma_length = sg->length;
2528         }
2529         return nelems;
2530 }
2531
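/*
 * Map a scatterlist: allocate one contiguous iova region covering the
 * rounded-up sizes of all entries, then map each entry into it in turn.
 * On failure the partial mapping is torn down and 0 is returned.
 */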
2532 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2533                  int dir)
2534 {
2535         void *addr;
2536         int i;
2537         struct pci_dev *pdev = to_pci_dev(hwdev);
2538         struct dmar_domain *domain;
2539         size_t size = 0;
2540         int prot = 0;
2541         size_t offset = 0;
2542         struct iova *iova = NULL;
2543         int ret;
2544         struct scatterlist *sg;
2545         unsigned long start_addr;
2546         struct intel_iommu *iommu;
2547
2548         BUG_ON(dir == DMA_NONE);
2549         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2550                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2551
2552         domain = get_valid_domain_for_dev(pdev);
2553         if (!domain)
2554                 return 0;
2555
2556         iommu = domain_get_iommu(domain);
2557
2558         for_each_sg(sglist, sg, nelems, i) {
2559                 addr = SG_ENT_VIRT_ADDRESS(sg);
2560                 addr = (void *)virt_to_phys(addr);
2561                 size += aligned_size((u64)addr, sg->length);
2562         }
2563
2564         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2565         if (!iova) {
2566                 sglist->dma_length = 0;
2567                 return 0;
2568         }
2569
2570         /*
2571          * Check if DMAR supports zero-length reads on write-only
2572          * mappings.
2573          */
2574         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2575                         !cap_zlr(iommu->cap))
2576                 prot |= DMA_PTE_READ;
2577         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2578                 prot |= DMA_PTE_WRITE;
2579
2580         start_addr = iova->pfn_lo << PAGE_SHIFT;
2581         offset = 0;
2582         for_each_sg(sglist, sg, nelems, i) {
2583                 addr = SG_ENT_VIRT_ADDRESS(sg);
2584                 addr = (void *)virt_to_phys(addr);
2585                 size = aligned_size((u64)addr, sg->length);
2586                 ret = domain_page_mapping(domain, start_addr + offset,
2587                         ((u64)addr) & PAGE_MASK,
2588                         size, prot);
2589                 if (ret) {
2590                         /*  clear the page */
2591                         dma_pte_clear_range(domain, start_addr,
2592                                   start_addr + offset);
2593                         /* free page tables */
2594                         dma_pte_free_pagetable(domain, start_addr,
2595                                   start_addr + offset);
2596                         /* free iova */
2597                         __free_iova(&domain->iovad, iova);
2598                         return 0;
2599                 }
2600                 sg->dma_address = start_addr + offset +
2601                                 ((u64)addr & (~PAGE_MASK));
2602                 sg->dma_length = sg->length;
2603                 offset += size;
2604         }
2605
2606         /* it's a non-present to present mapping */
2607         if (iommu_flush_iotlb_psi(iommu, domain->id,
2608                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2609                 iommu_flush_write_buffer(iommu);
2610         return nelems;
2611 }
2612
2613 static struct dma_mapping_ops intel_dma_ops = {
2614         .alloc_coherent = intel_alloc_coherent,
2615         .free_coherent = intel_free_coherent,
2616         .map_single = intel_map_single,
2617         .unmap_single = intel_unmap_single,
2618         .map_sg = intel_map_sg,
2619         .unmap_sg = intel_unmap_sg,
2620 };
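/*
 * These ops back the generic DMA API once dma_ops is pointed at them in
 * intel_iommu_init().  A driver doing, roughly,
 *
 *      dma_addr_t h = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *      ...
 *      dma_unmap_single(&pdev->dev, h, len, DMA_TO_DEVICE);
 *
 * ends up in intel_map_single()/intel_unmap_single() above (a sketch only;
 * the exact dispatch goes through the arch dma_mapping_ops glue).
 */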
2621
2622 static inline int iommu_domain_cache_init(void)
2623 {
2624         int ret = 0;
2625
2626         iommu_domain_cache = kmem_cache_create("iommu_domain",
2627                                          sizeof(struct dmar_domain),
2628                                          0,
2629                                          SLAB_HWCACHE_ALIGN,
2631                                          NULL);
2632         if (!iommu_domain_cache) {
2633                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2634                 ret = -ENOMEM;
2635         }
2636
2637         return ret;
2638 }
2639
2640 static inline int iommu_devinfo_cache_init(void)
2641 {
2642         int ret = 0;
2643
2644         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2645                                          sizeof(struct device_domain_info),
2646                                          0,
2647                                          SLAB_HWCACHE_ALIGN,
2648                                          NULL);
2649         if (!iommu_devinfo_cache) {
2650                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2651                 ret = -ENOMEM;
2652         }
2653
2654         return ret;
2655 }
2656
2657 static inline int iommu_iova_cache_init(void)
2658 {
2659         int ret = 0;
2660
2661         iommu_iova_cache = kmem_cache_create("iommu_iova",
2662                                          sizeof(struct iova),
2663                                          0,
2664                                          SLAB_HWCACHE_ALIGN,
2665                                          NULL);
2666         if (!iommu_iova_cache) {
2667                 printk(KERN_ERR "Couldn't create iova cache\n");
2668                 ret = -ENOMEM;
2669         }
2670
2671         return ret;
2672 }
2673
2674 static int __init iommu_init_mempool(void)
2675 {
2676         int ret;
2677         ret = iommu_iova_cache_init();
2678         if (ret)
2679                 return ret;
2680
2681         ret = iommu_domain_cache_init();
2682         if (ret)
2683                 goto domain_error;
2684
2685         ret = iommu_devinfo_cache_init();
2686         if (!ret)
2687                 return ret;
2688
2689         kmem_cache_destroy(iommu_domain_cache);
2690 domain_error:
2691         kmem_cache_destroy(iommu_iova_cache);
2692
2693         return -ENOMEM;
2694 }
2695
2696 static void __init iommu_exit_mempool(void)
2697 {
2698         kmem_cache_destroy(iommu_devinfo_cache);
2699         kmem_cache_destroy(iommu_domain_cache);
2700         kmem_cache_destroy(iommu_iova_cache);
2701
2702 }
2703
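/*
 * Mark DRHD units that can be ignored: units whose device scope lists no
 * present PCI devices, and (unless dmar_map_gfx is set) units that cover
 * only graphics devices.  Devices under a gfx-only unit are tagged with
 * DUMMY_DEVICE_DOMAIN_INFO so the DMA ops pass them through untranslated.
 */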
2704 static void __init init_no_remapping_devices(void)
2705 {
2706         struct dmar_drhd_unit *drhd;
2707
2708         for_each_drhd_unit(drhd) {
2709                 if (!drhd->include_all) {
2710                         int i;
2711                         for (i = 0; i < drhd->devices_cnt; i++)
2712                                 if (drhd->devices[i] != NULL)
2713                                         break;
2714                         /* ignore DMAR unit if no pci devices exist */
2715                         if (i == drhd->devices_cnt)
2716                                 drhd->ignored = 1;
2717                 }
2718         }
2719
2720         if (dmar_map_gfx)
2721                 return;
2722
2723         for_each_drhd_unit(drhd) {
2724                 int i;
2725                 if (drhd->ignored || drhd->include_all)
2726                         continue;
2727
2728                 for (i = 0; i < drhd->devices_cnt; i++)
2729                         if (drhd->devices[i] &&
2730                                 !IS_GFX_DEVICE(drhd->devices[i]))
2731                                 break;
2732
2733                 if (i < drhd->devices_cnt)
2734                         continue;
2735
2736                 /* bypass IOMMU if it is just for gfx devices */
2737                 drhd->ignored = 1;
2738                 for (i = 0; i < drhd->devices_cnt; i++) {
2739                         if (!drhd->devices[i])
2740                                 continue;
2741                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2742                 }
2743         }
2744 }
2745
2746 int __init intel_iommu_init(void)
2747 {
2748         int ret = 0;
2749
2750         if (dmar_table_init())
2751                 return  -ENODEV;
2752
2753         if (dmar_dev_scope_init())
2754                 return  -ENODEV;
2755
2756         /*
2757          * Check the need for DMA-remapping initialization now.
2758          * Above initialization will also be used by Interrupt-remapping.
2759          */
2760         if (no_iommu || swiotlb || dmar_disabled)
2761                 return -ENODEV;
2762
2763         iommu_init_mempool();
2764         dmar_init_reserved_ranges();
2765
2766         init_no_remapping_devices();
2767
2768         ret = init_dmars();
2769         if (ret) {
2770                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2771                 put_iova_domain(&reserved_iova_list);
2772                 iommu_exit_mempool();
2773                 return ret;
2774         }
2775         printk(KERN_INFO
2776         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2777
2778         init_timer(&unmap_timer);
2779         force_iommu = 1;
2780         dma_ops = &intel_dma_ops;
2781
2782         register_iommu(&intel_iommu_ops);
2783
2784         return 0;
2785 }
2786
2787 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2788                                   struct pci_dev *pdev)
2789 {
2790         struct device_domain_info *info;
2791         unsigned long flags;
2792
2793         info = alloc_devinfo_mem();
2794         if (!info)
2795                 return -ENOMEM;
2796
2797         info->bus = pdev->bus->number;
2798         info->devfn = pdev->devfn;
2799         info->dev = pdev;
2800         info->domain = domain;
2801
2802         spin_lock_irqsave(&device_domain_lock, flags);
2803         list_add(&info->link, &domain->devices);
2804         list_add(&info->global, &device_domain_list);
2805         pdev->dev.archdata.iommu = info;
2806         spin_unlock_irqrestore(&device_domain_lock, flags);
2807
2808         return 0;
2809 }
2810
2811 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2812                                            struct pci_dev *pdev)
2813 {
2814         struct pci_dev *tmp, *parent;
2815
2816         if (!iommu || !pdev)
2817                 return;
2818
2819         /* dependent device detach */
2820         tmp = pci_find_upstream_pcie_bridge(pdev);
2821         /* Secondary interface's bus number and devfn 0 */
2822         if (tmp) {
2823                 parent = pdev->bus->self;
2824                 while (parent != tmp) {
2825                         iommu_detach_dev(iommu, parent->bus->number,
2826                                 parent->devfn);
2827                         parent = parent->bus->self;
2828                 }
2829                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2830                         iommu_detach_dev(iommu,
2831                                 tmp->subordinate->number, 0);
2832                 else /* this is a legacy PCI bridge */
2833                         iommu_detach_dev(iommu,
2834                                 tmp->bus->number, tmp->devfn);
2835         }
2836 }
2837
2838 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2839                                           struct pci_dev *pdev)
2840 {
2841         struct device_domain_info *info;
2842         struct intel_iommu *iommu;
2843         unsigned long flags;
2844         int found = 0;
2845         struct list_head *entry, *tmp;
2846
2847         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2848         if (!iommu)
2849                 return;
2850
2851         spin_lock_irqsave(&device_domain_lock, flags);
2852         list_for_each_safe(entry, tmp, &domain->devices) {
2853                 info = list_entry(entry, struct device_domain_info, link);
2854                 if (info->bus == pdev->bus->number &&
2855                     info->devfn == pdev->devfn) {
2856                         list_del(&info->link);
2857                         list_del(&info->global);
2858                         if (info->dev)
2859                                 info->dev->dev.archdata.iommu = NULL;
2860                         spin_unlock_irqrestore(&device_domain_lock, flags);
2861
2862                         iommu_detach_dev(iommu, info->bus, info->devfn);
2863                         iommu_detach_dependent_devices(iommu, pdev);
2864                         free_devinfo_mem(info);
2865
2866                         spin_lock_irqsave(&device_domain_lock, flags);
2867
2868                         if (found)
2869                                 break;
2870                         else
2871                                 continue;
2872                 }
2873
2874                 /* if there are no other devices under the same iommu
2875                  * owned by this domain, clear this iommu in iommu_bmp,
2876                  * update iommu count and coherency
2877                  */
2878                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2879                         found = 1;
2880         }
2881
2882         if (found == 0) {
2883                 unsigned long tmp_flags;
2884                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2885                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2886                 domain->iommu_count--;
2887                 domain_update_iommu_cap(domain);
2888                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2889         }
2890
2891         spin_unlock_irqrestore(&device_domain_lock, flags);
2892 }
2893
2894 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2895 {
2896         struct device_domain_info *info;
2897         struct intel_iommu *iommu;
2898         unsigned long flags1, flags2;
2899
2900         spin_lock_irqsave(&device_domain_lock, flags1);
2901         while (!list_empty(&domain->devices)) {
2902                 info = list_entry(domain->devices.next,
2903                         struct device_domain_info, link);
2904                 list_del(&info->link);
2905                 list_del(&info->global);
2906                 if (info->dev)
2907                         info->dev->dev.archdata.iommu = NULL;
2908
2909                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2910
2911                 iommu = device_to_iommu(info->bus, info->devfn);
2912                 iommu_detach_dev(iommu, info->bus, info->devfn);
2913                 iommu_detach_dependent_devices(iommu, info->dev);
2914
2915                 /* clear this iommu in iommu_bmp, update iommu count
2916                  * and capabilities
2917                  */
2918                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2919                 if (test_and_clear_bit(iommu->seq_id,
2920                                        &domain->iommu_bmp)) {
2921                         domain->iommu_count--;
2922                         domain_update_iommu_cap(domain);
2923                 }
2924                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2925
2926                 free_devinfo_mem(info);
2927                 spin_lock_irqsave(&device_domain_lock, flags1);
2928         }
2929         spin_unlock_irqrestore(&device_domain_lock, flags1);
2930 }
2931
2932 /* domain id for a virtual machine; it is never written into a context entry */
2933 static unsigned long vm_domid;
2934
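/*
 * The usable address width of a VM domain is limited by the weakest IOMMU
 * it spans: return the smallest agaw among the domain's own and those of
 * the IOMMUs set in its iommu_bmp.
 */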
2935 static int vm_domain_min_agaw(struct dmar_domain *domain)
2936 {
2937         int i;
2938         int min_agaw = domain->agaw;
2939
2940         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2941         for (; i < g_num_of_iommus; ) {
2942                 if (min_agaw > g_iommus[i]->agaw)
2943                         min_agaw = g_iommus[i]->agaw;
2944
2945                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2946         }
2947
2948         return min_agaw;
2949 }
2950
2951 static struct dmar_domain *iommu_alloc_vm_domain(void)
2952 {
2953         struct dmar_domain *domain;
2954
2955         domain = alloc_domain_mem();
2956         if (!domain)
2957                 return NULL;
2958
2959         domain->id = vm_domid++;
2960         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2961         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2962
2963         return domain;
2964 }
2965
2966 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2967 {
2968         int adjust_width;
2969
2970         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2971         spin_lock_init(&domain->mapping_lock);
2972         spin_lock_init(&domain->iommu_lock);
2973
2974         domain_reserve_special_ranges(domain);
2975
2976         /* calculate AGAW */
2977         domain->gaw = guest_width;
2978         adjust_width = guestwidth_to_adjustwidth(guest_width);
2979         domain->agaw = width_to_agaw(adjust_width);
2980
2981         INIT_LIST_HEAD(&domain->devices);
2982
2983         domain->iommu_count = 0;
2984         domain->iommu_coherency = 0;
2985         domain->max_addr = 0;
2986
2987         /* always allocate the top pgd */
2988         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2989         if (!domain->pgd)
2990                 return -ENOMEM;
2991         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2992         return 0;
2993 }
2994
2995 static void iommu_free_vm_domain(struct dmar_domain *domain)
2996 {
2997         unsigned long flags;
2998         struct dmar_drhd_unit *drhd;
2999         struct intel_iommu *iommu;
3000         unsigned long i;
3001         unsigned long ndomains;
3002
3003         for_each_drhd_unit(drhd) {
3004                 if (drhd->ignored)
3005                         continue;
3006                 iommu = drhd->iommu;
3007
3008                 ndomains = cap_ndoms(iommu->cap);
3009                 i = find_first_bit(iommu->domain_ids, ndomains);
3010                 for (; i < ndomains; ) {
3011                         if (iommu->domains[i] == domain) {
3012                                 spin_lock_irqsave(&iommu->lock, flags);
3013                                 clear_bit(i, iommu->domain_ids);
3014                                 iommu->domains[i] = NULL;
3015                                 spin_unlock_irqrestore(&iommu->lock, flags);
3016                                 break;
3017                         }
3018                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3019                 }
3020         }
3021 }
3022
3023 static void vm_domain_exit(struct dmar_domain *domain)
3024 {
3025         u64 end;
3026
3027         /* Domain 0 is reserved, so don't process it */
3028         if (!domain)
3029                 return;
3030
3031         vm_domain_remove_all_dev_info(domain);
3032         /* destroy iovas */
3033         put_iova_domain(&domain->iovad);
3034         end = DOMAIN_MAX_ADDR(domain->gaw);
3035         end = end & (~VTD_PAGE_MASK);
3036
3037         /* clear ptes */
3038         dma_pte_clear_range(domain, 0, end);
3039
3040         /* free page tables */
3041         dma_pte_free_pagetable(domain, 0, end);
3042
3043         iommu_free_vm_domain(domain);
3044         free_domain_mem(domain);
3045 }
3046
3047 static int intel_iommu_domain_init(struct iommu_domain *domain)
3048 {
3049         struct dmar_domain *dmar_domain;
3050
3051         dmar_domain = iommu_alloc_vm_domain();
3052         if (!dmar_domain) {
3053                 printk(KERN_ERR
3054                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3055                 return -ENOMEM;
3056         }
3057         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3058                 printk(KERN_ERR
3059                         "intel_iommu_domain_init() failed\n");
3060                 vm_domain_exit(dmar_domain);
3061                 return -ENOMEM;
3062         }
3063         domain->priv = dmar_domain;
3064
3065         return 0;
3066 }
3067
3068 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3069 {
3070         struct dmar_domain *dmar_domain = domain->priv;
3071
3072         domain->priv = NULL;
3073         vm_domain_exit(dmar_domain);
3074 }
3075
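/*
 * Generic IOMMU API attach callback: detach the device from any domain it
 * is already context-mapped into, check that this IOMMU's agaw covers the
 * domain's current max_addr, then context-map the device into the VM
 * domain and record it in the domain's device list.
 */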
3076 static int intel_iommu_attach_device(struct iommu_domain *domain,
3077                                      struct device *dev)
3078 {
3079         struct dmar_domain *dmar_domain = domain->priv;
3080         struct pci_dev *pdev = to_pci_dev(dev);
3081         struct intel_iommu *iommu;
3082         int addr_width;
3083         u64 end;
3084         int ret;
3085
3086         /* normally pdev is not mapped */
3087         if (unlikely(domain_context_mapped(pdev))) {
3088                 struct dmar_domain *old_domain;
3089
3090                 old_domain = find_domain(pdev);
3091                 if (old_domain) {
3092                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3093                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3094                         else
3095                                 domain_remove_dev_info(old_domain);
3096                 }
3097         }
3098
3099         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3100         if (!iommu)
3101                 return -ENODEV;
3102
3103         /* check if this iommu agaw is sufficient for max mapped address */
3104         addr_width = agaw_to_width(iommu->agaw);
3105         end = DOMAIN_MAX_ADDR(addr_width);
3106         end = end & VTD_PAGE_MASK;
3107         if (end < dmar_domain->max_addr) {
3108                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3109                        "sufficient for the mapped address (%llx)\n",
3110                        __func__, iommu->agaw, dmar_domain->max_addr);
3111                 return -EFAULT;
3112         }
3113
3114         ret = domain_context_mapping(dmar_domain, pdev);
3115         if (ret)
3116                 return ret;
3117
3118         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3119         return ret;
3120 }
3121
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	vm_domain_remove_one_dev_info(dmar_domain, pdev);
}

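/*
 * intel_iommu_map_range - map a range of the domain's IOVA space to
 * physical memory
 *
 * IOMMU_READ/IOMMU_WRITE are translated into DMA_PTE_READ/DMA_PTE_WRITE,
 * and IOMMU_CACHE into DMA_PTE_SNP when the domain reports snooping
 * support.  If the mapping raises the domain's highest mapped address,
 * the minimum agaw among the attached IOMMUs must still cover it,
 * otherwise the mapping is refused with -EFAULT.
 */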
static int intel_iommu_map_range(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t hpa,
				 size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int addr_width;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
	if (dmar_domain->max_addr < max_addr) {
		int min_agaw;
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		min_agaw = vm_domain_min_agaw(dmar_domain);
		addr_width = agaw_to_width(min_agaw);
		end = DOMAIN_MAX_ADDR(addr_width);
		end = end & VTD_PAGE_MASK;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu agaw (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, min_agaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}

	ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
	return ret;
}

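/*
 * intel_iommu_unmap_range - remove the mappings covering [iova, iova + size)
 *
 * The start address is rounded down and the size rounded up to VT-d
 * page granularity before the PTEs are cleared.  If the range ended at
 * the domain's highest mapped address, max_addr shrinks accordingly.
 */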
static void intel_iommu_unmap_range(struct iommu_domain *domain,
				    unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	dma_addr_t base;

	/* The address might not be aligned */
	base = iova & VTD_PAGE_MASK;
	size = VTD_PAGE_ALIGN(size);
	dma_pte_clear_range(dmar_domain, base, base + size);

	if (dmar_domain->max_addr == base + size)
		dmar_domain->max_addr = base;
}

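/*
 * intel_iommu_iova_to_phys - translate a domain IOVA back to a
 * physical address.  Returns the (page-aligned) address held in the
 * PTE, or 0 if nothing is mapped at that IOVA.
 */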
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = addr_to_dma_pte(dmar_domain, iova);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

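/*
 * intel_iommu_domain_has_cap - report domain capabilities to the
 * generic IOMMU layer.  The only capability recognised here is
 * IOMMU_CAP_CACHE_COHERENCY, which is passed through from the
 * domain's iommu_snooping flag.
 */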
static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;

	return 0;
}

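/*
 * The callback table handed to the generic IOMMU layer; it is
 * registered with that layer elsewhere in this driver.  As an
 * illustration only (assuming the generic entry points of this era:
 * iommu_domain_alloc(), iommu_attach_device() and iommu_map_range()),
 * a consumer such as KVM device assignment would reach the
 * intel_iommu_* handlers above roughly like this:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *
 *	if (dom && !iommu_attach_device(dom, &pdev->dev))
 *		iommu_map_range(dom, iova, hpa, size,
 *				IOMMU_READ | IOMMU_WRITE);
 *
 * Each generic call is forwarded through the matching function
 * pointer below.
 */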
static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map_range,
	.unmap		= intel_iommu_unmap_range,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};

static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

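/* 0x2a40 is the Mobile 4 Series Chipset memory controller hub */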
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);