drivers/iommu/intel-svm.c

   1 /*
   2  * Copyright © 2015 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>
  14  */
  15
  16 #include <linux/intel-iommu.h>
  17 #include <linux/mmu_notifier.h>
  18 #include <linux/sched.h>
  19 #include <linux/slab.h>
  20 #include <linux/intel-svm.h>
  21 #include <linux/rculist.h>
  22 #include <linux/pci.h>
  23 #include <linux/pci-ats.h>
  24 #include <linux/dmar.h>
  25 #include <linux/interrupt.h>
  26
  27 static irqreturn_t prq_event_thread(int irq, void *d);
  28
  29 struct pasid_entry {
  30         u64 val;
  31 };
  32
  33 struct pasid_state_entry {
  34         u64 val;
  35 };
  36
  37 int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
  38 {
  39         struct page *pages;
  40         int order;
  41
  42         order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT;
  43         if (order < 0)
  44                 order = 0;
  45
  46         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
  47         if (!pages) {
  48                 pr_warn("IOMMU: %s: Failed to allocate PASID table\n",
  49                         iommu->name);
  50                 return -ENOMEM;
  51         }
  52         iommu->pasid_table = page_address(pages);
  53         pr_info("%s: Allocated order %d PASID table.\n", iommu->name, order);
  54
  55         if (ecap_dis(iommu->ecap)) {
  56                 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
  57                 if (pages)
  58                         iommu->pasid_state_table = page_address(pages);
  59                 else
  60                         pr_warn("IOMMU: %s: Failed to allocate PASID state table\n",
  61                                 iommu->name);
  62         }
  63
  64         idr_init(&iommu->pasid_idr);
  65
  66         return 0;
  67 }
  68
  69 int intel_svm_free_pasid_tables(struct intel_iommu *iommu)
  70 {
  71         int order;
  72
  73         order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT;
  74         if (order < 0)
  75                 order = 0;
  76
  77         if (iommu->pasid_table) {
  78                 free_pages((unsigned long)iommu->pasid_table, order);
  79                 iommu->pasid_table = NULL;
  80         }
  81         if (iommu->pasid_state_table) {
  82                 free_pages((unsigned long)iommu->pasid_state_table, order);
  83                 iommu->pasid_state_table = NULL;
  84         }
  85         idr_destroy(&iommu->pasid_idr);
  86         return 0;
  87 }
  88
  89 #define PRQ_ORDER 0
  90
  91 int intel_svm_enable_prq(struct intel_iommu *iommu)
  92 {
  93         struct page *pages;
  94         int irq, ret;
  95
  96         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
  97         if (!pages) {
  98                 pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
  99                         iommu->name);
 100                 return -ENOMEM;
 101         }
 102         iommu->prq = page_address(pages);
 103
 104         irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
 105         if (irq <= 0) {
 106                 pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
 107                        iommu->name);
 108                 ret = -EINVAL;
 109         err:
 110                 free_pages((unsigned long)iommu->prq, PRQ_ORDER);
 111                 iommu->prq = NULL;
 112                 return ret;
 113         }
 114         iommu->pr_irq = irq;
 115
 116         snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
 117
 118         ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
 119                                    iommu->prq_name, iommu);
 120         if (ret) {
 121                 pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
 122                        iommu->name);
 123                 dmar_free_hwirq(irq);
 124                 goto err;
 125         }
 126         dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
 127         dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
 128         dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
 129
 130         return 0;
 131 }
 132
 133 int intel_svm_finish_prq(struct intel_iommu *iommu)
 134 {
 135         dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
 136         dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
 137         dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
 138
 139         free_irq(iommu->pr_irq, iommu);
 140         dmar_free_hwirq(iommu->pr_irq);
 141         iommu->pr_irq = 0;
 142
 143         free_pages((unsigned long)iommu->prq, PRQ_ORDER);
 144         iommu->prq = NULL;
 145
 146         return 0;
 147 }
 148
 149 static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
 150                                        unsigned long address, int pages, int ih)
 151 {
 152         struct qi_desc desc;
 153         int mask = ilog2(__roundup_pow_of_two(pages));
 154
 155         if (pages == -1 || !cap_pgsel_inv(svm->iommu->cap) ||
 156             mask > cap_max_amask_val(svm->iommu->cap)) {
 157                 desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) |
 158                         QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
 159                 desc.high = 0;
 160         } else {
 161                 desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) |
 162                         QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | QI_EIOTLB_TYPE;
 163                 desc.high = QI_EIOTLB_ADDR(address) | QI_EIOTLB_GL(1) |
 164                         QI_EIOTLB_IH(ih) | QI_EIOTLB_AM(mask);
 165         }
 166
 167         qi_submit_sync(&desc, svm->iommu);
 168
 169         if (sdev->dev_iotlb) {
 170                 desc.low = QI_DEV_EIOTLB_PASID(svm->pasid) | QI_DEV_EIOTLB_SID(sdev->sid) |
 171                         QI_DEV_EIOTLB_QDEP(sdev->qdep) | QI_DEIOTLB_TYPE;
 172                 if (mask) {
 173                         unsigned long adr, delta;
 174
 175                         /* Least significant zero bits in the address indicate the
 176                          * range of the request. So mask them out according to the
 177                          * size. */
 178                         adr = address & ((1<<(VTD_PAGE_SHIFT + mask)) - 1);
 179
 180                         /* Now ensure that we round down further if the original
 181                          * request was not aligned w.r.t. its size */
 182                         delta = address - adr;
 183                         if (delta + (pages << VTD_PAGE_SHIFT) >= (1 << (VTD_PAGE_SHIFT + mask)))
 184                                 adr &= ~(1 << (VTD_PAGE_SHIFT + mask));
 185                         desc.high = QI_DEV_EIOTLB_ADDR(adr) | QI_DEV_EIOTLB_SIZE;
 186                 } else {
 187                         desc.high = QI_DEV_EIOTLB_ADDR(address);
 188                 }
 189                 qi_submit_sync(&desc, svm->iommu);
 190         }
 191 }
 192
 193 static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
 194                                   int pages, int ih)
 195 {
 196         struct intel_svm_dev *sdev;
 197
 198         /* Try deferred invalidate if available */
 199         if (svm->iommu->pasid_state_table &&
 200             !cmpxchg64(&svm->iommu->pasid_state_table[svm->pasid].val, 0, 1ULL << 63))
 201                 return;
 202
 203         rcu_read_lock();
 204         list_for_each_entry_rcu(sdev, &svm->devs, list)
 205                 intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
 206         rcu_read_unlock();
 207 }
 208
 209 static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
 210                              unsigned long address, pte_t pte)
 211 {
 212         struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 213
 214         intel_flush_svm_range(svm, address, 1, 1);
 215 }
 216
 217 static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
 218                                   unsigned long address)
 219 {
 220         struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 221
 222         intel_flush_svm_range(svm, address, 1, 1);
 223 }
 224
 225 /* Pages have been freed at this point */
 226 static void intel_invalidate_range(struct mmu_notifier *mn,
 227                                    struct mm_struct *mm,
 228                                    unsigned long start, unsigned long end)
 229 {
 230         struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 231
 232         intel_flush_svm_range(svm, start,
 233                               (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT , 0);
 234 }
 235
 236
 237 static void intel_flush_pasid_dev(struct intel_svm *svm, struct intel_svm_dev *sdev)
 238 {
 239         struct qi_desc desc;
 240
 241         desc.high = 0;
 242         desc.low = QI_PC_TYPE | QI_PC_DID(sdev->did) | QI_PC_PASID_SEL | QI_PC_PASID(svm->pasid);
 243
 244         qi_submit_sync(&desc, svm->iommu);
 245 }
 246
 247 static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 248 {
 249         struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 250
 251         svm->iommu->pasid_table[svm->pasid].val = 0;
 252
 253         /* There's no need to do any flush because we can't get here if there
 254          * are any devices left anyway. */
 255         WARN_ON(!list_empty(&svm->devs));
 256 }
 257
 258 static const struct mmu_notifier_ops intel_mmuops = {
 259         .release = intel_mm_release,
 260         .change_pte = intel_change_pte,
 261         .invalidate_page = intel_invalidate_page,
 262         .invalidate_range = intel_invalidate_range,
 263 };
 264
 265 static DEFINE_MUTEX(pasid_mutex);
 266
 267 int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
 268 {
 269         struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
 270         struct intel_svm_dev *sdev;
 271         struct intel_svm *svm = NULL;
 272         struct mm_struct *mm = NULL;
 273         int pasid_max;
 274         int ret;
 275
 276         if (WARN_ON(!iommu))
 277                 return -EINVAL;
 278
 279         if (dev_is_pci(dev)) {
 280                 pasid_max = pci_max_pasids(to_pci_dev(dev));
 281                 if (pasid_max < 0)
 282                         return -EINVAL;
 283         } else
 284                 pasid_max = 1 << 20;
 285
 286         if ((flags & SVM_FLAG_SUPERVISOR_MODE)) {
 287                 if (!ecap_srs(iommu->ecap))
 288                         return -EINVAL;
 289         } else if (pasid) {
 290                 mm = get_task_mm(current);
 291                 BUG_ON(!mm);
 292         }
 293
 294         mutex_lock(&pasid_mutex);
 295         if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
 296                 int i;
 297
 298                 idr_for_each_entry(&iommu->pasid_idr, svm, i) {
 299                         if (svm->mm != mm ||
 300                             (svm->flags & SVM_FLAG_PRIVATE_PASID))
 301                                 continue;
 302
 303                         if (svm->pasid >= pasid_max) {
 304                                 dev_warn(dev,
 305                                          "Limited PASID width. Cannot use existing PASID %d\n",
 306                                          svm->pasid);
 307                                 ret = -ENOSPC;
 308                                 goto out;
 309                         }
 310
 311                         list_for_each_entry(sdev, &svm->devs, list) {
 312                                 if (dev == sdev->dev) {
 313                                         if (sdev->ops != ops) {
 314                                                 ret = -EBUSY;
 315                                                 goto out;
 316                                         }
 317                                         sdev->users++;
 318                                         goto success;
 319                                 }
 320                         }
 321
 322                         break;
 323                 }
 324         }
 325
 326         sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 327         if (!sdev) {
 328                 ret = -ENOMEM;
 329                 goto out;
 330         }
 331         sdev->dev = dev;
 332
 333         ret = intel_iommu_enable_pasid(iommu, sdev);
 334         if (ret || !pasid) {
 335                 /* If they don't actually want to assign a PASID, this is
 336                  * just an enabling check/preparation. */
 337                 kfree(sdev);
 338                 goto out;
 339         }
 340         /* Finish the setup now we know we're keeping it */
 341         sdev->users = 1;
 342         sdev->ops = ops;
 343         init_rcu_head(&sdev->rcu);
 344
 345         if (!svm) {
 346                 svm = kzalloc(sizeof(*svm), GFP_KERNEL);
 347                 if (!svm) {
 348                         ret = -ENOMEM;
 349                         kfree(sdev);
 350                         goto out;
 351                 }
 352                 svm->iommu = iommu;
 353
 354                 if (pasid_max > 2 << ecap_pss(iommu->ecap))
 355                         pasid_max = 2 << ecap_pss(iommu->ecap);
 356
 357                 ret = idr_alloc(&iommu->pasid_idr, svm, 0, pasid_max - 1,
 358                                 GFP_KERNEL);
 359                 if (ret < 0) {
 360                         kfree(svm);
 361                         goto out;
 362                 }
 363                 svm->pasid = ret;
 364                 svm->notifier.ops = &intel_mmuops;
 365                 svm->mm = mm;
 366                 svm->flags = flags;
 367                 INIT_LIST_HEAD_RCU(&svm->devs);
 368                 ret = -ENOMEM;
 369                 if (mm) {
 370                         ret = mmu_notifier_register(&svm->notifier, mm);
 371                         if (ret) {
 372                                 idr_remove(&svm->iommu->pasid_idr, svm->pasid);
 373                                 kfree(svm);
 374                                 kfree(sdev);
 375                                 goto out;
 376                         }
 377                         iommu->pasid_table[svm->pasid].val = (u64)__pa(mm->pgd) | 1;
 378                         mm = NULL;
 379                 } else
 380                         iommu->pasid_table[svm->pasid].val = (u64)__pa(init_mm.pgd) | 1 | (1ULL << 11);
 381                 wmb();
 382         }
 383         list_add_rcu(&sdev->list, &svm->devs);
 384
 385  success:
 386         *pasid = svm->pasid;
 387         ret = 0;
 388  out:
 389         mutex_unlock(&pasid_mutex);
 390         if (mm)
 391                 mmput(mm);
 392         return ret;
 393 }
 394 EXPORT_SYMBOL_GPL(intel_svm_bind_mm);
 395
 396 int intel_svm_unbind_mm(struct device *dev, int pasid)
 397 {
 398         struct intel_svm_dev *sdev;
 399         struct intel_iommu *iommu;
 400         struct intel_svm *svm;
 401         int ret = -EINVAL;
 402
 403         mutex_lock(&pasid_mutex);
 404         iommu = intel_svm_device_to_iommu(dev);
 405         if (!iommu || !iommu->pasid_table)
 406                 goto out;
 407
 408         svm = idr_find(&iommu->pasid_idr, pasid);
 409         if (!svm)
 410                 goto out;
 411
 412         list_for_each_entry(sdev, &svm->devs, list) {
 413                 if (dev == sdev->dev) {
 414                         ret = 0;
 415                         sdev->users--;
 416                         if (!sdev->users) {
 417                                 list_del_rcu(&sdev->list);
 418                                 /* Flush the PASID cache and IOTLB for this device.
 419                                  * Note that we do depend on the hardware *not* using
 420                                  * the PASID any more. Just as we depend on other
 421                                  * devices never using PASIDs that they have no right
 422                                  * to use. We have a *shared* PASID table, because it's
 423                                  * large and has to be physically contiguous. So it's
 424                                  * hard to be as defensive as we might like. */
 425                                 intel_flush_pasid_dev(svm, sdev);
 426                                 intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
 427                                 kfree_rcu(sdev, rcu);
 428
 429                                 if (list_empty(&svm->devs)) {
 430                                         mmu_notifier_unregister(&svm->notifier, svm->mm);
 431
 432                                         idr_remove(&svm->iommu->pasid_idr, svm->pasid);
 433                                         if (svm->mm)
 434                                                 mmput(svm->mm);
 435                                         /* We mandate that no page faults may be outstanding
 436                                          * for the PASID when intel_svm_unbind_mm() is called.
 437                                          * If that is not obeyed, subtle errors will happen.
 438                                          * Let's make them less subtle... */
 439                                         memset(svm, 0x6b, sizeof(*svm));
 440                                         kfree(svm);
 441                                 }
 442                         }
 443                         break;
 444                 }
 445         }
 446  out:
 447         mutex_unlock(&pasid_mutex);
 448
 449         return ret;
 450 }
 451 EXPORT_SYMBOL_GPL(intel_svm_unbind_mm);
 452
 453 /* Page request queue descriptor */
 454 struct page_req_dsc {
 455         u64 srr:1;
 456         u64 bof:1;
 457         u64 pasid_present:1;
 458         u64 lpig:1;
 459         u64 pasid:20;
 460         u64 bus:8;
 461         u64 private:23;
 462         u64 prg_index:9;
 463         u64 rd_req:1;
 464         u64 wr_req:1;
 465         u64 exe_req:1;
 466         u64 priv_req:1;
 467         u64 devfn:8;
 468         u64 addr:52;
 469 };
 470
 471 #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
 472 static irqreturn_t prq_event_thread(int irq, void *d)
 473 {
 474         struct intel_iommu *iommu = d;
 475         struct intel_svm *svm = NULL;
 476         int head, tail, handled = 0;
 477
 478         tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
 479         head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
 480         while (head != tail) {
 481                 struct intel_svm_dev *sdev;
 482                 struct vm_area_struct *vma;
 483                 struct page_req_dsc *req;
 484                 struct qi_desc resp;
 485                 int ret, result;
 486                 u64 address;
 487
 488                 handled = 1;
 489
 490                 req = &iommu->prq[head / sizeof(*req)];
 491
 492                 result = QI_RESP_FAILURE;
 493                 address = (u64)req->addr << PAGE_SHIFT;
 494                 if (!req->pasid_present) {
 495                         pr_err("%s: Page request without PASID: %08llx %08llx\n",
 496                                iommu->name, ((unsigned long long *)req)[0],
 497                                ((unsigned long long *)req)[1]);
 498                         goto bad_req;
 499                 }
 500
 501                 if (!svm || svm->pasid != req->pasid) {
 502                         rcu_read_lock();
 503                         svm = idr_find(&iommu->pasid_idr, req->pasid);
 504                         /* It *can't* go away, because the driver is not permitted
 505                          * to unbind the mm while any page faults are outstanding.
 506                          * So we only need RCU to protect the internal idr code. */
 507                         rcu_read_unlock();
 508
 509                         if (!svm) {
 510                                 pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
 511                                        iommu->name, req->pasid, ((unsigned long long *)req)[0],
 512                                        ((unsigned long long *)req)[1]);
 513                                 goto no_pasid;
 514                         }
 515                 }
 516
 517                 result = QI_RESP_INVALID;
 518                 /* Since we're using init_mm.pgd directly, we should never take
 519                  * any faults on kernel addresses. */
 520                 if (!svm->mm)
 521                         goto bad_req;
 522                 down_read(&svm->mm->mmap_sem);
 523                 vma = find_extend_vma(svm->mm, address);
 524                 if (!vma || address < vma->vm_start)
 525                         goto invalid;
 526
 527                 ret = handle_mm_fault(svm->mm, vma, address,
 528                                       req->wr_req ? FAULT_FLAG_WRITE : 0);
 529                 if (ret & VM_FAULT_ERROR)
 530                         goto invalid;
 531
 532                 result = QI_RESP_SUCCESS;
 533         invalid:
 534                 up_read(&svm->mm->mmap_sem);
 535         bad_req:
 536                 /* Accounting for major/minor faults? */
 537                 rcu_read_lock();
 538                 list_for_each_entry_rcu(sdev, &svm->devs, list) {
 539                         if (sdev->sid == PCI_DEVID(req->bus, req->devfn));
 540                                 break;
 541                 }
 542                 /* Other devices can go away, but the drivers are not permitted
 543                  * to unbind while any page faults might be in flight. So it's
 544                  * OK to drop the 'lock' here now we have it. */
 545                 rcu_read_unlock();
 546
 547                 if (WARN_ON(&sdev->list == &svm->devs))
 548                         sdev = NULL;
 549
 550                 if (sdev && sdev->ops && sdev->ops->fault_cb) {
 551                         int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
 552                                 (req->wr_req << 1) | (req->exe_req);
 553                         sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result);
 554                 }
 555                 /* We get here in the error case where the PASID lookup failed,
 556                    and these can be NULL. Do not use them below this point! */
 557                 sdev = NULL;
 558                 svm = NULL;
 559         no_pasid:
 560                 if (req->lpig) {
 561                         /* Page Group Response */
 562                         resp.low = QI_PGRP_PASID(req->pasid) |
 563                                 QI_PGRP_DID((req->bus << 8) | req->devfn) |
 564                                 QI_PGRP_PASID_P(req->pasid_present) |
 565                                 QI_PGRP_RESP_TYPE;
 566                         resp.high = QI_PGRP_IDX(req->prg_index) |
 567                                 QI_PGRP_PRIV(req->private) | QI_PGRP_RESP_CODE(result);
 568
 569                         qi_submit_sync(&resp, iommu);
 570                 } else if (req->srr) {
 571                         /* Page Stream Response */
 572                         resp.low = QI_PSTRM_IDX(req->prg_index) |
 573                                 QI_PSTRM_PRIV(req->private) | QI_PSTRM_BUS(req->bus) |
 574                                 QI_PSTRM_PASID(req->pasid) | QI_PSTRM_RESP_TYPE;
 575                         resp.high = QI_PSTRM_ADDR(address) | QI_PSTRM_DEVFN(req->devfn) |
 576                                 QI_PSTRM_RESP_CODE(result);
 577
 578                         qi_submit_sync(&resp, iommu);
 579                 }
 580
 581                 head = (head + sizeof(*req)) & PRQ_RING_MASK;
 582         }
 583
 584         dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
 585
 586         return IRQ_RETVAL(handled);
 587 }