Merge commit 'v3.2-rc3' into stable/for-linus-3.3

author Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

Tue, 20 Dec 2011 22:01:18 +0000 (17:01 -0500)

committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

Tue, 20 Dec 2011 22:01:18 +0000 (17:01 -0500)
author Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tue, 20 Dec 2011 22:01:18 +0000 (17:01 -0500)
committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tue, 20 Dec 2011 22:01:18 +0000 (17:01 -0500)
diff --git a/Documentation/ABI/stable/sysfs-bus-xen-backend b/Documentation/ABI/stable/sysfs-bus-xen-backend

new file mode 100644 (file)

index 0000000..3d5951c
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-bus-xen-backend
@@ -0,0 +1,75 @@
+What:          /sys/bus/xen-backend/devices/*/devtype
+Date:          Feb 2009
+KernelVersion: 2.6.38
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                The type of the device.  e.g., one of: 'vbd' (block),
+                'vif' (network), or 'vfb' (framebuffer).
+
+What:          /sys/bus/xen-backend/devices/*/nodename
+Date:          Feb 2009
+KernelVersion: 2.6.38
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                XenStore node (under /local/domain/NNN/) for this
+                backend device.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/physical_device
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                The major:minor number (in hexadecimal) of the
+                physical device providing the storage for this backend
+                block device.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/mode
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Whether the block device is read-only ('r') or
+                read-write ('w').
+
+What:          /sys/bus/xen-backend/devices/vbd-*/statistics/f_req
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Number of flush requests from the frontend.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/statistics/oo_req
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Number of requests delayed because the backend was too
+                busy processing previous requests.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/statistics/rd_req
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Number of read requests from the frontend.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/statistics/rd_sect
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Number of sectors read by the frontend.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/statistics/wr_req
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Number of write requests from the frontend.
+
+What:          /sys/bus/xen-backend/devices/vbd-*/statistics/wr_sect
+Date:          April 2011
+KernelVersion: 3.0
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Number of sectors written by the frontend.
diff --git a/Documentation/ABI/stable/sysfs-devices-system-xen_memory b/Documentation/ABI/stable/sysfs-devices-system-xen_memory

new file mode 100644 (file)

index 0000000..caa311d
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-devices-system-xen_memory
@@ -0,0 +1,77 @@
+What:          /sys/devices/system/xen_memory/xen_memory0/max_retry_count
+Date:          May 2011
+KernelVersion: 2.6.39
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               The maximum number of times the balloon driver will
+               attempt to increase the balloon before giving up.  See
+               also 'retry_count' below.
+               A value of zero means retry forever and is the default one.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/max_schedule_delay
+Date:          May 2011
+KernelVersion: 2.6.39
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               The limit that 'schedule_delay' (see below) will be
+               increased to. The default value is 32 seconds.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/retry_count
+Date:          May 2011
+KernelVersion: 2.6.39
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               The current number of times that the balloon driver
+               has attempted to increase the size of the balloon.
+               The default value is one. With max_retry_count being
+               zero (unlimited), this means that the driver will attempt
+               to retry with a 'schedule_delay' delay.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/schedule_delay
+Date:          May 2011
+KernelVersion: 2.6.39
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               The time (in seconds) to wait between attempts to
+               increase the balloon.  Each time the balloon cannot be
+               increased, 'schedule_delay' is increased (until
+               'max_schedule_delay' is reached at which point it
+               will use the max value).
+
+What:          /sys/devices/system/xen_memory/xen_memory0/target
+Date:          April 2008
+KernelVersion: 2.6.26
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               The target number of pages to adjust this domain's
+               memory reservation to.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/target_kb
+Date:          April 2008
+KernelVersion: 2.6.26
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               As target above, except the value is in KiB.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/info/current_kb
+Date:          April 2008
+KernelVersion: 2.6.26
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               Current size (in KiB) of this domain's memory
+               reservation.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/info/high_kb
+Date:          April 2008
+KernelVersion: 2.6.26
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               Amount (in KiB) of high memory in the balloon.
+
+What:          /sys/devices/system/xen_memory/xen_memory0/info/low_kb
+Date:          April 2008
+KernelVersion: 2.6.26
+Contact:       Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+               Amount (in KiB) of low (or normal) memory in the
+               balloon.
diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h

index 1d2427d116e363dbecbb98559dd002cc9b67f4a6..fbb519828aa179e3ac75af8e8c2ea07fda944fc7 100644 (file)
--- a/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@ -71,7 +71,7 @@
  __DEFINE_GUEST_HANDLE(uchar, unsigned char);
  __DEFINE_GUEST_HANDLE(uint, unsigned int);
  __DEFINE_GUEST_HANDLE(ulong, unsigned long);
-__DEFINE_GUEST_HANDLE(u64, unsigned long);
+
  DEFINE_GUEST_HANDLE(char);
  DEFINE_GUEST_HANDLE(int);
  DEFINE_GUEST_HANDLE(long);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig

index 26c731a106afd11b8db6a37577a84b36a221aff0..fdce49c7aff6cf2ce931f25154d8e7c0ada8338c 100644 (file)
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,7 +29,8 @@ config XEN_PVHVM
  
  config XEN_MAX_DOMAIN_MEMORY
         int
-       default 128
+       default 500 if X86_64
+       default 64 if X86_32
         depends on XEN
         help
           This only affects the sizing of some bss arrays, the unused
@@ -48,3 +49,4 @@ config XEN_DEBUG_FS
         help
           Enable statistics output and various tuning options in debugfs.
           Enabling this option may incur a significant performance overhead.
+
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c

index 5a40d24ba3316b85b42e5ef5d00280b89a2033b9..3a5f55d51907aa16a8a58acfce22436f56366951 100644 (file)
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
         return 0;
  }
  
+/*
+ * This function is used to map shared frames to store grant status. It
+ * differs from map_pte_fn above in that the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+                            unsigned long addr, void *data)
+{
+       uint64_t **frames = (uint64_t **)data;
+
+       set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+       (*frames)++;
+       return 0;
+}
+
  static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
                         unsigned long addr, void *data)
  {
@@ -64,10 +78,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
  
  int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
                            unsigned long max_nr_gframes,
-                          struct grant_entry **__shared)
+                          void **__shared)
  {
         int rc;
-       struct grant_entry *shared = *__shared;
+       void *shared = *__shared;
  
         if (shared == NULL) {
                 struct vm_struct *area =
@@ -83,8 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
         return rc;
  }
  
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
-                             unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+                          unsigned long max_nr_gframes,
+                          grant_status_t **__shared)
+{
+       int rc;
+       grant_status_t *shared = *__shared;
+
+       if (shared == NULL) {
+               /* No need to pass in PTE as we are going to do it
+                * in apply_to_page_range anyhow. */
+               struct vm_struct *area =
+                       alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+               BUG_ON(area == NULL);
+               shared = area->addr;
+               *__shared = shared;
+       }
+
+       rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+                                PAGE_SIZE * nr_gframes,
+                                map_pte_fn_status, &frames);
+       return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
  {
         apply_to_page_range(&init_mm, (unsigned long)shared,
                             PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig

index 8795480c2350301362640120ac02e09709cce3d4..a1ced521cf744600e93a4f9bb2a94663abaebdca 100644 (file)
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -86,6 +86,7 @@ config XEN_BACKEND
  
  config XENFS
         tristate "Xen filesystem"
+       select XEN_PRIVCMD
         default y
         help
           The xen filesystem provides a way for domains to share
@@ -171,4 +172,10 @@ config XEN_PCIDEV_BACKEND
           xen-pciback.hide=(03:00.0)(04:00.0)
  
           If in doubt, say m.
+
+config XEN_PRIVCMD
+       tristate
+       depends on XEN
+       default m
+
  endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile

index 974fffdf22b2e15031faa6396721fa9800883b2b..aa31337192cc5421f2df8d1ebc24a34d75952449 100644 (file)
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -19,7 +19,9 @@ obj-$(CONFIG_XEN_TMEM)                        += tmem.o
  obj-$(CONFIG_SWIOTLB_XEN)              += swiotlb-xen.o
  obj-$(CONFIG_XEN_DOM0)                 += pci.o
  obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD)              += xen-privcmd.o
  
  xen-evtchn-y                           := evtchn.o
  xen-gntdev-y                           := gntdev.o
  xen-gntalloc-y                         := gntalloc.o
+xen-privcmd-y                          := privcmd.o
diff --git a/drivers/xen/events.c b/drivers/xen/events.c

index 6e075cdd0c6bf56ff8daacf986d443de23c49998..e5e5812a1014cbf2c306ec802391980a7926754c 100644 (file)
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -87,6 +87,7 @@ enum xen_irq_type {
   */
  struct irq_info {
         struct list_head list;
+       int refcnt;
         enum xen_irq_type type; /* type */
         unsigned irq;
         unsigned short evtchn;  /* event channel */
@@ -406,6 +407,7 @@ static void xen_irq_init(unsigned irq)
                 panic("Unable to allocate metadata for IRQ%d\n", irq);
  
         info->type = IRQT_UNBOUND;
+       info->refcnt = -1;
  
         irq_set_handler_data(irq, info);
  
@@ -469,6 +471,8 @@ static void xen_free_irq(unsigned irq)
  
         irq_set_handler_data(irq, NULL);
  
+       WARN_ON(info->refcnt > 0);
+
         kfree(info);
  
         /* Legacy IRQ descriptors are managed by the arch. */
@@ -637,7 +641,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,
         if (irq != -1) {
                 printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
                        irq, gsi);
-               goto out;       /* XXX need refcount? */
+               goto out;
         }
  
         irq = xen_allocate_irq_gsi(gsi);
@@ -939,9 +943,16 @@ static void unbind_from_irq(unsigned int irq)
  {
         struct evtchn_close close;
         int evtchn = evtchn_from_irq(irq);
+       struct irq_info *info = irq_get_handler_data(irq);
  
         mutex_lock(&irq_mapping_update_lock);
  
+       if (info->refcnt > 0) {
+               info->refcnt--;
+               if (info->refcnt != 0)
+                       goto done;
+       }
+
         if (VALID_EVTCHN(evtchn)) {
                 close.port = evtchn;
                 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
@@ -970,6 +981,7 @@ static void unbind_from_irq(unsigned int irq)
  
         xen_free_irq(irq);
  
+ done:
         mutex_unlock(&irq_mapping_update_lock);
  }
  
@@ -1065,6 +1077,69 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id)
  }
  EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
  
+int evtchn_make_refcounted(unsigned int evtchn)
+{
+       int irq = evtchn_to_irq[evtchn];
+       struct irq_info *info;
+
+       if (irq == -1)
+               return -ENOENT;
+
+       info = irq_get_handler_data(irq);
+
+       if (!info)
+               return -ENOENT;
+
+       WARN_ON(info->refcnt != -1);
+
+       info->refcnt = 1;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
+
+int evtchn_get(unsigned int evtchn)
+{
+       int irq;
+       struct irq_info *info;
+       int err = -ENOENT;
+
+       if (evtchn >= NR_EVENT_CHANNELS)
+               return -EINVAL;
+
+       mutex_lock(&irq_mapping_update_lock);
+
+       irq = evtchn_to_irq[evtchn];
+       if (irq == -1)
+               goto done;
+
+       info = irq_get_handler_data(irq);
+
+       if (!info)
+               goto done;
+
+       err = -EINVAL;
+       if (info->refcnt <= 0)
+               goto done;
+
+       info->refcnt++;
+       err = 0;
+ done:
+       mutex_unlock(&irq_mapping_update_lock);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(evtchn_get);
+
+void evtchn_put(unsigned int evtchn)
+{
+       int irq = evtchn_to_irq[evtchn];
+       if (WARN_ON(irq == -1))
+               return;
+       unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(evtchn_put);
+
  void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
  {
         int irq = per_cpu(ipi_to_irq, cpu)[vector];
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c

index dbc13e94b612b39759a01fb3605e956dbca807f6..b1f60a0c0bea3e00627293101fd3dd38bb1fd96e 100644 (file)
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -268,7 +268,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
         rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
                                        u->name, (void *)(unsigned long)port);
         if (rc >= 0)
-               rc = 0;
+               rc = evtchn_make_refcounted(port);
  
         return rc;
  }
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c

index e1c4c6e5b469c44449f68e9e841d264eb08dbc48..e2400c8963fa6ed8f92198c0731142ff646bd684 100644 (file)
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -74,7 +74,7 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by "
                 "the gntalloc device");
  
  static LIST_HEAD(gref_list);
-static DEFINE_SPINLOCK(gref_lock);
+static DEFINE_MUTEX(gref_mutex);
  static int gref_size;
  
  struct notify_info {
@@ -99,6 +99,12 @@ struct gntalloc_file_private_data {
         uint64_t index;
  };
  
+struct gntalloc_vma_private_data {
+       struct gntalloc_gref *gref;
+       int users;
+       int count;
+};
+
  static void __del_gref(struct gntalloc_gref *gref);
  
  static void do_cleanup(void)
@@ -143,15 +149,15 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
         }
  
         /* Add to gref lists. */
-       spin_lock(&gref_lock);
+       mutex_lock(&gref_mutex);
         list_splice_tail(&queue_gref, &gref_list);
         list_splice_tail(&queue_file, &priv->list);
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
  
         return 0;
  
  undo:
-       spin_lock(&gref_lock);
+       mutex_lock(&gref_mutex);
         gref_size -= (op->count - i);
  
         list_for_each_entry(gref, &queue_file, next_file) {
@@ -167,7 +173,7 @@ undo:
          */
         if (unlikely(!list_empty(&queue_gref)))
                 list_splice_tail(&queue_gref, &gref_list);
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
         return rc;
  }
  
@@ -178,8 +184,10 @@ static void __del_gref(struct gntalloc_gref *gref)
                 tmp[gref->notify.pgoff] = 0;
                 kunmap(gref->page);
         }
-       if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+       if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
                 notify_remote_via_evtchn(gref->notify.event);
+               evtchn_put(gref->notify.event);
+       }
  
         gref->notify.flags = 0;
  
@@ -189,6 +197,8 @@ static void __del_gref(struct gntalloc_gref *gref)
  
                 if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
                         return;
+
+               gnttab_free_grant_reference(gref->gref_id);
         }
  
         gref_size--;
@@ -251,7 +261,7 @@ static int gntalloc_release(struct inode *inode, struct file *filp)
  
         pr_debug("%s: priv %p\n", __func__, priv);
  
-       spin_lock(&gref_lock);
+       mutex_lock(&gref_mutex);
         while (!list_empty(&priv->list)) {
                 gref = list_entry(priv->list.next,
                         struct gntalloc_gref, next_file);
@@ -261,7 +271,7 @@ static int gntalloc_release(struct inode *inode, struct file *filp)
                         __del_gref(gref);
         }
         kfree(priv);
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
  
         return 0;
  }
@@ -286,21 +296,21 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,
                 goto out;
         }
  
-       spin_lock(&gref_lock);
+       mutex_lock(&gref_mutex);
         /* Clean up pages that were at zero (local) users but were still mapped
          * by remote domains. Since those pages count towards the limit that we
          * are about to enforce, removing them here is a good idea.
          */
         do_cleanup();
         if (gref_size + op.count > limit) {
-               spin_unlock(&gref_lock);
+               mutex_unlock(&gref_mutex);
                 rc = -ENOSPC;
                 goto out_free;
         }
         gref_size += op.count;
         op.index = priv->index;
         priv->index += op.count * PAGE_SIZE;
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
  
         rc = add_grefs(&op, gref_ids, priv);
         if (rc < 0)
@@ -343,7 +353,7 @@ static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,
                 goto dealloc_grant_out;
         }
  
-       spin_lock(&gref_lock);
+       mutex_lock(&gref_mutex);
         gref = find_grefs(priv, op.index, op.count);
         if (gref) {
                 /* Remove from the file list only, and decrease reference count.
@@ -363,7 +373,7 @@ static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,
  
         do_cleanup();
  
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
  dealloc_grant_out:
         return rc;
  }
@@ -383,7 +393,7 @@ static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,
         index = op.index & ~(PAGE_SIZE - 1);
         pgoff = op.index & (PAGE_SIZE - 1);
  
-       spin_lock(&gref_lock);
+       mutex_lock(&gref_mutex);
  
         gref = find_grefs(priv, index, 1);
         if (!gref) {
@@ -396,12 +406,30 @@ static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,
                 goto unlock_out;
         }
  
+       /* We need to grab a reference to the event channel we are going to use
+        * to send the notify before releasing the reference we may already have
+        * (if someone has called this ioctl twice). This is required so that
+        * it is possible to change the clear_byte part of the notification
+        * without disturbing the event channel part, which may now be the last
+        * reference to that event channel.
+        */
+       if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+               if (evtchn_get(op.event_channel_port)) {
+                       rc = -EINVAL;
+                       goto unlock_out;
+               }
+       }
+
+       if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+               evtchn_put(gref->notify.event);
+
         gref->notify.flags = op.action;
         gref->notify.pgoff = pgoff;
         gref->notify.event = op.event_channel_port;
         rc = 0;
+
   unlock_out:
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
         return rc;
  }
  
@@ -429,26 +457,40 @@ static long gntalloc_ioctl(struct file *filp, unsigned int cmd,
  
  static void gntalloc_vma_open(struct vm_area_struct *vma)
  {
-       struct gntalloc_gref *gref = vma->vm_private_data;
-       if (!gref)
+       struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+
+       if (!priv)
                 return;
  
-       spin_lock(&gref_lock);
-       gref->users++;
-       spin_unlock(&gref_lock);
+       mutex_lock(&gref_mutex);
+       priv->users++;
+       mutex_unlock(&gref_mutex);
  }
  
  static void gntalloc_vma_close(struct vm_area_struct *vma)
  {
-       struct gntalloc_gref *gref = vma->vm_private_data;
-       if (!gref)
+       struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+       struct gntalloc_gref *gref, *next;
+       int i;
+
+       if (!priv)
                 return;
  
-       spin_lock(&gref_lock);
-       gref->users--;
-       if (gref->users == 0)
-               __del_gref(gref);
-       spin_unlock(&gref_lock);
+       mutex_lock(&gref_mutex);
+       priv->users--;
+       if (priv->users == 0) {
+               gref = priv->gref;
+               for (i = 0; i < priv->count; i++) {
+                       gref->users--;
+                       next = list_entry(gref->next_gref.next,
+                                         struct gntalloc_gref, next_gref);
+                       if (gref->users == 0)
+                               __del_gref(gref);
+                       gref = next;
+               }
+               kfree(priv);
+       }
+       mutex_unlock(&gref_mutex);
  }
  
  static struct vm_operations_struct gntalloc_vmops = {
@@ -459,19 +501,25 @@ static struct vm_operations_struct gntalloc_vmops = {
  static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
  {
         struct gntalloc_file_private_data *priv = filp->private_data;
+       struct gntalloc_vma_private_data *vm_priv;
         struct gntalloc_gref *gref;
         int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
         int rv, i;
  
-       pr_debug("%s: priv %p, page %lu+%d\n", __func__,
-                      priv, vma->vm_pgoff, count);
-
         if (!(vma->vm_flags & VM_SHARED)) {
                 printk(KERN_ERR "%s: Mapping must be shared.\n", __func__);
                 return -EINVAL;
         }
  
-       spin_lock(&gref_lock);
+       vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL);
+       if (!vm_priv)
+               return -ENOMEM;
+
+       mutex_lock(&gref_mutex);
+
+       pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__,
+                      priv, vm_priv, vma->vm_pgoff, count);
+
         gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
         if (gref == NULL) {
                 rv = -ENOENT;
@@ -480,9 +528,13 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
                 goto out_unlock;
         }
  
-       vma->vm_private_data = gref;
+       vm_priv->gref = gref;
+       vm_priv->users = 1;
+       vm_priv->count = count;
+
+       vma->vm_private_data = vm_priv;
  
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
  
         vma->vm_ops = &gntalloc_vmops;
  
@@ -499,7 +551,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
         rv = 0;
  
  out_unlock:
-       spin_unlock(&gref_lock);
+       mutex_unlock(&gref_mutex);
         return rv;
  }
  
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c

index afca14d9042e6cd2ae03f238143ac8188aaf35fb..f52f661f8f82c731b678891bf1d27149b6f63421 100644 (file)
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -193,8 +193,10 @@ static void gntdev_put_map(struct grant_map *map)
  
         atomic_sub(map->count, &pages_mapped);
  
-       if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+       if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
                 notify_remote_via_evtchn(map->notify.event);
+               evtchn_put(map->notify.event);
+       }
  
         if (map->pages) {
                 if (!use_ptemod)
@@ -599,6 +601,8 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
         struct ioctl_gntdev_unmap_notify op;
         struct grant_map *map;
         int rc;
+       int out_flags;
+       unsigned int out_event;
  
         if (copy_from_user(&op, u, sizeof(op)))
                 return -EFAULT;
@@ -606,6 +610,21 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
         if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
                 return -EINVAL;
  
+       /* We need to grab a reference to the event channel we are going to use
+        * to send the notify before releasing the reference we may already have
+        * (if someone has called this ioctl twice). This is required so that
+        * it is possible to change the clear_byte part of the notification
+        * without disturbing the event channel part, which may now be the last
+        * reference to that event channel.
+        */
+       if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+               if (evtchn_get(op.event_channel_port))
+                       return -EINVAL;
+       }
+
+       out_flags = op.action;
+       out_event = op.event_channel_port;
+
         spin_lock(&priv->lock);
  
         list_for_each_entry(map, &priv->maps, next) {
@@ -624,12 +643,22 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
                 goto unlock_out;
         }
  
+       out_flags = map->notify.flags;
+       out_event = map->notify.event;
+
         map->notify.flags = op.action;
         map->notify.addr = op.index - (map->index << PAGE_SHIFT);
         map->notify.event = op.event_channel_port;
+
         rc = 0;
+
   unlock_out:
         spin_unlock(&priv->lock);
+
+       /* Drop the reference to the event channel we did not save in the map */
+       if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
+               evtchn_put(out_event);
+
         return rc;
  }
  
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c

index bf1c094f4ebf12ea234b9a60fdaea118d6217d8e..a3d0e1e278c1e12c31a353d360a47acda025552c 100644 (file)
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -44,16 +44,19 @@
  #include <xen/page.h>
  #include <xen/grant_table.h>
  #include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
  #include <asm/xen/hypercall.h>
  
  #include <asm/pgtable.h>
  #include <asm/sync_bitops.h>
  
-
  /* External tools reserve first few grant table entries. */
  #define NR_RESERVED_ENTRIES 8
  #define GNTTAB_LIST_END 0xffffffff
-#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
+#define GREFS_PER_GRANT_FRAME \
+(grant_table_version == 1 ?                      \
+(PAGE_SIZE / sizeof(struct grant_entry_v1)) :   \
+(PAGE_SIZE / sizeof(union grant_entry_v2)))
  
  static grant_ref_t **gnttab_list;
  static unsigned int nr_grant_frames;
@@ -64,13 +67,97 @@ static DEFINE_SPINLOCK(gnttab_list_lock);
  unsigned long xen_hvm_resume_frames;
  EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
  
-static struct grant_entry *shared;
+static union {
+       struct grant_entry_v1 *v1;
+       union grant_entry_v2 *v2;
+       void *addr;
+} gnttab_shared;
+
+/*This is a structure of function pointers for grant table*/
+struct gnttab_ops {
+       /*
+        * Mapping a list of frames for storing grant entries. Frames parameter
+        * is used to store grant table address when grant table being setup,
+        * nr_gframes is the number of frames to map grant table. Returning
+        * GNTST_okay means success and negative value means failure.
+        */
+       int (*map_frames)(unsigned long *frames, unsigned int nr_gframes);
+       /*
+        * Release a list of frames which are mapped in map_frames for grant
+        * entry status.
+        */
+       void (*unmap_frames)(void);
+       /*
+        * Introducing a valid entry into the grant table, granting the frame of
+        * this grant entry to domain for accessing or transferring. Ref
+        * parameter is reference of this introduced grant entry, domid is id of
+        * granted domain, frame is the page frame to be granted, and flags is
+        * status of the grant entry to be updated.
+        */
+       void (*update_entry)(grant_ref_t ref, domid_t domid,
+                            unsigned long frame, unsigned flags);
+       /*
+        * Stop granting a grant entry to domain for accessing. Ref parameter is
+        * reference of a grant entry whose grant access will be stopped,
+        * readonly is not in use in this function. If the grant entry is
+        * currently mapped for reading or writing, just return failure(==0)
+        * directly and don't tear down the grant access. Otherwise, stop grant
+        * access for this entry and return success(==1).
+        */
+       int (*end_foreign_access_ref)(grant_ref_t ref, int readonly);
+       /*
+        * Stop granting a grant entry to domain for transfer. Ref parameter is
+        * reference of a grant entry whose grant transfer will be stopped. If
+        * transfer has not started, just reclaim the grant entry and return
+        * failure(==0). Otherwise, wait for the transfer to complete and then
+        * return the frame.
+        */
+       unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref);
+       /*
+        * Query the status of a grant entry. Ref parameter is reference of
+        * queried grant entry, return value is the status of queried entry.
+        * Detailed status(writing/reading) can be gotten from the return value
+        * by bit operations.
+        */
+       int (*query_foreign_access)(grant_ref_t ref);
+       /*
+        * Grant a domain to access a range of bytes within the page referred by
+        * an available grant entry. Ref parameter is reference of a grant entry
+        * which will be sub-page accessed, domid is id of grantee domain, frame
+        * is frame address of subpage grant, flags is grant type and flag
+        * information, page_off is offset of the range of bytes, and length is
+        * length of bytes to be accessed.
+        */
+       void (*update_subpage_entry)(grant_ref_t ref, domid_t domid,
+                                    unsigned long frame, int flags,
+                                    unsigned page_off, unsigned length);
+       /*
+        * Redirect an available grant entry on domain A to another grant
+        * reference of domain B, then allow domain C to use grant reference
+        * of domain B transitively. Ref parameter is an available grant entry
+        * reference on domain A, domid is id of domain C which accesses grant
+        * entry transitively, flags is grant type and flag information,
+        * trans_domid is id of domain B whose grant entry is finally accessed
+        * transitively, trans_gref is grant entry transitive reference of
+        * domain B.
+        */
+       void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags,
+                                  domid_t trans_domid, grant_ref_t trans_gref);
+};
+
+static struct gnttab_ops *gnttab_interface;
+
+/* This reflects the status of grant entries, so it acts as a global value */
+static grant_status_t *grstatus;
+
+static int grant_table_version;
  
  static struct gnttab_free_callback *gnttab_free_callback_list;
  
  static int gnttab_expand(unsigned int req_entries);
  
  #define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define SPP (PAGE_SIZE / sizeof(grant_status_t))
  
  static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
  {
@@ -142,23 +229,33 @@ static void put_free_entry(grant_ref_t ref)
         spin_unlock_irqrestore(&gnttab_list_lock, flags);
  }
  
-static void update_grant_entry(grant_ref_t ref, domid_t domid,
-                              unsigned long frame, unsigned flags)
+/*
+ * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
+ * Introducing a valid entry into the grant table:
+ *  1. Write ent->domid.
+ *  2. Write ent->frame:
+ *      GTF_permit_access:   Frame to which access is permitted.
+ *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ *                           frame, or zero if none.
+ *  3. Write memory barrier (WMB).
+ *  4. Write ent->flags, inc. valid type.
+ */
+/* Version 1 layout: publish grant entry 'ref' in the gnttab_shared.v1 table. */
+static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
+                                  unsigned long frame, unsigned flags)
+{
+       gnttab_shared.v1[ref].domid = domid;
+       gnttab_shared.v1[ref].frame = frame;
+       /* domid and frame are written before the barrier, flags after it. */
+       wmb();
+       gnttab_shared.v1[ref].flags = flags;
+}
+
+/*
+ * Version 2 layout: publish full-page grant entry 'ref'.
+ *
+ * The caller supplies the complete flags word, entry type included, exactly
+ * as for gnttab_update_entry_v1, so it is stored unmodified.  OR-ing
+ * GTF_permit_access in here would change the entry type requested by
+ * gnttab_grant_foreign_transfer_ref(), which passes GTF_accept_transfer.
+ */
+static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid,
+                                  unsigned long frame, unsigned flags)
  {
-       /*
-        * Introducing a valid entry into the grant table:
-        *  1. Write ent->domid.
-        *  2. Write ent->frame:
-        *      GTF_permit_access:   Frame to which access is permitted.
-        *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
-        *                           frame, or zero if none.
-        *  3. Write memory barrier (WMB).
-        *  4. Write ent->flags, inc. valid type.
-        */
-       shared[ref].frame = frame;
-       shared[ref].domid = domid;
+       gnttab_shared.v2[ref].hdr.domid = domid;
+       gnttab_shared.v2[ref].full_page.frame = frame;
         wmb();
-       shared[ref].flags = flags;
+       gnttab_shared.v2[ref].hdr.flags = flags;
  }
  
  /*
@@ -167,7 +264,7 @@ static void update_grant_entry(grant_ref_t ref, domid_t domid,
  void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
                                      unsigned long frame, int readonly)
  {
-       update_grant_entry(ref, domid, frame,
+       gnttab_interface->update_entry(ref, domid, frame,
                            GTF_permit_access | (readonly ? GTF_readonly : 0));
  }
  EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
@@ -187,31 +284,184 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
  }
  EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
  
-int gnttab_query_foreign_access(grant_ref_t ref)
+/*
+ * Version 2 layout: fill in grant entry 'ref' as a sub-page grant for domain
+ * 'domid', covering 'length' bytes at offset 'page_off' within 'frame'.
+ * The sub_page fields and hdr.domid are written first; hdr.flags is written
+ * after the write barrier with GTF_permit_access and GTF_sub_page added to
+ * the caller's flags.
+ */
+void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid,
+                                   unsigned long frame, int flags,
+                                   unsigned page_off,
+                                   unsigned length)
+{
+       gnttab_shared.v2[ref].sub_page.frame = frame;
+       gnttab_shared.v2[ref].sub_page.page_off = page_off;
+       gnttab_shared.v2[ref].sub_page.length = length;
+       gnttab_shared.v2[ref].hdr.domid = domid;
+       wmb();
+       gnttab_shared.v2[ref].hdr.flags =
+                               GTF_permit_access | GTF_sub_page | flags;
+}
+
+/*
+ * Set up the already-allocated grant reference 'ref' as a sub-page grant.
+ *
+ * Returns -EPERM if 'flags' contains GTF_accept_transfer, GTF_reading,
+ * GTF_writing or GTF_transitive, -ENOSYS if the active grant-table interface
+ * has no update_subpage_entry hook, and 0 once the entry has been written.
+ */
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+                                           unsigned long frame, int flags,
+                                           unsigned page_off,
+                                           unsigned length)
  {
-       u16 nflags;
+       if (flags & (GTF_accept_transfer | GTF_reading |
+                    GTF_writing | GTF_transitive))
+               return -EPERM;
  
-       nflags = shared[ref].flags;
+       if (gnttab_interface->update_subpage_entry == NULL)
+               return -ENOSYS;
  
-       return nflags & (GTF_reading|GTF_writing);
+       gnttab_interface->update_subpage_entry(ref, domid, frame, flags,
+                                              page_off, length);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref);
+
+/*
+ * Allocate a free grant reference and set it up as a sub-page grant.
+ *
+ * Returns the new reference on success, -ENOSPC when no free entry is
+ * available, or the error from gnttab_grant_foreign_access_subpage_ref()
+ * (in which case the reference is returned to the free list).
+ */
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+                                       int flags, unsigned page_off,
+                                       unsigned length)
+{
+       int gref = get_free_entries(1);
+       int err;
+
+       if (unlikely(gref < 0))
+               return -ENOSPC;
+
+       err = gnttab_grant_foreign_access_subpage_ref(gref, domid, frame,
+                                                     flags, page_off, length);
+       if (err >= 0)
+               return gref;
+
+       /* Setup failed: hand the unused reference back. */
+       put_free_entry(gref);
+       return err;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage);
+
+/* True when the active grant-table interface provides sub-page grants. */
+bool gnttab_subpage_grants_available(void)
+{
+       if (gnttab_interface->update_subpage_entry == NULL)
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available);
+
+/*
+ * Version 2 layout: fill in grant entry 'ref' as a transitive grant for
+ * domain 'domid' that refers to grant 'trans_gref' of domain 'trans_domid'.
+ * The transitive fields and hdr.domid are written first; hdr.flags is
+ * written after the write barrier with GTF_permit_access and GTF_transitive
+ * added to the caller's flags.
+ */
+void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid,
+                                 int flags, domid_t trans_domid,
+                                 grant_ref_t trans_gref)
+{
+       gnttab_shared.v2[ref].transitive.trans_domid = trans_domid;
+       gnttab_shared.v2[ref].transitive.gref = trans_gref;
+       gnttab_shared.v2[ref].hdr.domid = domid;
+       wmb();
+       gnttab_shared.v2[ref].hdr.flags =
+                               GTF_permit_access | GTF_transitive | flags;
+}
+
+/*
+ * Set up the already-allocated grant reference 'ref' as a transitive grant.
+ *
+ * Returns -EPERM if 'flags' contains GTF_accept_transfer, GTF_reading,
+ * GTF_writing or GTF_sub_page, -ENOSYS if the active grant-table interface
+ * has no update_trans_entry hook, and 0 once the entry has been written.
+ */
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+                                         int flags, domid_t trans_domid,
+                                         grant_ref_t trans_gref)
+{
+       if ((flags & (GTF_accept_transfer | GTF_reading |
+                     GTF_writing | GTF_sub_page)) != 0)
+               return -EPERM;
+
+       if (!gnttab_interface->update_trans_entry)
+               return -ENOSYS;
+
+       gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid,
+                                            trans_gref);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref);
+
+/*
+ * Allocate a free grant reference and set it up as a transitive grant.
+ *
+ * Returns the new reference on success, -ENOSPC when no free entry is
+ * available, or the error from gnttab_grant_foreign_access_trans_ref()
+ * (in which case the reference is returned to the free list).
+ */
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+                                     domid_t trans_domid,
+                                     grant_ref_t trans_gref)
+{
+       int gref = get_free_entries(1);
+       int err;
+
+       if (unlikely(gref < 0))
+               return -ENOSPC;
+
+       err = gnttab_grant_foreign_access_trans_ref(gref, domid, flags,
+                                                   trans_domid, trans_gref);
+       if (err >= 0)
+               return gref;
+
+       /* Setup failed: hand the unused reference back. */
+       put_free_entry(gref);
+       return err;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans);
+
+/* True when the active grant-table interface provides transitive grants. */
+bool gnttab_trans_grants_available(void)
+{
+       if (gnttab_interface->update_trans_entry == NULL)
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(gnttab_trans_grants_available);
+
+/*
+ * Version 1 layout: return the GTF_reading/GTF_writing bits of the entry's
+ * own flags word (non-zero means the grant is still being accessed).
+ */
+static int gnttab_query_foreign_access_v1(grant_ref_t ref)
+{
+       return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing);
+}
+
+/*
+ * Version 2 layout: return the GTF_reading/GTF_writing bits from the
+ * separate grstatus array rather than from the entry header.
+ */
+static int gnttab_query_foreign_access_v2(grant_ref_t ref)
+{
+       return grstatus[ref] & (GTF_reading|GTF_writing);
+}
+
+/* Dispatch to the query routine of the active grant-table version. */
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+       return gnttab_interface->query_foreign_access(ref);
  }
  EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
  
-int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+/*
+ * Version 1 layout: atomically clear the flags of entry 'ref'.
+ *
+ * Returns 0 (after logging a warning) if the entry is still marked
+ * GTF_reading or GTF_writing, and 1 once the flags word has been swapped
+ * to zero.  'readonly' is not used.
+ */
+static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
  {
         u16 flags, nflags;
+       u16 *pflags;
  
-       nflags = shared[ref].flags;
+       pflags = &gnttab_shared.v1[ref].flags;
+       nflags = *pflags;
         do {
                 flags = nflags;
                 if (flags & (GTF_reading|GTF_writing)) {
                         printk(KERN_ALERT "WARNING: g.e. still in use!\n");
                         return 0;
                 }
-       } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
+       } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags);
+
+       return 1;
+}
+
+/*
+ * Version 2 layout: clear the header flags of entry 'ref', then consult the
+ * separate status array.
+ *
+ * Returns 0 if grstatus still shows GTF_reading or GTF_writing, 1 otherwise.
+ * 'readonly' is not used.
+ * NOTE(review): unlike the v1 variant, hdr.flags has already been zeroed
+ * (and no warning is printed) when 0 is returned.
+ */
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+{
+       gnttab_shared.v2[ref].hdr.flags = 0;
+       mb();
+       if (grstatus[ref] & (GTF_reading|GTF_writing)) {
+               return 0;
+       } else {
+               /*
+                * The read of grstatus needs to have acquire semantics.
+                * On x86, reads already have that, and we just need to
+                * protect against compiler reorderings.  On other
+                * architectures we may need a full barrier.
+                */
+#ifdef CONFIG_X86
+               barrier();
+#else
+               mb();
+#endif
+       }
  
         return 1;
  }
+
+/* Dispatch to the end-access routine of the active grant-table version. */
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+       return gnttab_interface->end_foreign_access_ref(ref, readonly);
+}
  EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
  
  void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
@@ -246,37 +496,76 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
  void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
                                        unsigned long pfn)
  {
-       update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
+       gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);
  }
  EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
  
-unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+/*
+ * Version 1 layout: finish (or cancel) a page transfer on entry 'ref'.
+ *
+ * Returns 0 if the transfer had not been committed and the entry could be
+ * reclaimed; otherwise spins until GTF_transfer_completed is set and
+ * returns the frame stored in the entry (a zero frame is a BUG).
+ */
+static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
  {
         unsigned long frame;
         u16           flags;
+       u16          *pflags;
+
+       pflags = &gnttab_shared.v1[ref].flags;
  
         /*
          * If a transfer is not even yet started, try to reclaim the grant
          * reference and return failure (== 0).
          */
-       while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
-               if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+       while (!((flags = *pflags) & GTF_transfer_committed)) {
+               if (sync_cmpxchg(pflags, flags, 0) == flags)
                         return 0;
                 cpu_relax();
         }
  
         /* If a transfer is in progress then wait until it is completed. */
         while (!(flags & GTF_transfer_completed)) {
-               flags = shared[ref].flags;
+               flags = *pflags;
                 cpu_relax();
         }
  
         rmb();  /* Read the frame number /after/ reading completion status. */
-       frame = shared[ref].frame;
+       frame = gnttab_shared.v1[ref].frame;
+       BUG_ON(frame == 0);
+
+       return frame;
+}
+
+/*
+ * Version 2 layout: finish (or cancel) a page transfer on entry 'ref'.
+ *
+ * Same protocol as the v1 variant, but the flags live in hdr.flags and the
+ * frame in full_page.frame.  Returns 0 if the uncommitted entry was
+ * reclaimed, otherwise the transferred frame (a zero frame is a BUG).
+ */
+static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
+{
+       unsigned long frame;
+       u16           flags;
+       u16          *pflags;
+
+       pflags = &gnttab_shared.v2[ref].hdr.flags;
+
+       /*
+        * If a transfer is not even yet started, try to reclaim the grant
+        * reference and return failure (== 0).
+        */
+       while (!((flags = *pflags) & GTF_transfer_committed)) {
+               if (sync_cmpxchg(pflags, flags, 0) == flags)
+                       return 0;
+               cpu_relax();
+       }
+
+       /* If a transfer is in progress then wait until it is completed. */
+       while (!(flags & GTF_transfer_completed)) {
+               flags = *pflags;
+               cpu_relax();
+       }
+
+       rmb();  /* Read the frame number /after/ reading completion status. */
+       frame = gnttab_shared.v2[ref].full_page.frame;
         BUG_ON(frame == 0);
  
         return frame;
  }
+
+/* Dispatch to the end-transfer routine of the active grant-table version. */
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+       return gnttab_interface->end_foreign_transfer_ref(ref);
+}
  EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
  
  unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
@@ -448,8 +737,8 @@ unsigned int gnttab_max_grant_frames(void)
  EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
  
  int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
-                       struct gnttab_map_grant_ref *kmap_ops,
-                       struct page **pages, unsigned int count)
+                   struct gnttab_map_grant_ref *kmap_ops,
+                   struct page **pages, unsigned int count)
  {
         int i, ret;
         pte_t *pte;
@@ -499,7 +788,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
  EXPORT_SYMBOL_GPL(gnttab_map_refs);
  
  int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
-               struct page **pages, unsigned int count)
+                     struct page **pages, unsigned int count)
  {
         int i, ret;
  
@@ -520,6 +809,77 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
  }
  EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
  
+/*
+ * Number of status frames matching 'nr_grant_frames' grant frames: the
+ * entry count is divided by SPP (grant_status_t slots per page), rounding up.
+ * Presumably GREFS_PER_GRANT_FRAME is the number of grant entries per
+ * frame -- defined outside this hunk, TODO confirm.
+ */
+static unsigned nr_status_frames(unsigned nr_grant_frames)
+{
+       return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP;
+}
+
+/*
+ * Version 1 layout: map the 'nr_gframes' grant frames listed in 'frames' at
+ * gnttab_shared.addr.  A mapping failure is treated as fatal (BUG_ON), so
+ * the function only ever returns 0.
+ */
+static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes)
+{
+       int rc;
+
+       rc = arch_gnttab_map_shared(frames, nr_gframes,
+                                   gnttab_max_grant_frames(),
+                                   &gnttab_shared.addr);
+       BUG_ON(rc);
+
+       return 0;
+}
+
+/* Version 1 layout: unmap the nr_grant_frames shared grant frames. */
+static void gnttab_unmap_frames_v1(void)
+{
+       arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+}
+
+/*
+ * Version 2 layout: fetch and map the status frames (at grstatus), then map
+ * the grant frames listed in 'frames' (at gnttab_shared.addr).
+ *
+ * Returns -ENOMEM if the temporary frame list cannot be allocated and
+ * -ENOSYS if the hypervisor lacks GNTTABOP_get_status_frames; any other
+ * hypercall or mapping failure is a BUG.  Returns 0 on success.
+ */
+static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes)
+{
+       uint64_t *sframes;
+       unsigned int nr_sframes;
+       struct gnttab_get_status_frames getframes;
+       int rc;
+
+       nr_sframes = nr_status_frames(nr_gframes);
+
+       /*
+        * No need for kzalloc as it is initialized in the following
+        * hypercall, GNTTABOP_get_status_frames.
+        */
+       sframes = kmalloc(nr_sframes  * sizeof(uint64_t), GFP_ATOMIC);
+       if (!sframes)
+               return -ENOMEM;
+
+       getframes.dom        = DOMID_SELF;
+       getframes.nr_frames  = nr_sframes;
+       set_xen_guest_handle(getframes.frame_list, sframes);
+
+       rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames,
+                                      &getframes, 1);
+       if (rc == -ENOSYS) {
+               kfree(sframes);
+               return -ENOSYS;
+       }
+
+       BUG_ON(rc || getframes.status);
+
+       rc = arch_gnttab_map_status(sframes, nr_sframes,
+                                   nr_status_frames(gnttab_max_grant_frames()),
+                                   &grstatus);
+       BUG_ON(rc);
+       /* The frame list is only needed while setting up the mapping. */
+       kfree(sframes);
+
+       rc = arch_gnttab_map_shared(frames, nr_gframes,
+                                   gnttab_max_grant_frames(),
+                                   &gnttab_shared.addr);
+       BUG_ON(rc);
+
+       return 0;
+}
+
+/* Version 2 layout: unmap both the shared grant frames and the status frames. */
+static void gnttab_unmap_frames_v2(void)
+{
+       arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+       arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames));
+}
+
  static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
  {
         struct gnttab_setup_table setup;
@@ -551,6 +911,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
                 return rc;
         }
  
+       /* No need for kzalloc as it is initialized in following hypercall
+        * GNTTABOP_setup_table.
+        */
         frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
         if (!frames)
                 return -ENOMEM;
@@ -567,19 +930,65 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
  
         BUG_ON(rc || setup.status);
  
-       rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
-                                   &shared);
-       BUG_ON(rc);
+       rc = gnttab_interface->map_frames(frames, nr_gframes);
  
         kfree(frames);
  
-       return 0;
+       return rc;
+}
+
+/*
+ * Operations for the version 1 grant-table layout.  update_subpage_entry
+ * and update_trans_entry are left NULL, which is how callers detect that
+ * sub-page and transitive grants are unavailable.
+ */
+static struct gnttab_ops gnttab_v1_ops = {
+       .map_frames                     = gnttab_map_frames_v1,
+       .unmap_frames                   = gnttab_unmap_frames_v1,
+       .update_entry                   = gnttab_update_entry_v1,
+       .end_foreign_access_ref         = gnttab_end_foreign_access_ref_v1,
+       .end_foreign_transfer_ref       = gnttab_end_foreign_transfer_ref_v1,
+       .query_foreign_access           = gnttab_query_foreign_access_v1,
+};
+
+/*
+ * Operations for the version 2 grant-table layout, including the sub-page
+ * and transitive grant hooks.
+ */
+static struct gnttab_ops gnttab_v2_ops = {
+       .map_frames                     = gnttab_map_frames_v2,
+       .unmap_frames                   = gnttab_unmap_frames_v2,
+       .update_entry                   = gnttab_update_entry_v2,
+       .end_foreign_access_ref         = gnttab_end_foreign_access_ref_v2,
+       .end_foreign_transfer_ref       = gnttab_end_foreign_transfer_ref_v2,
+       .query_foreign_access           = gnttab_query_foreign_access_v2,
+       .update_subpage_entry           = gnttab_update_subpage_entry_v2,
+       .update_trans_entry             = gnttab_update_trans_entry_v2,
+};
+
+/*
+ * Ask the hypervisor for the version 2 grant-table layout and select the
+ * matching ops table.  Falls back to version 1 when the request fails,
+ * unless version 2 was already in use, in which case the kernel panics.
+ */
+static void gnttab_request_version(void)
+{
+       int rc;
+       struct gnttab_set_version gsv;
+
+       gsv.version = 2;
+       rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
+       if (rc == 0) {
+               grant_table_version = 2;
+               gnttab_interface = &gnttab_v2_ops;
+       } else if (grant_table_version == 2) {
+               /*
+                * If we've already used version 2 features,
+                * but then suddenly discover that they're not
+                * available (e.g. migrating to an older
+                * version of Xen), almost unbounded badness
+                * can happen.
+                */
+               panic("we need grant tables version 2, but only version 1 is available");
+       } else {
+               grant_table_version = 1;
+               gnttab_interface = &gnttab_v1_ops;
+       }
+       printk(KERN_INFO "Grant tables using version %d layout.\n",
+               grant_table_version);
  }
  
  int gnttab_resume(void)
  {
         unsigned int max_nr_gframes;
  
+       gnttab_request_version();
         max_nr_gframes = gnttab_max_grant_frames();
         if (max_nr_gframes < nr_grant_frames)
                 return -ENOSYS;
@@ -587,9 +996,10 @@ int gnttab_resume(void)
         if (xen_pv_domain())
                 return gnttab_map(0, nr_grant_frames - 1);
  
-       if (!shared) {
-               shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
-               if (shared == NULL) {
+       if (gnttab_shared.addr == NULL) {
+               gnttab_shared.addr = ioremap(xen_hvm_resume_frames,
+                                               PAGE_SIZE * max_nr_gframes);
+               if (gnttab_shared.addr == NULL) {
                         printk(KERN_WARNING
                                         "Failed to ioremap gnttab share frames!");
                         return -ENOMEM;
@@ -603,7 +1013,7 @@ int gnttab_resume(void)
  
  int gnttab_suspend(void)
  {
-       arch_gnttab_unmap_shared(shared, nr_grant_frames);
+       gnttab_interface->unmap_frames();
         return 0;
  }
  
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c

new file mode 100644 (file)

index 0000000..ccee0f1
--- /dev/null
+++ b/drivers/xen/privcmd.c
@@ -0,0 +1,435 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+/*
+ * Copy a hypercall request from user space and issue it via privcmd_call().
+ * Returns -EFAULT if the request cannot be read, otherwise the value
+ * produced by privcmd_call().
+ */
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+       struct privcmd_hypercall hc;
+       long result;
+
+       if (copy_from_user(&hc, udata, sizeof(hc)) != 0)
+               return -EFAULT;
+
+       result = privcmd_call(hc.op, hc.arg[0], hc.arg[1], hc.arg[2],
+                             hc.arg[3], hc.arg[4]);
+       return result;
+}
+
+/* Free every page linked on 'pages' through its lru field; leave the list empty. */
+static void free_page_list(struct list_head *pages)
+{
+       struct page *cur, *next;
+
+       list_for_each_entry_safe(cur, next, pages, lru) {
+               __free_page(cur);
+       }
+
+       /* The freed pages' list nodes are gone; reset the head. */
+       INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Copy an array of 'nelem' items of 'size' bytes each from user space into
+ * freshly allocated pages appended to 'pagelist'.  Items never straddle a
+ * page boundary.
+ *
+ * Returns 0 on success (also, with nothing gathered, when 'size' exceeds
+ * PAGE_SIZE), -ENOMEM if a page cannot be allocated, or -EFAULT if user
+ * memory cannot be read.  On failure the pages gathered so far remain on
+ * 'pagelist'; it's up to the caller to dispose of that partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+                       unsigned nelem, size_t size,
+                       void __user *data)
+{
+       void *pagedata = NULL;          /* quiet, gcc */
+       unsigned pageidx = PAGE_SIZE;   /* forces a page on the first item */
+
+       if (size > PAGE_SIZE)
+               return 0;
+
+       for (; nelem != 0; nelem--) {
+               /* Start a new page when the next item would not fit. */
+               if (pageidx > PAGE_SIZE-size) {
+                       struct page *page = alloc_page(GFP_KERNEL);
+
+                       if (page == NULL)
+                               return -ENOMEM;
+
+                       pagedata = page_address(page);
+                       list_add_tail(&page->lru, pagelist);
+                       pageidx = 0;
+               }
+
+               if (copy_from_user(pagedata + pageidx, data, size))
+                       return -EFAULT;
+
+               data += size;
+               pageidx += size;
+       }
+
+       return 0;
+}
+
+/*
+ * Invoke 'fn' on each of the 'nelem' elements ('size' bytes apiece) of an
+ * array spread over the list of pages headed by 'pos'.  Traversal stops at
+ * the first non-zero return from 'fn', which is passed back to the caller;
+ * otherwise 0 is returned.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+                         struct list_head *pos,
+                         int (*fn)(void *data, void *state),
+                         void *state)
+{
+       void *pagedata = NULL;          /* hush, gcc */
+       unsigned pageidx = PAGE_SIZE;   /* forces a page step on the first item */
+       unsigned i;
+
+       BUG_ON(size > PAGE_SIZE);
+
+       for (i = 0; i < nelem; i++) {
+               int ret;
+
+               /* Move to the next page once the current one is used up. */
+               if (pageidx > PAGE_SIZE-size) {
+                       struct page *page;
+
+                       pos = pos->next;
+                       page = list_entry(pos, struct page, lru);
+                       pagedata = page_address(page);
+                       pageidx = 0;
+               }
+
+               ret = (*fn)(pagedata + pageidx, state);
+               if (ret)
+                       return ret;
+
+               pageidx += size;
+       }
+
+       return 0;
+}
+
+/* Cursor shared by successive mmap_mfn_range() calls of one IOCTL_PRIVCMD_MMAP. */
+struct mmap_mfn_state {
+       unsigned long va;               /* next expected virtual address */
+       struct vm_area_struct *vma;     /* VMA being populated */
+       domid_t domain;                 /* domain passed to the remap call */
+};
+
+/*
+ * traverse_pages() callback: map one privcmd_mmap_entry ('data') into the
+ * VMA tracked by the mmap_mfn_state in 'state'.
+ *
+ * Returns -EINVAL if the range would wrap the address space, is not
+ * contiguous with the previous chunk, or extends past the end of the VMA;
+ * a negative error from xen_remap_domain_mfn_range(); or 0 on success,
+ * after advancing st->va past the mapped range.
+ */
+static int mmap_mfn_range(void *data, void *state)
+{
+       struct privcmd_mmap_entry *msg = data;
+       struct mmap_mfn_state *st = state;
+       struct vm_area_struct *vma = st->vma;
+       int rc;
+
+       /* Do not allow range to wrap the address space. */
+       if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+           ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+               return -EINVAL;
+
+       /* Range chunks must be contiguous in va space. */
+       if ((msg->va != st->va) ||
+           ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+               return -EINVAL;
+
+       rc = xen_remap_domain_mfn_range(vma,
+                                       msg->va & PAGE_MASK,
+                                       msg->mfn, msg->npages,
+                                       vma->vm_page_prot,
+                                       st->domain);
+       if (rc < 0)
+               return rc;
+
+       /* The next entry must start where this one ended. */
+       st->va += msg->npages << PAGE_SHIFT;
+
+       return 0;
+}
+
+/* Defined further down; needed here to recognise VMAs created by privcmd. */
+static struct vm_operations_struct privcmd_vm_ops;
+
+/*
+ * IOCTL_PRIVCMD_MMAP: map the machine frame ranges described by a user
+ * supplied array of privcmd_mmap_entry into a privcmd VMA.
+ *
+ * Only the initial domain may do this (-EPERM otherwise).  The first entry
+ * must start exactly at the beginning of a VMA that was created by
+ * privcmd_mmap() and has not been populated before; anything else yields
+ * -EINVAL.  The vm_ops check is made before the single-shot marker is set
+ * so that vm_private_data of an unrelated VMA is never overwritten.
+ *
+ * Returns 0 on success (including an empty request), -EFAULT if user memory
+ * cannot be read, or the error reported while gathering or mapping entries.
+ */
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+       struct privcmd_mmap mmapcmd;
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       int rc;
+       LIST_HEAD(pagelist);
+       struct mmap_mfn_state state;
+
+       if (!xen_initial_domain())
+               return -EPERM;
+
+       if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+               return -EFAULT;
+
+       rc = gather_array(&pagelist,
+                         mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+                         mmapcmd.entry);
+
+       if (rc || list_empty(&pagelist))
+               goto out;
+
+       down_write(&mm->mmap_sem);
+
+       {
+               struct page *page = list_first_entry(&pagelist,
+                                                    struct page, lru);
+               struct privcmd_mmap_entry *msg = page_address(page);
+
+               vma = find_vma(mm, msg->va);
+               rc = -EINVAL;
+
+               if (!vma || (msg->va != vma->vm_start) ||
+                   (vma->vm_ops != &privcmd_vm_ops) ||
+                   !privcmd_enforce_singleshot_mapping(vma))
+                       goto out_up;
+       }
+
+       state.va = vma->vm_start;
+       state.vma = vma;
+       state.domain = mmapcmd.dom;
+
+       rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+                           &pagelist,
+                           mmap_mfn_range, &state);
+
+
+out_up:
+       up_write(&mm->mmap_sem);
+
+out:
+       free_page_list(&pagelist);
+
+       return rc;
+}
+
+/* State shared by the traverse_pages() callbacks of IOCTL_PRIVCMD_MMAPBATCH. */
+struct mmap_batch_state {
+       domid_t domain;                 /* domain passed to the remap call */
+       unsigned long va;               /* next virtual address to map */
+       struct vm_area_struct *vma;     /* VMA being populated */
+       int err;                        /* number of frames that failed to map */
+
+       xen_pfn_t __user *user;         /* write cursor for mmap_return_errors() */
+};
+
+/*
+ * traverse_pages() callback: map the single frame at 'data' at the current
+ * address of the mmap_batch_state in 'state'.  A failed mapping is recorded
+ * by tagging the frame number with 0xf0000000 and bumping the error count;
+ * the traversal itself always continues (returns 0).
+ */
+static int mmap_batch_fn(void *data, void *state)
+{
+       struct mmap_batch_state *batch = state;
+       xen_pfn_t *mfn = data;
+       int rc;
+
+       rc = xen_remap_domain_mfn_range(batch->vma, batch->va & PAGE_MASK,
+                                       *mfn, 1, batch->vma->vm_page_prot,
+                                       batch->domain);
+       if (rc < 0) {
+               *mfn |= 0xf0000000U;
+               batch->err++;
+       }
+
+       batch->va += PAGE_SIZE;
+       return 0;
+}
+
+/*
+ * traverse_pages() callback: write the (possibly error-tagged) frame number
+ * at 'data' to the next slot of the user array and advance the cursor.
+ * Returns the result of put_user().
+ */
+static int mmap_return_errors(void *data, void *state)
+{
+       xen_pfn_t *mfnp = data;
+       struct mmap_batch_state *st = state;
+
+       return put_user(*mfnp, st->user++);
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+/*
+ * IOCTL_PRIVCMD_MMAPBATCH: map an array of individual machine frames, one
+ * page each, into a privcmd VMA.
+ *
+ * Only the initial domain may do this (-EPERM otherwise).  The request must
+ * cover exactly one not-yet-populated VMA created by privcmd_mmap()
+ * (-EINVAL otherwise).  Frames that fail to map do not abort the request;
+ * when there was at least one failure, the frame array (with failed entries
+ * tagged by mmap_batch_fn()) is copied back to user space and the result of
+ * that copy is returned.
+ */
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+       int ret;
+       struct privcmd_mmapbatch m;
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long nr_pages;
+       LIST_HEAD(pagelist);
+       struct mmap_batch_state state;
+
+       if (!xen_initial_domain())
+               return -EPERM;
+
+       if (copy_from_user(&m, udata, sizeof(m)))
+               return -EFAULT;
+
+       /* Reject empty requests and sizes whose byte count would overflow. */
+       nr_pages = m.num;
+       if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+               return -EINVAL;
+
+       ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+                          m.arr);
+
+       if (ret || list_empty(&pagelist))
+               goto out;
+
+       down_write(&mm->mmap_sem);
+
+       /* The request must match one whole, unused privcmd VMA. */
+       vma = find_vma(mm, m.addr);
+       ret = -EINVAL;
+       if (!vma ||
+           vma->vm_ops != &privcmd_vm_ops ||
+           (m.addr != vma->vm_start) ||
+           ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+           !privcmd_enforce_singleshot_mapping(vma)) {
+               up_write(&mm->mmap_sem);
+               goto out;
+       }
+
+       state.domain = m.dom;
+       state.vma = vma;
+       state.va = m.addr;
+       state.err = 0;
+
+       ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+                            &pagelist, mmap_batch_fn, &state);
+
+       up_write(&mm->mmap_sem);
+
+       /* Report per-frame failures by copying the tagged array back. */
+       if (state.err > 0) {
+               state.user = m.arr;
+               ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+                              &pagelist,
+                              mmap_return_errors, &state);
+       }
+
+out:
+       free_page_list(&pagelist);
+
+       return ret;
+}
+
+/*
+ * ioctl entry point of the privcmd device: dispatch 'cmd' to its handler,
+ * treating 'data' as a user-space pointer.  Unknown commands yield -EINVAL.
+ *
+ * Handler results are returned as 'long' without passing through an 'int',
+ * so a hypercall return value is not truncated on 64-bit kernels.
+ */
+static long privcmd_ioctl(struct file *file,
+                         unsigned int cmd, unsigned long data)
+{
+       void __user *udata = (void __user *) data;
+
+       switch (cmd) {
+       case IOCTL_PRIVCMD_HYPERCALL:
+               return privcmd_ioctl_hypercall(udata);
+
+       case IOCTL_PRIVCMD_MMAP:
+               return privcmd_ioctl_mmap(udata);
+
+       case IOCTL_PRIVCMD_MMAPBATCH:
+               return privcmd_ioctl_mmap_batch(udata);
+
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Fault handler for privcmd VMAs: log the faulting VMA and address at debug
+ * level and answer with SIGBUS.
+ */
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+              vma, vma->vm_start, vma->vm_end,
+              vmf->pgoff, vmf->virtual_address);
+
+       return VM_FAULT_SIGBUS;
+}
+
+/* VMA operations installed by privcmd_mmap(); also used to identify privcmd VMAs. */
+static struct vm_operations_struct privcmd_vm_ops = {
+       .fault = privcmd_fault
+};
+
+/*
+ * mmap handler of the privcmd device: flag the VMA, install privcmd_vm_ops
+ * and clear vm_private_data (the single-shot marker).  Returns -ENOSYS for
+ * auto-translated guests, 0 otherwise.
+ */
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       /* Unsupported for auto-translate guests. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return -ENOSYS;
+
+       /* DONTCOPY is essential for Xen because copy_page_range doesn't know
+        * how to recreate these mappings */
+       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+       vma->vm_ops = &privcmd_vm_ops;
+       vma->vm_private_data = NULL;
+
+       return 0;
+}
+
+/*
+ * Atomically mark 'vma' as used by storing a non-NULL value in
+ * vm_private_data.  Returns non-zero only for the first caller, i.e. when
+ * the previous value was still NULL.
+ */
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+       return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+
+/* File operations of the privcmd device; exported (declared in privcmd.h). */
+const struct file_operations xen_privcmd_fops = {
+       .owner = THIS_MODULE,
+       .unlocked_ioctl = privcmd_ioctl,
+       .mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+/* Misc device "xen/privcmd" with a dynamically assigned minor number. */
+static struct miscdevice privcmd_dev = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "xen/privcmd",
+       .fops = &xen_privcmd_fops,
+};
+
+/*
+ * Module init: register the privcmd misc device.  Returns -ENODEV when not
+ * running under Xen, otherwise the result of misc_register().
+ */
+static int __init privcmd_init(void)
+{
+       int rc;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       rc = misc_register(&privcmd_dev);
+       if (rc)
+               printk(KERN_ERR "Could not register Xen privcmd device\n");
+
+       return rc;
+}
+
+/* Module exit: unregister the privcmd misc device. */
+static void __exit privcmd_exit(void)
+{
+       misc_deregister(&privcmd_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h

new file mode 100644 (file)

index 0000000..14facae
--- /dev/null
+++ b/drivers/xen/privcmd.h
@@ -0,0 +1,3 @@
+#include <linux/fs.h>
+
+extern const struct file_operations xen_privcmd_fops;
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile

index 8dca685358b4286f17b5f4d362288d83ee4ab3bb..31e2e9050c7a5b5f511bfe780c69ddb1590e358c 100644 (file)
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -1,4 +1,5 @@
  obj-y  += xenbus.o
+obj-y  += xenbus_dev_frontend.o
  
  xenbus-objs =
  xenbus-objs += xenbus_client.o
@@ -9,4 +10,5 @@ xenbus-objs += xenbus_probe.o
  xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
  xenbus-objs += $(xenbus-be-objs-y)
  
+obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o
  obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h

index c21db7513736a90c399874f3f94524253d25181a..6e42800fa499bc6a303efabd300e763129d9888e 100644 (file)
--- a/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -31,6 +31,8 @@
  #ifndef _XENBUS_COMMS_H
  #define _XENBUS_COMMS_H
  
+#include <linux/fs.h>
+
  int xs_init(void);
  int xb_init_comms(void);
  
@@ -43,4 +45,6 @@ int xs_input_avail(void);
  extern struct xenstore_domain_interface *xen_store_interface;
  extern int xen_store_evtchn;
  
+extern const struct file_operations xen_xenbus_fops;
+
  #endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c

new file mode 100644 (file)

index 0000000..a2092bd
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_backend.c
@@ -0,0 +1,89 @@
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+
+#include <xen/page.h>
+#include <xen/xenbus_dev.h>
+
+#include "xenbus_comms.h"
+
+MODULE_LICENSE("GPL");
+
+/*
+ * Open handler: only callers with CAP_SYS_ADMIN may open the device.
+ * Returns -EPERM otherwise, else the result of nonseekable_open().
+ */
+static int xenbus_backend_open(struct inode *inode, struct file *filp)
+{
+       if (capable(CAP_SYS_ADMIN))
+               return nonseekable_open(inode, filp);
+
+       return -EPERM;
+}
+
+/*
+ * ioctl handler.  The only supported command is
+ * IOCTL_XENBUS_BACKEND_EVTCHN, which returns xen_store_evtchn when it
+ * is positive and -ENODEV otherwise.  Any other command yields -ENOTTY;
+ * callers without CAP_SYS_ADMIN get -EPERM.
+ */
+static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned long data)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (cmd != IOCTL_XENBUS_BACKEND_EVTCHN)
+               return -ENOTTY;
+
+       if (xen_store_evtchn > 0)
+               return xen_store_evtchn;
+
+       return -ENODEV;
+}
+
+/*
+ * mmap handler: map the page holding xen_store_interface into the
+ * caller's address space.
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM).  The mapping must start at page
+ * offset 0 and span at most one page (-EINVAL).  A remap_pfn_range()
+ * failure is reported as -EAGAIN.
+ */
+static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       size_t span;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       span = vma->vm_end - vma->vm_start;
+       if (vma->vm_pgoff != 0 || span > PAGE_SIZE)
+               return -EINVAL;
+
+       if (remap_pfn_range(vma, vma->vm_start,
+                           virt_to_pfn(xen_store_interface),
+                           span, vma->vm_page_prot) != 0)
+               return -EAGAIN;
+
+       return 0;
+}
+
+/* File operations for the xenbus backend device: open, mmap and ioctl only. */
+const struct file_operations xenbus_backend_fops = {
+       .open = xenbus_backend_open,
+       .mmap = xenbus_backend_mmap,
+       .unlocked_ioctl = xenbus_backend_ioctl,
+};
+
+/* Misc device named "xen/xenbus_backend"; the minor number is assigned dynamically. */
+static struct miscdevice xenbus_backend_dev = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "xen/xenbus_backend",
+       .fops = &xenbus_backend_fops,
+};
+
+/*
+ * Module entry point: register the xenbus backend misc device.
+ *
+ * Returns -ENODEV unless running in the initial Xen domain, otherwise
+ * the result of misc_register() (0 on success).
+ */
+static int __init xenbus_backend_init(void)
+{
+       int rc;
+
+       if (!xen_initial_domain())
+               return -ENODEV;
+
+       rc = misc_register(&xenbus_backend_dev);
+       if (rc == 0)
+               return 0;
+
+       printk(KERN_ERR "Could not register xenbus backend device\n");
+       return rc;
+}
+
+/* Module exit point: undo the misc_register() done in xenbus_backend_init(). */
+static void __exit xenbus_backend_exit(void)
+{
+       misc_deregister(&xenbus_backend_dev);
+}
+
+module_init(xenbus_backend_init);
+module_exit(xenbus_backend_exit);
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c

new file mode 100644 (file)

index 0000000..aec0142
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -0,0 +1,625 @@
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include "xenbus_comms.h"
+
+#include <xen/xenbus.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
+MODULE_LICENSE("GPL");
+
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still waiting a reply.
+ */
+struct xenbus_transaction_holder {
+       /* Link in xenbus_file_priv.transactions. */
+       struct list_head list;
+       /* Transaction id, parsed from the XS_TRANSACTION_START reply. */
+       struct xenbus_transaction handle;
+};
+
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+       /* Link in a reply queue (staging list or xenbus_file_priv.read_buffers). */
+       struct list_head list;
+       /* Number of bytes of msg[] already copied out to the reader. */
+       unsigned int cons;
+       /* Total number of bytes stored in msg[]. */
+       unsigned int len;
+       /* Payload; allocated together with this header by queue_reply(). */
+       char msg[];
+};
+
+/*
+ * Per-open-file state.  Allocated in xenbus_file_open(), stored in
+ * filp->private_data and freed in xenbus_file_release().
+ */
+struct xenbus_file_priv {
+       /*
+        * msgbuffer_mutex is held while partial requests are built up
+        * and complete requests are acted on.  It therefore protects
+        * the "transactions" and "watches" lists, and the partial
+        * request length and buffer.
+        *
+        * reply_mutex protects the reply being built up to return to
+        * usermode.  It nests inside msgbuffer_mutex but may be held
+        * alone during a watch callback.
+        */
+       struct mutex msgbuffer_mutex;
+
+       /* In-progress transactions */
+       struct list_head transactions;
+
+       /* Active watches. */
+       struct list_head watches;
+
+       /*
+        * Partial request.  "len" counts the bytes accumulated so far
+        * in u.buffer; u.msg overlays the message header at its start.
+        */
+       unsigned int len;
+       union {
+               struct xsd_sockmsg msg;
+               char buffer[PAGE_SIZE];
+       } u;
+
+       /* Response queue. */
+       struct mutex reply_mutex;
+       struct list_head read_buffers;
+       wait_queue_head_t read_waitq;
+
+};
+
+/*
+ * Read out any raw xenbus messages queued up.
+ *
+ * Waits (interruptibly) until at least one reply buffer is queued,
+ * unless the file has O_NONBLOCK set, in which case -EAGAIN is
+ * returned.  Copies as many queued bytes as fit in the user buffer and
+ * frees each read_buffer once it has been fully consumed.
+ *
+ * Returns the number of bytes copied, 0 for a zero-length read, or a
+ * negative errno: -EFAULT if nothing at all could be copied to user
+ * space, or the error from an interrupted wait.
+ */
+static ssize_t xenbus_file_read(struct file *filp,
+                              char __user *ubuf,
+                              size_t len, loff_t *ppos)
+{
+       struct xenbus_file_priv *u = filp->private_data;
+       struct read_buffer *rb;
+       /*
+        * Signed and as wide as the return type: an unsigned int would
+        * turn -EFAULT into a large positive byte count once widened
+        * to ssize_t on 64-bit.
+        */
+       ssize_t i;
+       int ret;
+
+       /*
+        * A zero-length read can never make progress in the copy loop
+        * below and would bounce on "goto again" forever with
+        * reply_mutex held as soon as data is queued.
+        */
+       if (len == 0)
+               return 0;
+
+       mutex_lock(&u->reply_mutex);
+again:
+       while (list_empty(&u->read_buffers)) {
+               mutex_unlock(&u->reply_mutex);
+               if (filp->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               ret = wait_event_interruptible(u->read_waitq,
+                                              !list_empty(&u->read_buffers));
+               if (ret)
+                       return ret;
+               mutex_lock(&u->reply_mutex);
+       }
+
+       rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+       i = 0;
+       while (i < len) {
+               /* Do the arithmetic in size_t so that len is not truncated. */
+               size_t sz = min_t(size_t, len - i, rb->len - rb->cons);
+
+               /* copy_to_user() returns the number of bytes NOT copied. */
+               ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+
+               i += sz - ret;
+               rb->cons += sz - ret;
+
+               if (ret != 0) {
+                       /* Report a short read unless nothing was copied. */
+                       if (i == 0)
+                               i = -EFAULT;
+                       goto out;
+               }
+
+               /* Clear out buffer if it has been consumed */
+               if (rb->cons == rb->len) {
+                       list_del(&rb->list);
+                       kfree(rb);
+                       if (list_empty(&u->read_buffers))
+                               break;
+                       rb = list_entry(u->read_buffers.next,
+                                       struct read_buffer, list);
+               }
+       }
+       if (i == 0)
+               goto again;
+
+out:
+       mutex_unlock(&u->reply_mutex);
+       return i;
+}
+
+/*
+ * Add a buffer to the queue.  Caller must hold the appropriate lock
+ * if the queue is not local.  (Commonly the caller will build up
+ * multiple queued buffers on a temporary local list, and then add it
+ * to the appropriate list under lock once all the buffers have been
+ * successfully allocated.)
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+       struct read_buffer *buf;
+
+       /* An empty payload queues nothing and counts as success. */
+       if (!len)
+               return 0;
+
+       /* Header and payload live in a single allocation. */
+       buf = kmalloc(sizeof(*buf) + len, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       memcpy(buf->msg, data, len);
+       buf->len = len;
+       buf->cons = 0;
+
+       list_add_tail(&buf->list, queue);
+       return 0;
+}
+
+/*
+ * Free all the read_buffers on a list.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+       struct read_buffer *rb, *next;
+
+       /* Unlink and free every entry; the list head ends up empty. */
+       list_for_each_entry_safe(rb, next, list, list) {
+               list_del(&rb->list);
+               kfree(rb);
+       }
+}
+
+/* Ties a registered xenbus watch to the open file that requested it. */
+struct watch_adapter {
+       /* Link in xenbus_file_priv.watches. */
+       struct list_head list;
+       /* The watch itself; watch.node is a private copy of the path. */
+       struct xenbus_watch watch;
+       /* State of the file that registered the watch. */
+       struct xenbus_file_priv *dev_data;
+       /* Private copy of the token supplied with the watch request. */
+       char *token;
+};
+
+/*
+ * Free a watch adapter together with its path and token copies.
+ * Also used by alloc_watch_adapter() on a partially constructed
+ * adapter, whose unset string pointers are still NULL from kzalloc().
+ */
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+       kfree(watch->watch.node);
+       kfree(watch->token);
+       kfree(watch);
+}
+
+/*
+ * Allocate a zeroed watch adapter holding private copies of @path and
+ * @token.  Returns NULL if any of the allocations fails; nothing is
+ * leaked in that case.
+ */
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+                                                const char *token)
+{
+       struct watch_adapter *adap;
+
+       adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+       if (!adap)
+               return NULL;
+
+       /* Only attempt the token copy once the path copy succeeded. */
+       adap->watch.node = kstrdup(path, GFP_KERNEL);
+       if (adap->watch.node)
+               adap->token = kstrdup(token, GFP_KERNEL);
+
+       if (!adap->watch.node || !adap->token) {
+               free_watch_adapter(adap);
+               return NULL;
+       }
+
+       return adap;
+}
+
+/*
+ * Watch callback: package a fired watch as an XS_WATCH_EVENT message
+ * and queue it on the owning file's reply queue.
+ *
+ * The message is a header followed by the path, the token stored in
+ * the adapter and, when len > 2, the extra data starting at vec[2].
+ * All parts are built on a local staging list first, so either the
+ * whole event is queued or nothing is.  Allocation failures are not
+ * reported; the event is simply dropped.
+ */
+static void watch_fired(struct xenbus_watch *watch,
+                       const char **vec,
+                       unsigned int len)
+{
+       struct watch_adapter *adap;
+       struct xsd_sockmsg hdr;
+       const char *path, *token;
+       int path_len, tok_len, body_len, data_len = 0;
+       int ret;
+       LIST_HEAD(staging_q);
+
+       adap = container_of(watch, struct watch_adapter, watch);
+
+       path = vec[XS_WATCH_PATH];
+       token = adap->token;
+
+       /* Lengths include the terminating NUL of each string. */
+       path_len = strlen(path) + 1;
+       tok_len = strlen(token) + 1;
+       /*
+        * NOTE(review): this reads vec[len]; presumably the caller
+        * provides a valid pointer at that index -- confirm against
+        * the code that builds vec.
+        */
+       if (len > 2)
+               data_len = vec[len] - vec[2] + 1;
+       body_len = path_len + tok_len + data_len;
+
+       /*
+        * The whole header is copied to a buffer user space can read,
+        * so clear it first: fields not set below (e.g. tx_id) must
+        * not carry stale kernel stack contents.
+        */
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.type = XS_WATCH_EVENT;
+       hdr.len = body_len;
+
+       mutex_lock(&adap->dev_data->reply_mutex);
+
+       ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+       if (!ret)
+               ret = queue_reply(&staging_q, path, path_len);
+       if (!ret)
+               ret = queue_reply(&staging_q, token, tok_len);
+       if (!ret && len > 2)
+               ret = queue_reply(&staging_q, vec[2], data_len);
+
+       if (!ret) {
+               /* success: pass reply list onto watcher */
+               list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+               wake_up(&adap->dev_data->read_waitq);
+       } else
+               queue_cleanup(&staging_q);
+
+       mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+/*
+ * Forward the request buffered in u->u.msg to xenstore and queue the
+ * reply (header followed by payload) for the reader.
+ *
+ * Transactions opened through this file are tracked on
+ * u->transactions: XS_TRANSACTION_START adds an entry keyed by the id
+ * parsed from the reply, XS_TRANSACTION_END removes the matching one.
+ *
+ * Called from xenbus_file_write() with u->msgbuffer_mutex held.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -ESRCH if an
+ * XS_TRANSACTION_END names a transaction this file did not start, or
+ * the error from xenbus_dev_request_and_reply().
+ */
+static int xenbus_write_transaction(unsigned msg_type,
+                                   struct xenbus_file_priv *u)
+{
+       int rc;
+       void *reply;
+       struct xenbus_transaction_holder *trans = NULL;
+       LIST_HEAD(staging_q);
+
+       if (msg_type == XS_TRANSACTION_START) {
+               trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+               if (!trans) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+       } else if (msg_type == XS_TRANSACTION_END) {
+               /*
+                * tx_id comes straight from user space, so an unknown
+                * id has to be rejected with an error rather than
+                * BUG_ON().  Look it up before the request is sent so
+                * nothing is forwarded for a transaction this file
+                * does not own.
+                */
+               list_for_each_entry(trans, &u->transactions, list)
+                       if (trans->handle.id == u->u.msg.tx_id)
+                               break;
+               if (&trans->list == &u->transactions) {
+                       rc = -ESRCH;
+                       goto out;
+               }
+       }
+
+       reply = xenbus_dev_request_and_reply(&u->u.msg);
+       if (IS_ERR(reply)) {
+               /*
+                * Only a START request owns a freshly allocated holder;
+                * for END, trans points at a live list entry.
+                */
+               if (msg_type == XS_TRANSACTION_START)
+                       kfree(trans);
+               rc = PTR_ERR(reply);
+               goto out;
+       }
+
+       if (msg_type == XS_TRANSACTION_START) {
+               /*
+                * NOTE(review): the reply is parsed as an id without
+                * checking whether xenstore answered with an error --
+                * confirm how xenbus_dev_request_and_reply() reports
+                * that case.
+                */
+               trans->handle.id = simple_strtoul(reply, NULL, 0);
+
+               list_add(&trans->list, &u->transactions);
+       } else if (msg_type == XS_TRANSACTION_END) {
+               /* trans was validated before the request was sent. */
+               list_del(&trans->list);
+
+               kfree(trans);
+       }
+
+       /* Stage header and payload so the reader sees both or neither. */
+       mutex_lock(&u->reply_mutex);
+       rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
+       if (!rc)
+               rc = queue_reply(&staging_q, reply, u->u.msg.len);
+       if (!rc) {
+               list_splice_tail(&staging_q, &u->read_buffers);
+               wake_up(&u->read_waitq);
+       } else {
+               queue_cleanup(&staging_q);
+       }
+       mutex_unlock(&u->reply_mutex);
+
+       kfree(reply);
+
+out:
+       return rc;
+}
+
+/*
+ * Handle an XS_WATCH or XS_UNWATCH request buffered in u->u.buffer.
+ *
+ * The payload must consist of two NUL-terminated strings, the path
+ * followed by the token.  XS_WATCH registers a new watch and records
+ * it on u->watches; any other type removes the first recorded watch
+ * with the same path and token.  In both cases a synthesized "OK"
+ * reply is queued for the reader.
+ *
+ * Called from xenbus_file_write() with u->msgbuffer_mutex held.
+ *
+ * Returns 0 on success, -EILSEQ if either string is not terminated
+ * inside the message, -ENOMEM on allocation failure, or the error
+ * from register_xenbus_watch().
+ */
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+       struct watch_adapter *watch, *tmp_watch;
+       char *path, *token;
+       int rc;
+
+       path = u->u.buffer + sizeof(u->u.msg);
+       token = memchr(path, 0, u->u.msg.len);
+       if (token == NULL) {
+               rc = -EILSEQ;
+               goto out;
+       }
+       token++;
+
+       /*
+        * The token has to be terminated inside the message as well;
+        * otherwise kstrdup()/strcmp() below would run past the
+        * payload into stale buffer contents or beyond the buffer.
+        */
+       if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) {
+               rc = -EILSEQ;
+               goto out;
+       }
+
+       if (msg_type == XS_WATCH) {
+               watch = alloc_watch_adapter(path, token);
+               if (watch == NULL) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+
+               watch->watch.callback = watch_fired;
+               watch->dev_data = u;
+
+               rc = register_xenbus_watch(&watch->watch);
+               if (rc) {
+                       free_watch_adapter(watch);
+                       goto out;
+               }
+               list_add(&watch->list, &u->watches);
+       } else {
+               list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+                       if (!strcmp(watch->token, token) &&
+                           !strcmp(watch->watch.node, path)) {
+                               unregister_xenbus_watch(&watch->watch);
+                               list_del(&watch->list);
+                               free_watch_adapter(watch);
+                               break;
+                       }
+               }
+       }
+
+       /* Success.  Synthesize a reply to say all is OK. */
+       {
+               struct {
+                       struct xsd_sockmsg hdr;
+                       char body[3];
+               } __packed reply = {
+                       {
+                               .type = msg_type,
+                               .len = sizeof(reply.body)
+                       },
+                       "OK"
+               };
+
+               mutex_lock(&u->reply_mutex);
+               rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+               wake_up(&u->read_waitq);
+               mutex_unlock(&u->reply_mutex);
+       }
+
+out:
+       return rc;
+}
+
+/*
+ * Accept raw xenbus request bytes from user space.
+ *
+ * Writes are accumulated in u->u.buffer until a complete message
+ * (header plus msg.len bytes of payload) is present.  The message is
+ * then handed to xenbus_write_watch() or xenbus_write_transaction()
+ * and the buffer is reset.
+ *
+ * Returns the number of bytes accepted, or a negative errno: -EINVAL
+ * if the write does not fit in the buffer, -EFAULT if the user buffer
+ * cannot be read, -E2BIG if the header announces a message larger
+ * than the buffer, or the error from the message handler.
+ */
+static ssize_t xenbus_file_write(struct file *filp,
+                               const char __user *ubuf,
+                               size_t len, loff_t *ppos)
+{
+       struct xenbus_file_priv *u = filp->private_data;
+       uint32_t msg_type;
+       int rc = 0;
+       int ret;
+
+       /*
+        * We're expecting usermode to be writing properly formed
+        * xenbus messages.  If they write an incomplete message we
+        * buffer it up.  Once it is complete, we act on it.
+        */
+
+       /*
+        * Make sure concurrent writers can't stomp all over each
+        * other's messages and make a mess of our partial message
+        * buffer.  We don't make any attempt to stop multiple
+        * writers from making a mess of each other's incomplete
+        * messages; we're just trying to guarantee our own internal
+        * consistency and make sure that single writes are handled
+        * atomically.
+        */
+       mutex_lock(&u->msgbuffer_mutex);
+
+       /* Get this out of the way early to avoid confusion */
+       if (len == 0)
+               goto out;
+
+       /*
+        * Can't write a xenbus message larger than we can buffer.
+        * u->len never exceeds the buffer size, so this subtraction
+        * cannot wrap, whereas "len + u->len" could.
+        */
+       if (len > sizeof(u->u.buffer) - u->len) {
+               /* On error, dump existing buffer */
+               u->len = 0;
+               rc = -EINVAL;
+               goto out;
+       }
+
+       /* Any uncopied byte fails the whole write; u->len is left alone. */
+       ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
+       if (ret != 0) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       /* len was bounded by the page-sized buffer above, so it fits in an int. */
+       rc = len;
+
+       u->len += len;
+
+       /* Return if we haven't got a full message yet */
+       if (u->len < sizeof(u->u.msg))
+               goto out;       /* not even the header yet */
+
+       /*
+        * If we're expecting a message that's larger than we can
+        * possibly send, dump what we have and return an error.
+        * Compare against the room left after the header so that a
+        * huge msg.len cannot wrap the sum where size_t is 32 bits.
+        */
+       if (u->u.msg.len > sizeof(u->u.buffer) - sizeof(u->u.msg)) {
+               rc = -E2BIG;
+               u->len = 0;
+               goto out;
+       }
+
+       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+               goto out;       /* incomplete data portion */
+
+       /*
+        * OK, now we have a complete message.  Do something with it.
+        */
+
+       msg_type = u->u.msg.type;
+
+       switch (msg_type) {
+       case XS_WATCH:
+       case XS_UNWATCH:
+               /* (Un)Ask for some path to be watched for changes */
+               ret = xenbus_write_watch(msg_type, u);
+               break;
+
+       default:
+               /* Send out a transaction */
+               ret = xenbus_write_transaction(msg_type, u);
+               break;
+       }
+       if (ret != 0)
+               rc = ret;
+
+       /* Buffered message consumed */
+       u->len = 0;
+
+ out:
+       mutex_unlock(&u->msgbuffer_mutex);
+       return rc;
+}
+
+/*
+ * Open handler: allocate and initialise the per-file state and store
+ * it in filp->private_data.
+ *
+ * Returns -ENOENT while xen_store_evtchn is 0, -ENOMEM if the state
+ * cannot be allocated, 0 otherwise.
+ */
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+       struct xenbus_file_priv *priv;
+
+       if (!xen_store_evtchn)
+               return -ENOENT;
+
+       nonseekable_open(inode, filp);
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       mutex_init(&priv->msgbuffer_mutex);
+       mutex_init(&priv->reply_mutex);
+       init_waitqueue_head(&priv->read_waitq);
+
+       INIT_LIST_HEAD(&priv->read_buffers);
+       INIT_LIST_HEAD(&priv->watches);
+       INIT_LIST_HEAD(&priv->transactions);
+
+       filp->private_data = priv;
+
+       return 0;
+}
+
+/*
+ * Release handler: tear down everything the file still owns.
+ *
+ * Every transaction still listed is ended via xenbus_transaction_end()
+ * with its second argument set to 1, every watch is unregistered, any
+ * unread replies are discarded, and the per-file state is freed.
+ * Always returns 0.
+ */
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+       struct xenbus_file_priv *priv = filp->private_data;
+       struct xenbus_transaction_holder *txn, *next_txn;
+       struct watch_adapter *adap, *next_adap;
+
+       /*
+        * No need for locking here because there are no other users,
+        * by definition.
+        */
+
+       list_for_each_entry_safe(txn, next_txn, &priv->transactions, list) {
+               xenbus_transaction_end(txn->handle, 1);
+               list_del(&txn->list);
+               kfree(txn);
+       }
+
+       list_for_each_entry_safe(adap, next_adap, &priv->watches, list) {
+               unregister_xenbus_watch(&adap->watch);
+               list_del(&adap->list);
+               free_watch_adapter(adap);
+       }
+
+       /* Unlink and free whatever replies were never read. */
+       queue_cleanup(&priv->read_buffers);
+
+       kfree(priv);
+
+       return 0;
+}
+
+/*
+ * Poll handler: the file is readable (POLLIN | POLLRDNORM) whenever
+ * at least one reply buffer is queued; no other events are reported.
+ */
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+       struct xenbus_file_priv *priv = file->private_data;
+       unsigned int mask = 0;
+
+       poll_wait(file, &priv->read_waitq, wait);
+
+       if (!list_empty(&priv->read_buffers))
+               mask = POLLIN | POLLRDNORM;
+
+       return mask;
+}
+
+/*
+ * File operations giving user space access to xenbus.  Exported
+ * (GPL-only) because the xenfs "xenbus" file is backed by this same
+ * table.
+ */
+const struct file_operations xen_xenbus_fops = {
+       .read = xenbus_file_read,
+       .write = xenbus_file_write,
+       .open = xenbus_file_open,
+       .release = xenbus_file_release,
+       .poll = xenbus_file_poll,
+       .llseek = no_llseek,
+};
+EXPORT_SYMBOL_GPL(xen_xenbus_fops);
+
+/* Misc device named "xen/xenbus"; the minor number is assigned dynamically. */
+static struct miscdevice xenbus_dev = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "xen/xenbus",
+       .fops = &xen_xenbus_fops,
+};
+
+/*
+ * Module entry point: register the xenbus frontend misc device.
+ *
+ * Returns -ENODEV when not running as a Xen domain, otherwise the
+ * result of misc_register() (0 on success).
+ */
+static int __init xenbus_init(void)
+{
+       int rc;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       rc = misc_register(&xenbus_dev);
+       if (rc == 0)
+               return 0;
+
+       printk(KERN_ERR "Could not register xenbus frontend device\n");
+       return rc;
+}
+
+/* Module exit point: undo the misc_register() done in xenbus_init(). */
+static void __exit xenbus_exit(void)
+{
+       misc_deregister(&xenbus_dev);
+}
+
+module_init(xenbus_init);
+module_exit(xenbus_exit);
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile

index 4fde9440fe1f453aad9a4d6e4a98dceecbf6dbe4..b019865fcc56b779098c792bc7653cdc2537f2bc 100644 (file)
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,4 +1,4 @@
  obj-$(CONFIG_XENFS) += xenfs.o
  
-xenfs-y                          = super.o xenbus.o privcmd.o
+xenfs-y                          = super.o
  xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c

deleted file mode 100644 (file)

index dbd3b16..0000000
--- a/drivers/xen/xenfs/privcmd.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/******************************************************************************
- * privcmd.c
- *
- * Interface to privileged domain-0 commands.
- *
- * Copyright (c) 2002-2004, K A Fraser, B Dragovic
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/uaccess.h>
-#include <linux/swap.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/seq_file.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/tlb.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/xen.h>
-#include <xen/privcmd.h>
-#include <xen/interface/xen.h>
-#include <xen/features.h>
-#include <xen/page.h>
-#include <xen/xen-ops.h>
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
-#endif
-
-static long privcmd_ioctl_hypercall(void __user *udata)
-{
-       struct privcmd_hypercall hypercall;
-       long ret;
-
-       if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
-               return -EFAULT;
-
-       ret = privcmd_call(hypercall.op,
-                          hypercall.arg[0], hypercall.arg[1],
-                          hypercall.arg[2], hypercall.arg[3],
-                          hypercall.arg[4]);
-
-       return ret;
-}
-
-static void free_page_list(struct list_head *pages)
-{
-       struct page *p, *n;
-
-       list_for_each_entry_safe(p, n, pages, lru)
-               __free_page(p);
-
-       INIT_LIST_HEAD(pages);
-}
-
-/*
- * Given an array of items in userspace, return a list of pages
- * containing the data.  If copying fails, either because of memory
- * allocation failure or a problem reading user memory, return an
- * error code; its up to the caller to dispose of any partial list.
- */
-static int gather_array(struct list_head *pagelist,
-                       unsigned nelem, size_t size,
-                       void __user *data)
-{
-       unsigned pageidx;
-       void *pagedata;
-       int ret;
-
-       if (size > PAGE_SIZE)
-               return 0;
-
-       pageidx = PAGE_SIZE;
-       pagedata = NULL;        /* quiet, gcc */
-       while (nelem--) {
-               if (pageidx > PAGE_SIZE-size) {
-                       struct page *page = alloc_page(GFP_KERNEL);
-
-                       ret = -ENOMEM;
-                       if (page == NULL)
-                               goto fail;
-
-                       pagedata = page_address(page);
-
-                       list_add_tail(&page->lru, pagelist);
-                       pageidx = 0;
-               }
-
-               ret = -EFAULT;
-               if (copy_from_user(pagedata + pageidx, data, size))
-                       goto fail;
-
-               data += size;
-               pageidx += size;
-       }
-
-       ret = 0;
-
-fail:
-       return ret;
-}
-
-/*
- * Call function "fn" on each element of the array fragmented
- * over a list of pages.
- */
-static int traverse_pages(unsigned nelem, size_t size,
-                         struct list_head *pos,
-                         int (*fn)(void *data, void *state),
-                         void *state)
-{
-       void *pagedata;
-       unsigned pageidx;
-       int ret = 0;
-
-       BUG_ON(size > PAGE_SIZE);
-
-       pageidx = PAGE_SIZE;
-       pagedata = NULL;        /* hush, gcc */
-
-       while (nelem--) {
-               if (pageidx > PAGE_SIZE-size) {
-                       struct page *page;
-                       pos = pos->next;
-                       page = list_entry(pos, struct page, lru);
-                       pagedata = page_address(page);
-                       pageidx = 0;
-               }
-
-               ret = (*fn)(pagedata + pageidx, state);
-               if (ret)
-                       break;
-               pageidx += size;
-       }
-
-       return ret;
-}
-
-struct mmap_mfn_state {
-       unsigned long va;
-       struct vm_area_struct *vma;
-       domid_t domain;
-};
-
-static int mmap_mfn_range(void *data, void *state)
-{
-       struct privcmd_mmap_entry *msg = data;
-       struct mmap_mfn_state *st = state;
-       struct vm_area_struct *vma = st->vma;
-       int rc;
-
-       /* Do not allow range to wrap the address space. */
-       if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
-           ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
-               return -EINVAL;
-
-       /* Range chunks must be contiguous in va space. */
-       if ((msg->va != st->va) ||
-           ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
-               return -EINVAL;
-
-       rc = xen_remap_domain_mfn_range(vma,
-                                       msg->va & PAGE_MASK,
-                                       msg->mfn, msg->npages,
-                                       vma->vm_page_prot,
-                                       st->domain);
-       if (rc < 0)
-               return rc;
-
-       st->va += msg->npages << PAGE_SHIFT;
-
-       return 0;
-}
-
-static long privcmd_ioctl_mmap(void __user *udata)
-{
-       struct privcmd_mmap mmapcmd;
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       int rc;
-       LIST_HEAD(pagelist);
-       struct mmap_mfn_state state;
-
-       if (!xen_initial_domain())
-               return -EPERM;
-
-       if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
-               return -EFAULT;
-
-       rc = gather_array(&pagelist,
-                         mmapcmd.num, sizeof(struct privcmd_mmap_entry),
-                         mmapcmd.entry);
-
-       if (rc || list_empty(&pagelist))
-               goto out;
-
-       down_write(&mm->mmap_sem);
-
-       {
-               struct page *page = list_first_entry(&pagelist,
-                                                    struct page, lru);
-               struct privcmd_mmap_entry *msg = page_address(page);
-
-               vma = find_vma(mm, msg->va);
-               rc = -EINVAL;
-
-               if (!vma || (msg->va != vma->vm_start) ||
-                   !privcmd_enforce_singleshot_mapping(vma))
-                       goto out_up;
-       }
-
-       state.va = vma->vm_start;
-       state.vma = vma;
-       state.domain = mmapcmd.dom;
-
-       rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
-                           &pagelist,
-                           mmap_mfn_range, &state);
-
-
-out_up:
-       up_write(&mm->mmap_sem);
-
-out:
-       free_page_list(&pagelist);
-
-       return rc;
-}
-
-struct mmap_batch_state {
-       domid_t domain;
-       unsigned long va;
-       struct vm_area_struct *vma;
-       int err;
-
-       xen_pfn_t __user *user;
-};
-
-static int mmap_batch_fn(void *data, void *state)
-{
-       xen_pfn_t *mfnp = data;
-       struct mmap_batch_state *st = state;
-
-       if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
-                                      st->vma->vm_page_prot, st->domain) < 0) {
-               *mfnp |= 0xf0000000U;
-               st->err++;
-       }
-       st->va += PAGE_SIZE;
-
-       return 0;
-}
-
-static int mmap_return_errors(void *data, void *state)
-{
-       xen_pfn_t *mfnp = data;
-       struct mmap_batch_state *st = state;
-
-       return put_user(*mfnp, st->user++);
-}
-
-static struct vm_operations_struct privcmd_vm_ops;
-
-static long privcmd_ioctl_mmap_batch(void __user *udata)
-{
-       int ret;
-       struct privcmd_mmapbatch m;
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long nr_pages;
-       LIST_HEAD(pagelist);
-       struct mmap_batch_state state;
-
-       if (!xen_initial_domain())
-               return -EPERM;
-
-       if (copy_from_user(&m, udata, sizeof(m)))
-               return -EFAULT;
-
-       nr_pages = m.num;
-       if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
-               return -EINVAL;
-
-       ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
-                          m.arr);
-
-       if (ret || list_empty(&pagelist))
-               goto out;
-
-       down_write(&mm->mmap_sem);
-
-       vma = find_vma(mm, m.addr);
-       ret = -EINVAL;
-       if (!vma ||
-           vma->vm_ops != &privcmd_vm_ops ||
-           (m.addr != vma->vm_start) ||
-           ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
-           !privcmd_enforce_singleshot_mapping(vma)) {
-               up_write(&mm->mmap_sem);
-               goto out;
-       }
-
-       state.domain = m.dom;
-       state.vma = vma;
-       state.va = m.addr;
-       state.err = 0;
-
-       ret = traverse_pages(m.num, sizeof(xen_pfn_t),
-                            &pagelist, mmap_batch_fn, &state);
-
-       up_write(&mm->mmap_sem);
-
-       if (state.err > 0) {
-               state.user = m.arr;
-               ret = traverse_pages(m.num, sizeof(xen_pfn_t),
-                              &pagelist,
-                              mmap_return_errors, &state);
-       }
-
-out:
-       free_page_list(&pagelist);
-
-       return ret;
-}
-
-static long privcmd_ioctl(struct file *file,
-                         unsigned int cmd, unsigned long data)
-{
-       int ret = -ENOSYS;
-       void __user *udata = (void __user *) data;
-
-       switch (cmd) {
-       case IOCTL_PRIVCMD_HYPERCALL:
-               ret = privcmd_ioctl_hypercall(udata);
-               break;
-
-       case IOCTL_PRIVCMD_MMAP:
-               ret = privcmd_ioctl_mmap(udata);
-               break;
-
-       case IOCTL_PRIVCMD_MMAPBATCH:
-               ret = privcmd_ioctl_mmap_batch(udata);
-               break;
-
-       default:
-               ret = -EINVAL;
-               break;
-       }
-
-       return ret;
-}
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
-              vma, vma->vm_start, vma->vm_end,
-              vmf->pgoff, vmf->virtual_address);
-
-       return VM_FAULT_SIGBUS;
-}
-
-static struct vm_operations_struct privcmd_vm_ops = {
-       .fault = privcmd_fault
-};
-
-static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       /* Unsupported for auto-translate guests. */
-       if (xen_feature(XENFEAT_auto_translated_physmap))
-               return -ENOSYS;
-
-       /* DONTCOPY is essential for Xen because copy_page_range doesn't know
-        * how to recreate these mappings */
-       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
-       vma->vm_ops = &privcmd_vm_ops;
-       vma->vm_private_data = NULL;
-
-       return 0;
-}
-
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
-{
-       return (xchg(&vma->vm_private_data, (void *)1) == NULL);
-}
-#endif
-
-const struct file_operations privcmd_file_ops = {
-       .unlocked_ioctl = privcmd_ioctl,
-       .mmap = privcmd_mmap,
-};
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c

index 1aa3897198462112a3bc1db479986653cf0a716f..a84b53c01436334b68688dca58e75bd54462b023 100644 (file)
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -16,6 +16,8 @@
  #include <xen/xen.h>
  
  #include "xenfs.h"
+#include "../privcmd.h"
+#include "../xenbus/xenbus_comms.h"
  
  #include <asm/xen/hypervisor.h>
  
@@ -82,9 +84,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
  {
         static struct tree_descr xenfs_files[] = {
                 [1] = {},
-               { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+               { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
                 { "capabilities", &capabilities_file_ops, S_IRUGO },
-               { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
+               { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
                 {""},
         };
         int rc;
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c

deleted file mode 100644 (file)

index bbd000f..0000000
--- a/drivers/xen/xenfs/xenbus.c
+++ /dev/null
@@ -1,593 +0,0 @@
-/*
- * Driver giving user-space access to the kernel's xenbus connection
- * to xenstore.
- *
- * Copyright (c) 2005, Christian Limpach
- * Copyright (c) 2005, Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Changes:
- * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
- *                              and /proc/xen compatibility mount point.
- *                              Turned xenfs into a loadable module.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/uio.h>
-#include <linux/notifier.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-
-#include "xenfs.h"
-#include "../xenbus/xenbus_comms.h"
-
-#include <xen/xenbus.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * An element of a list of outstanding transactions, for which we're
- * still waiting a reply.
- */
-struct xenbus_transaction_holder {
-       struct list_head list;
-       struct xenbus_transaction handle;
-};
-
-/*
- * A buffer of data on the queue.
- */
-struct read_buffer {
-       struct list_head list;
-       unsigned int cons;
-       unsigned int len;
-       char msg[];
-};
-
-struct xenbus_file_priv {
-       /*
-        * msgbuffer_mutex is held while partial requests are built up
-        * and complete requests are acted on.  It therefore protects
-        * the "transactions" and "watches" lists, and the partial
-        * request length and buffer.
-        *
-        * reply_mutex protects the reply being built up to return to
-        * usermode.  It nests inside msgbuffer_mutex but may be held
-        * alone during a watch callback.
-        */
-       struct mutex msgbuffer_mutex;
-
-       /* In-progress transactions */
-       struct list_head transactions;
-
-       /* Active watches. */
-       struct list_head watches;
-
-       /* Partial request. */
-       unsigned int len;
-       union {
-               struct xsd_sockmsg msg;
-               char buffer[PAGE_SIZE];
-       } u;
-
-       /* Response queue. */
-       struct mutex reply_mutex;
-       struct list_head read_buffers;
-       wait_queue_head_t read_waitq;
-
-};
-
-/* Read out any raw xenbus messages queued up. */
-static ssize_t xenbus_file_read(struct file *filp,
-                              char __user *ubuf,
-                              size_t len, loff_t *ppos)
-{
-       struct xenbus_file_priv *u = filp->private_data;
-       struct read_buffer *rb;
-       unsigned i;
-       int ret;
-
-       mutex_lock(&u->reply_mutex);
-again:
-       while (list_empty(&u->read_buffers)) {
-               mutex_unlock(&u->reply_mutex);
-               if (filp->f_flags & O_NONBLOCK)
-                       return -EAGAIN;
-
-               ret = wait_event_interruptible(u->read_waitq,
-                                              !list_empty(&u->read_buffers));
-               if (ret)
-                       return ret;
-               mutex_lock(&u->reply_mutex);
-       }
-
-       rb = list_entry(u->read_buffers.next, struct read_buffer, list);
-       i = 0;
-       while (i < len) {
-               unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
-
-               ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
-
-               i += sz - ret;
-               rb->cons += sz - ret;
-
-               if (ret != 0) {
-                       if (i == 0)
-                               i = -EFAULT;
-                       goto out;
-               }
-
-               /* Clear out buffer if it has been consumed */
-               if (rb->cons == rb->len) {
-                       list_del(&rb->list);
-                       kfree(rb);
-                       if (list_empty(&u->read_buffers))
-                               break;
-                       rb = list_entry(u->read_buffers.next,
-                                       struct read_buffer, list);
-               }
-       }
-       if (i == 0)
-               goto again;
-
-out:
-       mutex_unlock(&u->reply_mutex);
-       return i;
-}
-
-/*
- * Add a buffer to the queue.  Caller must hold the appropriate lock
- * if the queue is not local.  (Commonly the caller will build up
- * multiple queued buffers on a temporary local list, and then add it
- * to the appropriate list under lock once all the buffers have een
- * successfully allocated.)
- */
-static int queue_reply(struct list_head *queue, const void *data, size_t len)
-{
-       struct read_buffer *rb;
-
-       if (len == 0)
-               return 0;
-
-       rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
-       if (rb == NULL)
-               return -ENOMEM;
-
-       rb->cons = 0;
-       rb->len = len;
-
-       memcpy(rb->msg, data, len);
-
-       list_add_tail(&rb->list, queue);
-       return 0;
-}
-
-/*
- * Free all the read_buffer s on a list.
- * Caller must have sole reference to list.
- */
-static void queue_cleanup(struct list_head *list)
-{
-       struct read_buffer *rb;
-
-       while (!list_empty(list)) {
-               rb = list_entry(list->next, struct read_buffer, list);
-               list_del(list->next);
-               kfree(rb);
-       }
-}
-
-struct watch_adapter {
-       struct list_head list;
-       struct xenbus_watch watch;
-       struct xenbus_file_priv *dev_data;
-       char *token;
-};
-
-static void free_watch_adapter(struct watch_adapter *watch)
-{
-       kfree(watch->watch.node);
-       kfree(watch->token);
-       kfree(watch);
-}
-
-static struct watch_adapter *alloc_watch_adapter(const char *path,
-                                                const char *token)
-{
-       struct watch_adapter *watch;
-
-       watch = kzalloc(sizeof(*watch), GFP_KERNEL);
-       if (watch == NULL)
-               goto out_fail;
-
-       watch->watch.node = kstrdup(path, GFP_KERNEL);
-       if (watch->watch.node == NULL)
-               goto out_free;
-
-       watch->token = kstrdup(token, GFP_KERNEL);
-       if (watch->token == NULL)
-               goto out_free;
-
-       return watch;
-
-out_free:
-       free_watch_adapter(watch);
-
-out_fail:
-       return NULL;
-}
-
-static void watch_fired(struct xenbus_watch *watch,
-                       const char **vec,
-                       unsigned int len)
-{
-       struct watch_adapter *adap;
-       struct xsd_sockmsg hdr;
-       const char *path, *token;
-       int path_len, tok_len, body_len, data_len = 0;
-       int ret;
-       LIST_HEAD(staging_q);
-
-       adap = container_of(watch, struct watch_adapter, watch);
-
-       path = vec[XS_WATCH_PATH];
-       token = adap->token;
-
-       path_len = strlen(path) + 1;
-       tok_len = strlen(token) + 1;
-       if (len > 2)
-               data_len = vec[len] - vec[2] + 1;
-       body_len = path_len + tok_len + data_len;
-
-       hdr.type = XS_WATCH_EVENT;
-       hdr.len = body_len;
-
-       mutex_lock(&adap->dev_data->reply_mutex);
-
-       ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
-       if (!ret)
-               ret = queue_reply(&staging_q, path, path_len);
-       if (!ret)
-               ret = queue_reply(&staging_q, token, tok_len);
-       if (!ret && len > 2)
-               ret = queue_reply(&staging_q, vec[2], data_len);
-
-       if (!ret) {
-               /* success: pass reply list onto watcher */
-               list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
-               wake_up(&adap->dev_data->read_waitq);
-       } else
-               queue_cleanup(&staging_q);
-
-       mutex_unlock(&adap->dev_data->reply_mutex);
-}
-
-static int xenbus_write_transaction(unsigned msg_type,
-                                   struct xenbus_file_priv *u)
-{
-       int rc;
-       void *reply;
-       struct xenbus_transaction_holder *trans = NULL;
-       LIST_HEAD(staging_q);
-
-       if (msg_type == XS_TRANSACTION_START) {
-               trans = kmalloc(sizeof(*trans), GFP_KERNEL);
-               if (!trans) {
-                       rc = -ENOMEM;
-                       goto out;
-               }
-       }
-
-       reply = xenbus_dev_request_and_reply(&u->u.msg);
-       if (IS_ERR(reply)) {
-               kfree(trans);
-               rc = PTR_ERR(reply);
-               goto out;
-       }
-
-       if (msg_type == XS_TRANSACTION_START) {
-               trans->handle.id = simple_strtoul(reply, NULL, 0);
-
-               list_add(&trans->list, &u->transactions);
-       } else if (msg_type == XS_TRANSACTION_END) {
-               list_for_each_entry(trans, &u->transactions, list)
-                       if (trans->handle.id == u->u.msg.tx_id)
-                               break;
-               BUG_ON(&trans->list == &u->transactions);
-               list_del(&trans->list);
-
-               kfree(trans);
-       }
-
-       mutex_lock(&u->reply_mutex);
-       rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
-       if (!rc)
-               rc = queue_reply(&staging_q, reply, u->u.msg.len);
-       if (!rc) {
-               list_splice_tail(&staging_q, &u->read_buffers);
-               wake_up(&u->read_waitq);
-       } else {
-               queue_cleanup(&staging_q);
-       }
-       mutex_unlock(&u->reply_mutex);
-
-       kfree(reply);
-
-out:
-       return rc;
-}
-
-static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
-{
-       struct watch_adapter *watch, *tmp_watch;
-       char *path, *token;
-       int err, rc;
-       LIST_HEAD(staging_q);
-
-       path = u->u.buffer + sizeof(u->u.msg);
-       token = memchr(path, 0, u->u.msg.len);
-       if (token == NULL) {
-               rc = -EILSEQ;
-               goto out;
-       }
-       token++;
-
-       if (msg_type == XS_WATCH) {
-               watch = alloc_watch_adapter(path, token);
-               if (watch == NULL) {
-                       rc = -ENOMEM;
-                       goto out;
-               }
-
-               watch->watch.callback = watch_fired;
-               watch->dev_data = u;
-
-               err = register_xenbus_watch(&watch->watch);
-               if (err) {
-                       free_watch_adapter(watch);
-                       rc = err;
-                       goto out;
-               }
-               list_add(&watch->list, &u->watches);
-       } else {
-               list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
-                       if (!strcmp(watch->token, token) &&
-                           !strcmp(watch->watch.node, path)) {
-                               unregister_xenbus_watch(&watch->watch);
-                               list_del(&watch->list);
-                               free_watch_adapter(watch);
-                               break;
-                       }
-               }
-       }
-
-       /* Success.  Synthesize a reply to say all is OK. */
-       {
-               struct {
-                       struct xsd_sockmsg hdr;
-                       char body[3];
-               } __packed reply = {
-                       {
-                               .type = msg_type,
-                               .len = sizeof(reply.body)
-                       },
-                       "OK"
-               };
-
-               mutex_lock(&u->reply_mutex);
-               rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
-               wake_up(&u->read_waitq);
-               mutex_unlock(&u->reply_mutex);
-       }
-
-out:
-       return rc;
-}
-
-static ssize_t xenbus_file_write(struct file *filp,
-                               const char __user *ubuf,
-                               size_t len, loff_t *ppos)
-{
-       struct xenbus_file_priv *u = filp->private_data;
-       uint32_t msg_type;
-       int rc = len;
-       int ret;
-       LIST_HEAD(staging_q);
-
-       /*
-        * We're expecting usermode to be writing properly formed
-        * xenbus messages.  If they write an incomplete message we
-        * buffer it up.  Once it is complete, we act on it.
-        */
-
-       /*
-        * Make sure concurrent writers can't stomp all over each
-        * other's messages and make a mess of our partial message
-        * buffer.  We don't make any attemppt to stop multiple
-        * writers from making a mess of each other's incomplete
-        * messages; we're just trying to guarantee our own internal
-        * consistency and make sure that single writes are handled
-        * atomically.
-        */
-       mutex_lock(&u->msgbuffer_mutex);
-
-       /* Get this out of the way early to avoid confusion */
-       if (len == 0)
-               goto out;
-
-       /* Can't write a xenbus message larger we can buffer */
-       if ((len + u->len) > sizeof(u->u.buffer)) {
-               /* On error, dump existing buffer */
-               u->len = 0;
-               rc = -EINVAL;
-               goto out;
-       }
-
-       ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
-
-       if (ret != 0) {
-               rc = -EFAULT;
-               goto out;
-       }
-
-       /* Deal with a partial copy. */
-       len -= ret;
-       rc = len;
-
-       u->len += len;
-
-       /* Return if we haven't got a full message yet */
-       if (u->len < sizeof(u->u.msg))
-               goto out;       /* not even the header yet */
-
-       /* If we're expecting a message that's larger than we can
-          possibly send, dump what we have and return an error. */
-       if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
-               rc = -E2BIG;
-               u->len = 0;
-               goto out;
-       }
-
-       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
-               goto out;       /* incomplete data portion */
-
-       /*
-        * OK, now we have a complete message.  Do something with it.
-        */
-
-       msg_type = u->u.msg.type;
-
-       switch (msg_type) {
-       case XS_WATCH:
-       case XS_UNWATCH:
-               /* (Un)Ask for some path to be watched for changes */
-               ret = xenbus_write_watch(msg_type, u);
-               break;
-
-       default:
-               /* Send out a transaction */
-               ret = xenbus_write_transaction(msg_type, u);
-               break;
-       }
-       if (ret != 0)
-               rc = ret;
-
-       /* Buffered message consumed */
-       u->len = 0;
-
- out:
-       mutex_unlock(&u->msgbuffer_mutex);
-       return rc;
-}
-
-static int xenbus_file_open(struct inode *inode, struct file *filp)
-{
-       struct xenbus_file_priv *u;
-
-       if (xen_store_evtchn == 0)
-               return -ENOENT;
-
-       nonseekable_open(inode, filp);
-
-       u = kzalloc(sizeof(*u), GFP_KERNEL);
-       if (u == NULL)
-               return -ENOMEM;
-
-       INIT_LIST_HEAD(&u->transactions);
-       INIT_LIST_HEAD(&u->watches);
-       INIT_LIST_HEAD(&u->read_buffers);
-       init_waitqueue_head(&u->read_waitq);
-
-       mutex_init(&u->reply_mutex);
-       mutex_init(&u->msgbuffer_mutex);
-
-       filp->private_data = u;
-
-       return 0;
-}
-
-static int xenbus_file_release(struct inode *inode, struct file *filp)
-{
-       struct xenbus_file_priv *u = filp->private_data;
-       struct xenbus_transaction_holder *trans, *tmp;
-       struct watch_adapter *watch, *tmp_watch;
-       struct read_buffer *rb, *tmp_rb;
-
-       /*
-        * No need for locking here because there are no other users,
-        * by definition.
-        */
-
-       list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
-               xenbus_transaction_end(trans->handle, 1);
-               list_del(&trans->list);
-               kfree(trans);
-       }
-
-       list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
-               unregister_xenbus_watch(&watch->watch);
-               list_del(&watch->list);
-               free_watch_adapter(watch);
-       }
-
-       list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
-               list_del(&rb->list);
-               kfree(rb);
-       }
-       kfree(u);
-
-       return 0;
-}
-
-static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
-{
-       struct xenbus_file_priv *u = file->private_data;
-
-       poll_wait(file, &u->read_waitq, wait);
-       if (!list_empty(&u->read_buffers))
-               return POLLIN | POLLRDNORM;
-       return 0;
-}
-
-const struct file_operations xenbus_file_ops = {
-       .read = xenbus_file_read,
-       .write = xenbus_file_write,
-       .open = xenbus_file_open,
-       .release = xenbus_file_release,
-       .poll = xenbus_file_poll,
-       .llseek = no_llseek,
-};
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h

index b68aa6200003575549e7a5844ae7b43b6bec135f..6b80c7779c0217bdddca45d977e5d823df150b5a 100644 (file)
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -1,8 +1,6 @@
  #ifndef _XENFS_XENBUS_H
  #define _XENFS_XENBUS_H
  
-extern const struct file_operations xenbus_file_ops;
-extern const struct file_operations privcmd_file_ops;
  extern const struct file_operations xsd_kva_file_ops;
  extern const struct file_operations xsd_port_file_ops;
  
diff --git a/include/xen/events.h b/include/xen/events.h

index d287997d3eab3ab9eeea0b802bf41eaa9238f97c..0f773708e02c034f624ca61a3e186bb8e9d68080 100644 (file)
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -37,6 +37,13 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
   */
  void unbind_from_irqhandler(unsigned int irq, void *dev_id);
  
+/*
+ * Allow extra references to event channels exposed to userspace by evtchn
+ */
+int evtchn_make_refcounted(unsigned int evtchn);
+int evtchn_get(unsigned int evtchn);
+void evtchn_put(unsigned int evtchn);
+
  void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
  int resend_irq_on_evtchn(unsigned int irq);
  void rebind_evtchn_irq(int evtchn, int irq);
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h

index 11e2dfce42f82ecca25fec9e8e6508d47d3615c4..f1e17b70588bb4f5dc4e8584edd9e6db02c86dee 100644 (file)
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -62,6 +62,24 @@ int gnttab_resume(void);
  
  int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
                                 int readonly);
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+                                       int flags, unsigned page_off,
+                                       unsigned length);
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+                                     domid_t trans_domid,
+                                     grant_ref_t trans_gref);
+
+/*
+ * Are sub-page grants available on this version of Xen?  Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_subpage_grants_available(void);
+
+/*
+ * Are transitive grants available on this version of Xen?  Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_trans_grants_available(void);
  
  /*
   * End access through the given grant reference, iff the grant entry is no
@@ -108,6 +126,13 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
  
  void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
                                      unsigned long frame, int readonly);
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+                                           unsigned long frame, int flags,
+                                           unsigned page_off,
+                                           unsigned length);
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+                                         int flags, domid_t trans_domid,
+                                         grant_ref_t trans_gref);
  
  void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
                                        unsigned long pfn);
@@ -145,9 +170,11 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
  
  int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
                            unsigned long max_nr_gframes,
-                          struct grant_entry **__shared);
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
-                             unsigned long nr_gframes);
+                          void **__shared);
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+                          unsigned long max_nr_gframes,
+                          grant_status_t **__shared);
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes);
  
  extern unsigned long xen_hvm_resume_frames;
  unsigned int gnttab_max_grant_frames(void);
@@ -155,7 +182,7 @@ unsigned int gnttab_max_grant_frames(void);
  #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
  
  int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
-                       struct gnttab_map_grant_ref *kmap_ops,
+                   struct gnttab_map_grant_ref *kmap_ops,
                     struct page **pages, unsigned int count);
  int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
                       struct page **pages, unsigned int count);
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h

index 39e571796e324fac9782cf75fe27563bf6be0f78..a17d84433e6a1ab54e660fc4f25493a180ce15f2 100644 (file)
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -84,13 +84,23 @@
   *  Use SMP-safe bit-setting instruction.
   */
  
+/*
+ * Reference to a grant entry in a specified domain's grant table.
+ */
+typedef uint32_t grant_ref_t;
+
  /*
   * A grant table comprises a packed array of grant entries in one or more
   * page frames shared between Xen and a guest.
   * [XEN]: This field is written by Xen and read by the sharing guest.
   * [GST]: This field is written by the guest and read by Xen.
   */
-struct grant_entry {
+
+/*
+ * Version 1 of the grant table entry structure is maintained purely
+ * for backwards compatibility.  New guests should use version 2.
+ */
+struct grant_entry_v1 {
      /* GTF_xxx: various type and flag information.  [XEN,GST] */
      uint16_t flags;
      /* The domain being granted foreign privileges. [GST] */
@@ -108,10 +118,13 @@ struct grant_entry {
   *  GTF_permit_access: Allow @domid to map/access @frame.
   *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
   *                       to this guest. Xen writes the page number to @frame.
+ *  GTF_transitive: Allow @domid to transitively access a subrange of
+ *                  @gref in @trans_domid.  No mappings are allowed.
   */
  #define GTF_invalid         (0U<<0)
  #define GTF_permit_access   (1U<<0)
  #define GTF_accept_transfer (2U<<0)
+#define GTF_transitive      (3U<<0)
  #define GTF_type_mask       (3U<<0)
  
  /*
@@ -119,6 +132,9 @@ struct grant_entry {
   *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
   *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
   *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ *  GTF_sub_page: Grant access to only a subrange of the page.  @domid
+ *                will only be allowed to copy from the grant, and not
+ *                map it. [GST]
   */
  #define _GTF_readonly       (2)
  #define GTF_readonly        (1U<<_GTF_readonly)
@@ -126,6 +142,8 @@ struct grant_entry {
  #define GTF_reading         (1U<<_GTF_reading)
  #define _GTF_writing        (4)
  #define GTF_writing         (1U<<_GTF_writing)
+#define _GTF_sub_page       (8)
+#define GTF_sub_page        (1U<<_GTF_sub_page)
  
  /*
   * Subflags for GTF_accept_transfer:
@@ -142,15 +160,81 @@ struct grant_entry {
  #define _GTF_transfer_completed (3)
  #define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
  
+/*
+ * Version 2 grant table entries.  These fulfil the same role as
+ * version 1 entries, but can represent more complicated operations.
+ * Any given domain will have either a version 1 or a version 2 table,
+ * and every entry in the table will be the same version.
+ *
+ * The interface by which domains use grant references does not depend
+ * on the grant table version in use by the other domain.
+ */
  
-/***********************************
- * GRANT TABLE QUERIES AND USES
+/*
+ * Version 1 and version 2 grant entries share a common prefix.  The
+ * fields of the prefix are documented as part of struct
+ * grant_entry_v1.
   */
+struct grant_entry_header {
+    uint16_t flags;
+    domid_t  domid;
+};
  
  /*
- * Reference to a grant entry in a specified domain's grant table.
+ * Version 2 of the grant entry structure.  It is a union because three
+ * different types are supported: full_page, sub_page and transitive.
+ */
+union grant_entry_v2 {
+    struct grant_entry_header hdr;
+
+    /*
+     * This member is used for V1-style full page grants, where either:
+     *
+     * -- hdr.type is GTF_accept_transfer, or
+     * -- hdr.type is GTF_permit_access and GTF_sub_page is not set.
+     *
+     * In that case, the frame field has the same semantics as the
+     * field of the same name in the V1 entry structure.
+     */
+    struct {
+       struct grant_entry_header hdr;
+       uint32_t pad0;
+       uint64_t frame;
+    } full_page;
+
+    /*
+     * If the grant type is GTF_permit_access and GTF_sub_page is set,
+     * @domid is allowed to access bytes [@page_off,@page_off+@length)
+     * in frame @frame.
+     */
+    struct {
+       struct grant_entry_header hdr;
+       uint16_t page_off;
+       uint16_t length;
+       uint64_t frame;
+    } sub_page;
+
+    /*
+     * If the grant is GTF_transitive, @domid is allowed to use the
+     * grant @gref in domain @trans_domid, as if it was the local
+     * domain.  Obviously, the transitive access must be compatible
+     * with the original grant.
+     */
+    struct {
+       struct grant_entry_header hdr;
+       domid_t trans_domid;
+       uint16_t pad0;
+       grant_ref_t gref;
+    } transitive;
+
+    uint32_t __spacer[4]; /* Pad to a power of two */
+};
+
+typedef uint16_t grant_status_t;
+
+/***********************************
+ * GRANT TABLE QUERIES AND USES
   */
-typedef uint32_t grant_ref_t;
  
  /*
   * Handle to track a mapping created via a grant reference.
@@ -321,6 +405,79 @@ struct gnttab_query_size {
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
  
+/*
+ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
+ * tracked by <handle> but atomically replace the page table entry with one
+ * pointing to the machine address under <new_addr>.  <new_addr> will be
+ * redirected to the null entry.
+ * NOTES:
+ *  1. The call may fail in an undefined manner if either mapping is not
+ *     tracked by <handle>.
+ *  2. After executing a batch of unmaps, it is guaranteed that no stale
+ *     mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_and_replace    7
+struct gnttab_unmap_and_replace {
+    /* IN parameters. */
+    uint64_t host_addr;
+    uint64_t new_addr;
+    grant_handle_t handle;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
+
+/*
+ * GNTTABOP_set_version: Request a particular version of the grant
+ * table shared table structure.  This operation can only be performed
+ * once in any given domain.  It must be performed before any grants
+ * are activated; otherwise, the domain will be stuck with version 1.
+ * The only defined versions are 1 and 2.
+ */
+#define GNTTABOP_set_version          8
+struct gnttab_set_version {
+    /* IN parameters */
+    uint32_t version;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version);
+
+/*
+ * GNTTABOP_get_status_frames: Get the list of frames used to store grant
+ * status for <dom>. In grant format version 2, the status is separated
+ * from the other shared grant fields to allow more efficient synchronization
+ * using barriers instead of atomic cmpxchg operations.
+ * <nr_frames> specifies the size of vector <frame_list>.
+ * The frame addresses are returned in the <frame_list>.
+ * Only <nr_frames> addresses are returned, even if the table is larger.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
+ */
+#define GNTTABOP_get_status_frames     9
+struct gnttab_get_status_frames {
+    /* IN parameters. */
+    uint32_t nr_frames;
+    domid_t  dom;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+    GUEST_HANDLE(uint64_t) frame_list;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames);
+
+/*
+ * GNTTABOP_get_version: Get the grant table version which is in
+ * effect for domain <dom>.
+ */
+#define GNTTABOP_get_version          10
+struct gnttab_get_version {
+    /* IN parameters */
+    domid_t dom;
+    uint16_t pad;
+    /* OUT parameters */
+    uint32_t version;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
+
  /*
   * Bitfield values for update_pin_status.flags.
   */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h

index 6a6e91449347078a3ac2c3d66a10578e19e465fd..a890804945e3eda793ebe5e7b78a5d80fab974c0 100644 (file)
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -523,6 +523,8 @@ struct tmem_op {
         } u;
  };
  
+DEFINE_GUEST_HANDLE(u64);
+
  #else /* __ASSEMBLY__ */
  
  /* In assembly code we cannot use C numeric constant suffixes. */
diff --git a/include/xen/xenbus_dev.h b/include/xen/xenbus_dev.h

new file mode 100644 (file)

index 0000000..ac5f0fe
--- /dev/null
+++ b/include/xen/xenbus_dev.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * xenbus_dev.h
+ *
+ * Interface to /dev/xen/xenbus_backend.
+ *
+ * Copyright (c) 2011 Bastian Blank <waldi@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_XEN_XENBUS_DEV_H__
+#define __LINUX_XEN_XENBUS_DEV_H__
+
+#include <linux/ioctl.h>       /* _IOC(), _IOC_NONE */
+
+#define IOCTL_XENBUS_BACKEND_EVTCHN                    \
+       _IOC(_IOC_NONE, 'B', 0, 0)      /* no data direction, type 'B', nr 0, size 0 */
+
+#endif /* __LINUX_XEN_XENBUS_DEV_H__ */
author	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
	Tue, 20 Dec 2011 22:01:18 +0000 (17:01 -0500)
committer	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
	Tue, 20 Dec 2011 22:01:18 +0000 (17:01 -0500)
Documentation/ABI/stable/sysfs-bus-xen-backend	[new file with mode: 0644]	patch \| blob
Documentation/ABI/stable/sysfs-devices-system-xen_memory	[new file with mode: 0644]	patch \| blob
arch/ia64/include/asm/xen/interface.h		patch \| blob \| history
arch/x86/xen/Kconfig		patch \| blob \| history
arch/x86/xen/grant-table.c		patch \| blob \| history
drivers/xen/Kconfig		patch \| blob \| history
drivers/xen/Makefile		patch \| blob \| history
drivers/xen/events.c		patch \| blob \| history
drivers/xen/evtchn.c		patch \| blob \| history
drivers/xen/gntalloc.c		patch \| blob \| history
drivers/xen/gntdev.c		patch \| blob \| history
drivers/xen/grant-table.c		patch \| blob \| history
drivers/xen/privcmd.c	[new file with mode: 0644]	patch \| blob
drivers/xen/privcmd.h	[new file with mode: 0644]	patch \| blob
drivers/xen/xenbus/Makefile		patch \| blob \| history
drivers/xen/xenbus/xenbus_comms.h		patch \| blob \| history
drivers/xen/xenbus/xenbus_dev_backend.c	[new file with mode: 0644]	patch \| blob
drivers/xen/xenbus/xenbus_dev_frontend.c	[new file with mode: 0644]	patch \| blob
drivers/xen/xenfs/Makefile		patch \| blob \| history
drivers/xen/xenfs/privcmd.c	[deleted file]	patch \| blob \| history
drivers/xen/xenfs/super.c		patch \| blob \| history
drivers/xen/xenfs/xenbus.c	[deleted file]	patch \| blob \| history
drivers/xen/xenfs/xenfs.h		patch \| blob \| history
include/xen/events.h		patch \| blob \| history
include/xen/grant_table.h		patch \| blob \| history
include/xen/interface/grant_table.h		patch \| blob \| history
include/xen/interface/xen.h		patch \| blob \| history
include/xen/xenbus_dev.h	[new file with mode: 0644]	patch \| blob