--- /dev/null
+What: /sys/bus/xen-backend/devices/*/devtype
+Date: Feb 2009
+KernelVersion: 2.6.38
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The type of the device. e.g., one of: 'vbd' (block),
+ 'vif' (network), or 'vfb' (framebuffer).
+
+What: /sys/bus/xen-backend/devices/*/nodename
+Date: Feb 2009
+KernelVersion: 2.6.38
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ XenStore node (under /local/domain/NNN/) for this
+ backend device.
+
+What: /sys/bus/xen-backend/devices/vbd-*/physical_device
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The major:minor number (in hexidecimal) of the
+ physical device providing the storage for this backend
+ block device.
+
+What: /sys/bus/xen-backend/devices/vbd-*/mode
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Whether the block device is read-only ('r') or
+ read-write ('w').
+
+What: /sys/bus/xen-backend/devices/vbd-*/statistics/f_req
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Number of flush requests from the frontend.
+
+What: /sys/bus/xen-backend/devices/vbd-*/statistics/oo_req
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Number of requests delayed because the backend was too
+ busy processing previous requests.
+
+What: /sys/bus/xen-backend/devices/vbd-*/statistics/rd_req
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Number of read requests from the frontend.
+
+What: /sys/bus/xen-backend/devices/vbd-*/statistics/rd_sect
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Number of sectors read by the frontend.
+
+What: /sys/bus/xen-backend/devices/vbd-*/statistics/wr_req
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Number of write requests from the frontend.
+
+What: /sys/bus/xen-backend/devices/vbd-*/statistics/wr_sect
+Date: April 2011
+KernelVersion: 3.0
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Number of sectors written by the frontend.
--- /dev/null
+What: /sys/devices/system/xen_memory/xen_memory0/max_retry_count
+Date: May 2011
+KernelVersion: 2.6.39
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The maximum number of times the balloon driver will
+ attempt to increase the balloon before giving up. See
+ also 'retry_count' below.
+ A value of zero means retry forever and is the default one.
+
+What: /sys/devices/system/xen_memory/xen_memory0/max_schedule_delay
+Date: May 2011
+KernelVersion: 2.6.39
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The limit that 'schedule_delay' (see below) will be
+ increased to. The default value is 32 seconds.
+
+What: /sys/devices/system/xen_memory/xen_memory0/retry_count
+Date: May 2011
+KernelVersion: 2.6.39
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The current number of times that the balloon driver
+ has attempted to increase the size of the balloon.
+ The default value is one. With max_retry_count being
+ zero (unlimited), this means that the driver will attempt
+ to retry with a 'schedule_delay' delay.
+
+What: /sys/devices/system/xen_memory/xen_memory0/schedule_delay
+Date: May 2011
+KernelVersion: 2.6.39
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The time (in seconds) to wait between attempts to
+ increase the balloon. Each time the balloon cannot be
+ increased, 'schedule_delay' is increased (until
+ 'max_schedule_delay' is reached at which point it
+ will use the max value).
+
+What: /sys/devices/system/xen_memory/xen_memory0/target
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ The target number of pages to adjust this domain's
+ memory reservation to.
+
+What: /sys/devices/system/xen_memory/xen_memory0/target_kb
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ As target above, except the value is in KiB.
+
+What: /sys/devices/system/xen_memory/xen_memory0/info/current_kb
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Current size (in KiB) of this domain's memory
+ reservation.
+
+What: /sys/devices/system/xen_memory/xen_memory0/info/high_kb
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Amount (in KiB) of high memory in the balloon.
+
+What: /sys/devices/system/xen_memory/xen_memory0/info/low_kb
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Amount (in KiB) of low (or normal) memory in the
+ balloon.
__DEFINE_GUEST_HANDLE(uchar, unsigned char);
__DEFINE_GUEST_HANDLE(uint, unsigned int);
__DEFINE_GUEST_HANDLE(ulong, unsigned long);
-__DEFINE_GUEST_HANDLE(u64, unsigned long);
+
DEFINE_GUEST_HANDLE(char);
DEFINE_GUEST_HANDLE(int);
DEFINE_GUEST_HANDLE(long);
config XEN_MAX_DOMAIN_MEMORY
int
- default 128
+ default 500 if X86_64
+ default 64 if X86_32
depends on XEN
help
This only affects the sizing of some bss arrays, the unused
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+
return 0;
}
+/*
+ * This function is used to map shared frames to store grant status. It is
+ * different from map_pte_fn above, the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ uint64_t **frames = (uint64_t **)data;
+
+ set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+ (*frames)++;
+ return 0;
+}
+
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
- struct grant_entry **__shared)
+ void **__shared)
{
int rc;
- struct grant_entry *shared = *__shared;
+ void *shared = *__shared;
if (shared == NULL) {
struct vm_struct *area =
return rc;
}
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
- unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
+{
+ int rc;
+ grant_status_t *shared = *__shared;
+
+ if (shared == NULL) {
+ /* No need to pass in PTE as we are going to do it
+ * in apply_to_page_range anyhow. */
+ struct vm_struct *area =
+ alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+ BUG_ON(area == NULL);
+ shared = area->addr;
+ *__shared = shared;
+ }
+
+ rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+ PAGE_SIZE * nr_gframes,
+ map_pte_fn_status, &frames);
+ return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
config XENFS
tristate "Xen filesystem"
+ select XEN_PRIVCMD
default y
help
The xen filesystem provides a way for domains to share
xen-pciback.hide=(03:00.0)(04:00.0)
If in doubt, say m.
+
+config XEN_PRIVCMD
+ tristate
+ depends on XEN
+ default m
+
endmenu
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
xen-gntalloc-y := gntalloc.o
+xen-privcmd-y := privcmd.o
*/
struct irq_info {
struct list_head list;
+ int refcnt;
enum xen_irq_type type; /* type */
unsigned irq;
unsigned short evtchn; /* event channel */
panic("Unable to allocate metadata for IRQ%d\n", irq);
info->type = IRQT_UNBOUND;
+ info->refcnt = -1;
irq_set_handler_data(irq, info);
irq_set_handler_data(irq, NULL);
+ WARN_ON(info->refcnt > 0);
+
kfree(info);
/* Legacy IRQ descriptors are managed by the arch. */
if (irq != -1) {
printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
irq, gsi);
- goto out; /* XXX need refcount? */
+ goto out;
}
irq = xen_allocate_irq_gsi(gsi);
{
struct evtchn_close close;
int evtchn = evtchn_from_irq(irq);
+ struct irq_info *info = irq_get_handler_data(irq);
mutex_lock(&irq_mapping_update_lock);
+ if (info->refcnt > 0) {
+ info->refcnt--;
+ if (info->refcnt != 0)
+ goto done;
+ }
+
if (VALID_EVTCHN(evtchn)) {
close.port = evtchn;
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
xen_free_irq(irq);
+ done:
mutex_unlock(&irq_mapping_update_lock);
}
}
EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+int evtchn_make_refcounted(unsigned int evtchn)
+{
+ int irq = evtchn_to_irq[evtchn];
+ struct irq_info *info;
+
+ if (irq == -1)
+ return -ENOENT;
+
+ info = irq_get_handler_data(irq);
+
+ if (!info)
+ return -ENOENT;
+
+ WARN_ON(info->refcnt != -1);
+
+ info->refcnt = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
+
+int evtchn_get(unsigned int evtchn)
+{
+ int irq;
+ struct irq_info *info;
+ int err = -ENOENT;
+
+ if (evtchn >= NR_EVENT_CHANNELS)
+ return -EINVAL;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = evtchn_to_irq[evtchn];
+ if (irq == -1)
+ goto done;
+
+ info = irq_get_handler_data(irq);
+
+ if (!info)
+ goto done;
+
+ err = -EINVAL;
+ if (info->refcnt <= 0)
+ goto done;
+
+ info->refcnt++;
+ err = 0;
+ done:
+ mutex_unlock(&irq_mapping_update_lock);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(evtchn_get);
+
+void evtchn_put(unsigned int evtchn)
+{
+ int irq = evtchn_to_irq[evtchn];
+ if (WARN_ON(irq == -1))
+ return;
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(evtchn_put);
+
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
{
int irq = per_cpu(ipi_to_irq, cpu)[vector];
rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
u->name, (void *)(unsigned long)port);
if (rc >= 0)
- rc = 0;
+ rc = evtchn_make_refcounted(port);
return rc;
}
"the gntalloc device");
static LIST_HEAD(gref_list);
-static DEFINE_SPINLOCK(gref_lock);
+static DEFINE_MUTEX(gref_mutex);
static int gref_size;
struct notify_info {
uint64_t index;
};
+struct gntalloc_vma_private_data {
+ struct gntalloc_gref *gref;
+ int users;
+ int count;
+};
+
static void __del_gref(struct gntalloc_gref *gref);
static void do_cleanup(void)
}
/* Add to gref lists. */
- spin_lock(&gref_lock);
+ mutex_lock(&gref_mutex);
list_splice_tail(&queue_gref, &gref_list);
list_splice_tail(&queue_file, &priv->list);
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
return 0;
undo:
- spin_lock(&gref_lock);
+ mutex_lock(&gref_mutex);
gref_size -= (op->count - i);
list_for_each_entry(gref, &queue_file, next_file) {
*/
if (unlikely(!list_empty(&queue_gref)))
list_splice_tail(&queue_gref, &gref_list);
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
return rc;
}
tmp[gref->notify.pgoff] = 0;
kunmap(gref->page);
}
- if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+ if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
notify_remote_via_evtchn(gref->notify.event);
+ evtchn_put(gref->notify.event);
+ }
gref->notify.flags = 0;
if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
return;
+
+ gnttab_free_grant_reference(gref->gref_id);
}
gref_size--;
pr_debug("%s: priv %p\n", __func__, priv);
- spin_lock(&gref_lock);
+ mutex_lock(&gref_mutex);
while (!list_empty(&priv->list)) {
gref = list_entry(priv->list.next,
struct gntalloc_gref, next_file);
__del_gref(gref);
}
kfree(priv);
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
return 0;
}
goto out;
}
- spin_lock(&gref_lock);
+ mutex_lock(&gref_mutex);
/* Clean up pages that were at zero (local) users but were still mapped
* by remote domains. Since those pages count towards the limit that we
* are about to enforce, removing them here is a good idea.
*/
do_cleanup();
if (gref_size + op.count > limit) {
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
rc = -ENOSPC;
goto out_free;
}
gref_size += op.count;
op.index = priv->index;
priv->index += op.count * PAGE_SIZE;
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
rc = add_grefs(&op, gref_ids, priv);
if (rc < 0)
goto dealloc_grant_out;
}
- spin_lock(&gref_lock);
+ mutex_lock(&gref_mutex);
gref = find_grefs(priv, op.index, op.count);
if (gref) {
/* Remove from the file list only, and decrease reference count.
do_cleanup();
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
dealloc_grant_out:
return rc;
}
index = op.index & ~(PAGE_SIZE - 1);
pgoff = op.index & (PAGE_SIZE - 1);
- spin_lock(&gref_lock);
+ mutex_lock(&gref_mutex);
gref = find_grefs(priv, index, 1);
if (!gref) {
goto unlock_out;
}
+ /* We need to grab a reference to the event channel we are going to use
+ * to send the notify before releasing the reference we may already have
+ * (if someone has called this ioctl twice). This is required so that
+ * it is possible to change the clear_byte part of the notification
+ * without disturbing the event channel part, which may now be the last
+ * reference to that event channel.
+ */
+ if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+ if (evtchn_get(op.event_channel_port)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+ }
+
+ if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+ evtchn_put(gref->notify.event);
+
gref->notify.flags = op.action;
gref->notify.pgoff = pgoff;
gref->notify.event = op.event_channel_port;
rc = 0;
+
unlock_out:
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
return rc;
}
static void gntalloc_vma_open(struct vm_area_struct *vma)
{
- struct gntalloc_gref *gref = vma->vm_private_data;
- if (!gref)
+ struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+
+ if (!priv)
return;
- spin_lock(&gref_lock);
- gref->users++;
- spin_unlock(&gref_lock);
+ mutex_lock(&gref_mutex);
+ priv->users++;
+ mutex_unlock(&gref_mutex);
}
static void gntalloc_vma_close(struct vm_area_struct *vma)
{
- struct gntalloc_gref *gref = vma->vm_private_data;
- if (!gref)
+ struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+ struct gntalloc_gref *gref, *next;
+ int i;
+
+ if (!priv)
return;
- spin_lock(&gref_lock);
- gref->users--;
- if (gref->users == 0)
- __del_gref(gref);
- spin_unlock(&gref_lock);
+ mutex_lock(&gref_mutex);
+ priv->users--;
+ if (priv->users == 0) {
+ gref = priv->gref;
+ for (i = 0; i < priv->count; i++) {
+ gref->users--;
+ next = list_entry(gref->next_gref.next,
+ struct gntalloc_gref, next_gref);
+ if (gref->users == 0)
+ __del_gref(gref);
+ gref = next;
+ }
+ kfree(priv);
+ }
+ mutex_unlock(&gref_mutex);
}
static struct vm_operations_struct gntalloc_vmops = {
static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_vma_private_data *vm_priv;
struct gntalloc_gref *gref;
int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
int rv, i;
- pr_debug("%s: priv %p, page %lu+%d\n", __func__,
- priv, vma->vm_pgoff, count);
-
if (!(vma->vm_flags & VM_SHARED)) {
printk(KERN_ERR "%s: Mapping must be shared.\n", __func__);
return -EINVAL;
}
- spin_lock(&gref_lock);
+ vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL);
+ if (!vm_priv)
+ return -ENOMEM;
+
+ mutex_lock(&gref_mutex);
+
+ pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__,
+ priv, vm_priv, vma->vm_pgoff, count);
+
gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
if (gref == NULL) {
rv = -ENOENT;
goto out_unlock;
}
- vma->vm_private_data = gref;
+ vm_priv->gref = gref;
+ vm_priv->users = 1;
+ vm_priv->count = count;
+
+ vma->vm_private_data = vm_priv;
- vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
vma->vm_ops = &gntalloc_vmops;
rv = 0;
out_unlock:
- spin_unlock(&gref_lock);
+ mutex_unlock(&gref_mutex);
return rv;
}
atomic_sub(map->count, &pages_mapped);
- if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+ if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
notify_remote_via_evtchn(map->notify.event);
+ evtchn_put(map->notify.event);
+ }
if (map->pages) {
if (!use_ptemod)
struct ioctl_gntdev_unmap_notify op;
struct grant_map *map;
int rc;
+ int out_flags;
+ unsigned int out_event;
if (copy_from_user(&op, u, sizeof(op)))
return -EFAULT;
if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
return -EINVAL;
+ /* We need to grab a reference to the event channel we are going to use
+ * to send the notify before releasing the reference we may already have
+ * (if someone has called this ioctl twice). This is required so that
+ * it is possible to change the clear_byte part of the notification
+ * without disturbing the event channel part, which may now be the last
+ * reference to that event channel.
+ */
+ if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+ if (evtchn_get(op.event_channel_port))
+ return -EINVAL;
+ }
+
+ out_flags = op.action;
+ out_event = op.event_channel_port;
+
spin_lock(&priv->lock);
list_for_each_entry(map, &priv->maps, next) {
goto unlock_out;
}
+ out_flags = map->notify.flags;
+ out_event = map->notify.event;
+
map->notify.flags = op.action;
map->notify.addr = op.index - (map->index << PAGE_SHIFT);
map->notify.event = op.event_channel_port;
+
rc = 0;
+
unlock_out:
spin_unlock(&priv->lock);
+
+ /* Drop the reference to the event channel we did not save in the map */
+ if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
+ evtchn_put(out_event);
+
return rc;
}
#include <xen/page.h>
#include <xen/grant_table.h>
#include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
#include <asm/xen/hypercall.h>
#include <asm/pgtable.h>
#include <asm/sync_bitops.h>
-
/* External tools reserve first few grant table entries. */
#define NR_RESERVED_ENTRIES 8
#define GNTTAB_LIST_END 0xffffffff
-#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
+#define GREFS_PER_GRANT_FRAME \
+(grant_table_version == 1 ? \
+(PAGE_SIZE / sizeof(struct grant_entry_v1)) : \
+(PAGE_SIZE / sizeof(union grant_entry_v2)))
static grant_ref_t **gnttab_list;
static unsigned int nr_grant_frames;
unsigned long xen_hvm_resume_frames;
EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
-static struct grant_entry *shared;
+static union {
+ struct grant_entry_v1 *v1;
+ union grant_entry_v2 *v2;
+ void *addr;
+} gnttab_shared;
+
+/*This is a structure of function pointers for grant table*/
+struct gnttab_ops {
+ /*
+ * Mapping a list of frames for storing grant entries. Frames parameter
+ * is used to store grant table address when grant table being setup,
+ * nr_gframes is the number of frames to map grant table. Returning
+ * GNTST_okay means success and negative value means failure.
+ */
+ int (*map_frames)(unsigned long *frames, unsigned int nr_gframes);
+ /*
+ * Release a list of frames which are mapped in map_frames for grant
+ * entry status.
+ */
+ void (*unmap_frames)(void);
+ /*
+ * Introducing a valid entry into the grant table, granting the frame of
+ * this grant entry to domain for accessing or transfering. Ref
+ * parameter is reference of this introduced grant entry, domid is id of
+ * granted domain, frame is the page frame to be granted, and flags is
+ * status of the grant entry to be updated.
+ */
+ void (*update_entry)(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags);
+ /*
+ * Stop granting a grant entry to domain for accessing. Ref parameter is
+ * reference of a grant entry whose grant access will be stopped,
+ * readonly is not in use in this function. If the grant entry is
+ * currently mapped for reading or writing, just return failure(==0)
+ * directly and don't tear down the grant access. Otherwise, stop grant
+ * access for this entry and return success(==1).
+ */
+ int (*end_foreign_access_ref)(grant_ref_t ref, int readonly);
+ /*
+ * Stop granting a grant entry to domain for transfer. Ref parameter is
+ * reference of a grant entry whose grant transfer will be stopped. If
+ * tranfer has not started, just reclaim the grant entry and return
+ * failure(==0). Otherwise, wait for the transfer to complete and then
+ * return the frame.
+ */
+ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref);
+ /*
+ * Query the status of a grant entry. Ref parameter is reference of
+ * queried grant entry, return value is the status of queried entry.
+ * Detailed status(writing/reading) can be gotten from the return value
+ * by bit operations.
+ */
+ int (*query_foreign_access)(grant_ref_t ref);
+ /*
+ * Grant a domain to access a range of bytes within the page referred by
+ * an available grant entry. Ref parameter is reference of a grant entry
+ * which will be sub-page accessed, domid is id of grantee domain, frame
+ * is frame address of subpage grant, flags is grant type and flag
+ * information, page_off is offset of the range of bytes, and length is
+ * length of bytes to be accessed.
+ */
+ void (*update_subpage_entry)(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off, unsigned length);
+ /*
+ * Redirect an available grant entry on domain A to another grant
+ * reference of domain B, then allow domain C to use grant reference
+ * of domain B transitively. Ref parameter is an available grant entry
+ * reference on domain A, domid is id of domain C which accesses grant
+ * entry transitively, flags is grant type and flag information,
+ * trans_domid is id of domain B whose grant entry is finally accessed
+ * transitively, trans_gref is grant entry transitive reference of
+ * domain B.
+ */
+ void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags,
+ domid_t trans_domid, grant_ref_t trans_gref);
+};
+
+static struct gnttab_ops *gnttab_interface;
+
+/*This reflects status of grant entries, so act as a global value*/
+static grant_status_t *grstatus;
+
+static int grant_table_version;
static struct gnttab_free_callback *gnttab_free_callback_list;
static int gnttab_expand(unsigned int req_entries);
#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define SPP (PAGE_SIZE / sizeof(grant_status_t))
static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
{
spin_unlock_irqrestore(&gnttab_list_lock, flags);
}
-static void update_grant_entry(grant_ref_t ref, domid_t domid,
- unsigned long frame, unsigned flags)
+/*
+ * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
+ * Introducing a valid entry into the grant table:
+ * 1. Write ent->domid.
+ * 2. Write ent->frame:
+ * GTF_permit_access: Frame to which access is permitted.
+ * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ * frame, or zero if none.
+ * 3. Write memory barrier (WMB).
+ * 4. Write ent->flags, inc. valid type.
+ */
+static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags)
+{
+ gnttab_shared.v1[ref].domid = domid;
+ gnttab_shared.v1[ref].frame = frame;
+ wmb();
+ gnttab_shared.v1[ref].flags = flags;
+}
+
+static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags)
{
- /*
- * Introducing a valid entry into the grant table:
- * 1. Write ent->domid.
- * 2. Write ent->frame:
- * GTF_permit_access: Frame to which access is permitted.
- * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
- * frame, or zero if none.
- * 3. Write memory barrier (WMB).
- * 4. Write ent->flags, inc. valid type.
- */
- shared[ref].frame = frame;
- shared[ref].domid = domid;
+ gnttab_shared.v2[ref].hdr.domid = domid;
+ gnttab_shared.v2[ref].full_page.frame = frame;
wmb();
- shared[ref].flags = flags;
+ gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags;
}
/*
void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
unsigned long frame, int readonly)
{
- update_grant_entry(ref, domid, frame,
+ gnttab_interface->update_entry(ref, domid, frame,
GTF_permit_access | (readonly ? GTF_readonly : 0));
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
-int gnttab_query_foreign_access(grant_ref_t ref)
+void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off,
+ unsigned length)
+{
+ gnttab_shared.v2[ref].sub_page.frame = frame;
+ gnttab_shared.v2[ref].sub_page.page_off = page_off;
+ gnttab_shared.v2[ref].sub_page.length = length;
+ gnttab_shared.v2[ref].hdr.domid = domid;
+ wmb();
+ gnttab_shared.v2[ref].hdr.flags =
+ GTF_permit_access | GTF_sub_page | flags;
+}
+
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off,
+ unsigned length)
{
- u16 nflags;
+ if (flags & (GTF_accept_transfer | GTF_reading |
+ GTF_writing | GTF_transitive))
+ return -EPERM;
- nflags = shared[ref].flags;
+ if (gnttab_interface->update_subpage_entry == NULL)
+ return -ENOSYS;
- return nflags & (GTF_reading|GTF_writing);
+ gnttab_interface->update_subpage_entry(ref, domid, frame, flags,
+ page_off, length);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref);
+
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+ int flags, unsigned page_off,
+ unsigned length)
+{
+ int ref, rc;
+
+ ref = get_free_entries(1);
+ if (unlikely(ref < 0))
+ return -ENOSPC;
+
+ rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags,
+ page_off, length);
+ if (rc < 0) {
+ put_free_entry(ref);
+ return rc;
+ }
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage);
+
+bool gnttab_subpage_grants_available(void)
+{
+ return gnttab_interface->update_subpage_entry != NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available);
+
+void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid,
+ int flags, domid_t trans_domid,
+ grant_ref_t trans_gref)
+{
+ gnttab_shared.v2[ref].transitive.trans_domid = trans_domid;
+ gnttab_shared.v2[ref].transitive.gref = trans_gref;
+ gnttab_shared.v2[ref].hdr.domid = domid;
+ wmb();
+ gnttab_shared.v2[ref].hdr.flags =
+ GTF_permit_access | GTF_transitive | flags;
+}
+
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+ int flags, domid_t trans_domid,
+ grant_ref_t trans_gref)
+{
+ if (flags & (GTF_accept_transfer | GTF_reading |
+ GTF_writing | GTF_sub_page))
+ return -EPERM;
+
+ if (gnttab_interface->update_trans_entry == NULL)
+ return -ENOSYS;
+
+ gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid,
+ trans_gref);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref);
+
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+ domid_t trans_domid,
+ grant_ref_t trans_gref)
+{
+ int ref, rc;
+
+ ref = get_free_entries(1);
+ if (unlikely(ref < 0))
+ return -ENOSPC;
+
+ rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags,
+ trans_domid, trans_gref);
+ if (rc < 0) {
+ put_free_entry(ref);
+ return rc;
+ }
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans);
+
+bool gnttab_trans_grants_available(void)
+{
+ return gnttab_interface->update_trans_entry != NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_trans_grants_available);
+
+static int gnttab_query_foreign_access_v1(grant_ref_t ref)
+{
+ return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing);
+}
+
+static int gnttab_query_foreign_access_v2(grant_ref_t ref)
+{
+ return grstatus[ref] & (GTF_reading|GTF_writing);
+}
+
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+ return gnttab_interface->query_foreign_access(ref);
}
EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
-int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
{
u16 flags, nflags;
+ u16 *pflags;
- nflags = shared[ref].flags;
+ pflags = &gnttab_shared.v1[ref].flags;
+ nflags = *pflags;
do {
flags = nflags;
if (flags & (GTF_reading|GTF_writing)) {
printk(KERN_ALERT "WARNING: g.e. still in use!\n");
return 0;
}
- } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
+ } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags);
+
+ return 1;
+}
+
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+{
+ gnttab_shared.v2[ref].hdr.flags = 0;
+ mb();
+ if (grstatus[ref] & (GTF_reading|GTF_writing)) {
+ return 0;
+ } else {
+ /* The read of grstatus needs to have acquire
+ semantics. On x86, reads already have
+ that, and we just need to protect against
+ compiler reorderings. On other
+ architectures we may need a full
+ barrier. */
+#ifdef CONFIG_X86
+ barrier();
+#else
+ mb();
+#endif
+ }
return 1;
}
+
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+ return gnttab_interface->end_foreign_access_ref(ref, readonly);
+}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
unsigned long pfn)
{
- update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
+ gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
-unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
{
unsigned long frame;
u16 flags;
+ u16 *pflags;
+
+ pflags = &gnttab_shared.v1[ref].flags;
/*
* If a transfer is not even yet started, try to reclaim the grant
* reference and return failure (== 0).
*/
- while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
- if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+ while (!((flags = *pflags) & GTF_transfer_committed)) {
+ if (sync_cmpxchg(pflags, flags, 0) == flags)
return 0;
cpu_relax();
}
/* If a transfer is in progress then wait until it is completed. */
while (!(flags & GTF_transfer_completed)) {
- flags = shared[ref].flags;
+ flags = *pflags;
cpu_relax();
}
rmb(); /* Read the frame number /after/ reading completion status. */
- frame = shared[ref].frame;
+ frame = gnttab_shared.v1[ref].frame;
+ BUG_ON(frame == 0);
+
+ return frame;
+}
+
+static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
+{
+ unsigned long frame;
+ u16 flags;
+ u16 *pflags;
+
+ pflags = &gnttab_shared.v2[ref].hdr.flags;
+
+ /*
+ * If a transfer is not even yet started, try to reclaim the grant
+ * reference and return failure (== 0).
+ */
+ while (!((flags = *pflags) & GTF_transfer_committed)) {
+ if (sync_cmpxchg(pflags, flags, 0) == flags)
+ return 0;
+ cpu_relax();
+ }
+
+ /* If a transfer is in progress then wait until it is completed. */
+ while (!(flags & GTF_transfer_completed)) {
+ flags = *pflags;
+ cpu_relax();
+ }
+
+ rmb(); /* Read the frame number /after/ reading completion status. */
+ frame = gnttab_shared.v2[ref].full_page.frame;
BUG_ON(frame == 0);
return frame;
}
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+ return gnttab_interface->end_foreign_transfer_ref(ref);
+}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
- struct gnttab_map_grant_ref *kmap_ops,
- struct page **pages, unsigned int count)
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
{
int i, ret;
pte_t *pte;
EXPORT_SYMBOL_GPL(gnttab_map_refs);
int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
- struct page **pages, unsigned int count)
+ struct page **pages, unsigned int count)
{
int i, ret;
}
EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
+static unsigned nr_status_frames(unsigned nr_grant_frames)
+{
+ return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP;
+}
+
+static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes)
+{
+ int rc;
+
+ rc = arch_gnttab_map_shared(frames, nr_gframes,
+ gnttab_max_grant_frames(),
+ &gnttab_shared.addr);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+static void gnttab_unmap_frames_v1(void)
+{
+ arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+}
+
+static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes)
+{
+ uint64_t *sframes;
+ unsigned int nr_sframes;
+ struct gnttab_get_status_frames getframes;
+ int rc;
+
+ nr_sframes = nr_status_frames(nr_gframes);
+
+ /* No need for kzalloc as it is initialized in following hypercall
+ * GNTTABOP_get_status_frames.
+ */
+ sframes = kmalloc(nr_sframes * sizeof(uint64_t), GFP_ATOMIC);
+ if (!sframes)
+ return -ENOMEM;
+
+ getframes.dom = DOMID_SELF;
+ getframes.nr_frames = nr_sframes;
+ set_xen_guest_handle(getframes.frame_list, sframes);
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames,
+ &getframes, 1);
+ if (rc == -ENOSYS) {
+ kfree(sframes);
+ return -ENOSYS;
+ }
+
+ BUG_ON(rc || getframes.status);
+
+ rc = arch_gnttab_map_status(sframes, nr_sframes,
+ nr_status_frames(gnttab_max_grant_frames()),
+ &grstatus);
+ BUG_ON(rc);
+ kfree(sframes);
+
+ rc = arch_gnttab_map_shared(frames, nr_gframes,
+ gnttab_max_grant_frames(),
+ &gnttab_shared.addr);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+static void gnttab_unmap_frames_v2(void)
+{
+ arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+ arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames));
+}
+
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct gnttab_setup_table setup;
return rc;
}
+ /* No need for kzalloc as it is initialized in following hypercall
+ * GNTTABOP_setup_table.
+ */
frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
if (!frames)
return -ENOMEM;
BUG_ON(rc || setup.status);
- rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
- &shared);
- BUG_ON(rc);
+ rc = gnttab_interface->map_frames(frames, nr_gframes);
kfree(frames);
- return 0;
+ return rc;
+}
+
+static struct gnttab_ops gnttab_v1_ops = {
+ .map_frames = gnttab_map_frames_v1,
+ .unmap_frames = gnttab_unmap_frames_v1,
+ .update_entry = gnttab_update_entry_v1,
+ .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1,
+ .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1,
+ .query_foreign_access = gnttab_query_foreign_access_v1,
+};
+
+static struct gnttab_ops gnttab_v2_ops = {
+ .map_frames = gnttab_map_frames_v2,
+ .unmap_frames = gnttab_unmap_frames_v2,
+ .update_entry = gnttab_update_entry_v2,
+ .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2,
+ .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2,
+ .query_foreign_access = gnttab_query_foreign_access_v2,
+ .update_subpage_entry = gnttab_update_subpage_entry_v2,
+ .update_trans_entry = gnttab_update_trans_entry_v2,
+};
+
+static void gnttab_request_version(void)
+{
+ int rc;
+ struct gnttab_set_version gsv;
+
+ gsv.version = 2;
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
+ if (rc == 0) {
+ grant_table_version = 2;
+ gnttab_interface = &gnttab_v2_ops;
+ } else if (grant_table_version == 2) {
+ /*
+ * If we've already used version 2 features,
+ * but then suddenly discover that they're not
+ * available (e.g. migrating to an older
+ * version of Xen), almost unbounded badness
+ * can happen.
+ */
+ panic("we need grant tables version 2, but only version 1 is available");
+ } else {
+ grant_table_version = 1;
+ gnttab_interface = &gnttab_v1_ops;
+ }
+ printk(KERN_INFO "Grant tables using version %d layout.\n",
+ grant_table_version);
}
int gnttab_resume(void)
{
unsigned int max_nr_gframes;
+ gnttab_request_version();
max_nr_gframes = gnttab_max_grant_frames();
if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
if (xen_pv_domain())
return gnttab_map(0, nr_grant_frames - 1);
- if (!shared) {
- shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
- if (shared == NULL) {
+ if (gnttab_shared.addr == NULL) {
+ gnttab_shared.addr = ioremap(xen_hvm_resume_frames,
+ PAGE_SIZE * max_nr_gframes);
+ if (gnttab_shared.addr == NULL) {
printk(KERN_WARNING
"Failed to ioremap gnttab share frames!");
return -ENOMEM;
int gnttab_suspend(void)
{
- arch_gnttab_unmap_shared(shared, nr_grant_frames);
+ gnttab_interface->unmap_frames();
return 0;
}
--- /dev/null
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; its up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+ struct list_head *pos,
+ int (*fn)(void *data, void *state),
+ void *state)
+{
+ void *pagedata;
+ unsigned pageidx;
+ int ret = 0;
+
+ BUG_ON(size > PAGE_SIZE);
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* hush, gcc */
+
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page;
+ pos = pos->next;
+ page = list_entry(pos, struct page, lru);
+ pagedata = page_address(page);
+ pageidx = 0;
+ }
+
+ ret = (*fn)(pagedata + pageidx, state);
+ if (ret)
+ break;
+ pageidx += size;
+ }
+
+ return ret;
+}
+
+struct mmap_mfn_state {
+ unsigned long va;
+ struct vm_area_struct *vma;
+ domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+ struct privcmd_mmap_entry *msg = data;
+ struct mmap_mfn_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ int rc;
+
+ /* Do not allow range to wrap the address space. */
+ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+ return -EINVAL;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg->va != st->va) ||
+ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+ return -EINVAL;
+
+ rc = xen_remap_domain_mfn_range(vma,
+ msg->va & PAGE_MASK,
+ msg->mfn, msg->npages,
+ vma->vm_page_prot,
+ st->domain);
+ if (rc < 0)
+ return rc;
+
+ st->va += msg->npages << PAGE_SHIFT;
+
+ return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+ struct privcmd_mmap mmapcmd;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+ LIST_HEAD(pagelist);
+ struct mmap_mfn_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ rc = gather_array(&pagelist,
+ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ mmapcmd.entry);
+
+ if (rc || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ {
+ struct page *page = list_first_entry(&pagelist,
+ struct page, lru);
+ struct privcmd_mmap_entry *msg = page_address(page);
+
+ vma = find_vma(mm, msg->va);
+ rc = -EINVAL;
+
+ if (!vma || (msg->va != vma->vm_start) ||
+ !privcmd_enforce_singleshot_mapping(vma))
+ goto out_up;
+ }
+
+ state.va = vma->vm_start;
+ state.vma = vma;
+ state.domain = mmapcmd.dom;
+
+ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ &pagelist,
+ mmap_mfn_range, &state);
+
+
+out_up:
+ up_write(&mm->mmap_sem);
+
+out:
+ free_page_list(&pagelist);
+
+ return rc;
+}
+
+struct mmap_batch_state {
+ domid_t domain;
+ unsigned long va;
+ struct vm_area_struct *vma;
+ int err;
+
+ xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+ st->vma->vm_page_prot, st->domain) < 0) {
+ *mfnp |= 0xf0000000U;
+ st->err++;
+ }
+ st->va += PAGE_SIZE;
+
+ return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ return put_user(*mfnp, st->user++);
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+ int ret;
+ struct privcmd_mmapbatch m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long nr_pages;
+ LIST_HEAD(pagelist);
+ struct mmap_batch_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&m, udata, sizeof(m)))
+ return -EFAULT;
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+ m.arr);
+
+ if (ret || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ ret = -EINVAL;
+ if (!vma ||
+ vma->vm_ops != &privcmd_vm_ops ||
+ (m.addr != vma->vm_start) ||
+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+ !privcmd_enforce_singleshot_mapping(vma)) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ state.domain = m.dom;
+ state.vma = vma;
+ state.va = m.addr;
+ state.err = 0;
+
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_batch_fn, &state);
+
+ up_write(&mm->mmap_sem);
+
+ if (state.err > 0) {
+ state.user = m.arr;
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist,
+ mmap_return_errors, &state);
+ }
+
+out:
+ free_page_list(&pagelist);
+
+ return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL:
+ ret = privcmd_ioctl_hypercall(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAP:
+ ret = privcmd_ioctl_mmap(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ ret = privcmd_ioctl_mmap_batch(udata);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+ vma, vma->vm_start, vma->vm_end,
+ vmf->pgoff, vmf->virtual_address);
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /* Unsupported for auto-translate guests. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+
+ /* DONTCOPY is essential for Xen because copy_page_range doesn't know
+ * how to recreate these mappings */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+
+const struct file_operations xen_privcmd_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+static struct miscdevice privcmd_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/privcmd",
+ .fops = &xen_privcmd_fops,
+};
+
+static int __init privcmd_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&privcmd_dev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register Xen privcmd device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit privcmd_exit(void)
+{
+ misc_deregister(&privcmd_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
--- /dev/null
+#include <linux/fs.h>
+
+extern const struct file_operations xen_privcmd_fops;
obj-y += xenbus.o
+obj-y += xenbus_dev_frontend.o
xenbus-objs =
xenbus-objs += xenbus_client.o
xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
xenbus-objs += $(xenbus-be-objs-y)
+obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o
obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
#ifndef _XENBUS_COMMS_H
#define _XENBUS_COMMS_H
+#include <linux/fs.h>
+
int xs_init(void);
int xb_init_comms(void);
extern struct xenstore_domain_interface *xen_store_interface;
extern int xen_store_evtchn;
+extern const struct file_operations xen_xenbus_fops;
+
#endif /* _XENBUS_COMMS_H */
--- /dev/null
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+
+#include <xen/page.h>
+#include <xen/xenbus_dev.h>
+
+#include "xenbus_comms.h"
+
+MODULE_LICENSE("GPL");
+
+static int xenbus_backend_open(struct inode *inode, struct file *filp)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return nonseekable_open(inode, filp);
+}
+
+static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned long data)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IOCTL_XENBUS_BACKEND_EVTCHN:
+ if (xen_store_evtchn > 0)
+ return xen_store_evtchn;
+ return -ENODEV;
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+ return -EINVAL;
+
+ if (remap_pfn_range(vma, vma->vm_start,
+ virt_to_pfn(xen_store_interface),
+ size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+const struct file_operations xenbus_backend_fops = {
+ .open = xenbus_backend_open,
+ .mmap = xenbus_backend_mmap,
+ .unlocked_ioctl = xenbus_backend_ioctl,
+};
+
+static struct miscdevice xenbus_backend_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/xenbus_backend",
+ .fops = &xenbus_backend_fops,
+};
+
+static int __init xenbus_backend_init(void)
+{
+ int err;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ err = misc_register(&xenbus_backend_dev);
+ if (err)
+ printk(KERN_ERR "Could not register xenbus backend device\n");
+ return err;
+}
+
+static void __exit xenbus_backend_exit(void)
+{
+ misc_deregister(&xenbus_backend_dev);
+}
+
+module_init(xenbus_backend_init);
+module_exit(xenbus_backend_exit);
--- /dev/null
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem
+ * and /proc/xen compatibility mount point.
+ * Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include "xenbus_comms.h"
+
+#include <xen/xenbus.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
+MODULE_LICENSE("GPL");
+
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still waiting a reply.
+ */
+struct xenbus_transaction_holder {
+ struct list_head list;
+ struct xenbus_transaction handle;
+};
+
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+ struct list_head list;
+ unsigned int cons;
+ unsigned int len;
+ char msg[];
+};
+
+struct xenbus_file_priv {
+ /*
+ * msgbuffer_mutex is held while partial requests are built up
+ * and complete requests are acted on. It therefore protects
+ * the "transactions" and "watches" lists, and the partial
+ * request length and buffer.
+ *
+ * reply_mutex protects the reply being built up to return to
+ * usermode. It nests inside msgbuffer_mutex but may be held
+ * alone during a watch callback.
+ */
+ struct mutex msgbuffer_mutex;
+
+ /* In-progress transactions */
+ struct list_head transactions;
+
+ /* Active watches. */
+ struct list_head watches;
+
+ /* Partial request. */
+ unsigned int len;
+ union {
+ struct xsd_sockmsg msg;
+ char buffer[PAGE_SIZE];
+ } u;
+
+ /* Response queue. */
+ struct mutex reply_mutex;
+ struct list_head read_buffers;
+ wait_queue_head_t read_waitq;
+
+};
+
+/* Read out any raw xenbus messages queued up. */
+static ssize_t xenbus_file_read(struct file *filp,
+ char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ struct read_buffer *rb;
+ unsigned i;
+ int ret;
+
+ mutex_lock(&u->reply_mutex);
+again:
+ while (list_empty(&u->read_buffers)) {
+ mutex_unlock(&u->reply_mutex);
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(u->read_waitq,
+ !list_empty(&u->read_buffers));
+ if (ret)
+ return ret;
+ mutex_lock(&u->reply_mutex);
+ }
+
+ rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+ i = 0;
+ while (i < len) {
+ unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
+
+ ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+
+ i += sz - ret;
+ rb->cons += sz - ret;
+
+ if (ret != 0) {
+ if (i == 0)
+ i = -EFAULT;
+ goto out;
+ }
+
+ /* Clear out buffer if it has been consumed */
+ if (rb->cons == rb->len) {
+ list_del(&rb->list);
+ kfree(rb);
+ if (list_empty(&u->read_buffers))
+ break;
+ rb = list_entry(u->read_buffers.next,
+ struct read_buffer, list);
+ }
+ }
+ if (i == 0)
+ goto again;
+
+out:
+ mutex_unlock(&u->reply_mutex);
+ return i;
+}
+
+/*
+ * Add a buffer to the queue. Caller must hold the appropriate lock
+ * if the queue is not local. (Commonly the caller will build up
+ * multiple queued buffers on a temporary local list, and then add it
+ * to the appropriate list under lock once all the buffers have een
+ * successfully allocated.)
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+ struct read_buffer *rb;
+
+ if (len == 0)
+ return 0;
+
+ rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+ if (rb == NULL)
+ return -ENOMEM;
+
+ rb->cons = 0;
+ rb->len = len;
+
+ memcpy(rb->msg, data, len);
+
+ list_add_tail(&rb->list, queue);
+ return 0;
+}
+
+/*
+ * Free all the read_buffer s on a list.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+ struct read_buffer *rb;
+
+ while (!list_empty(list)) {
+ rb = list_entry(list->next, struct read_buffer, list);
+ list_del(list->next);
+ kfree(rb);
+ }
+}
+
+struct watch_adapter {
+ struct list_head list;
+ struct xenbus_watch watch;
+ struct xenbus_file_priv *dev_data;
+ char *token;
+};
+
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+ kfree(watch->watch.node);
+ kfree(watch->token);
+ kfree(watch);
+}
+
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+ const char *token)
+{
+ struct watch_adapter *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (watch == NULL)
+ goto out_fail;
+
+ watch->watch.node = kstrdup(path, GFP_KERNEL);
+ if (watch->watch.node == NULL)
+ goto out_free;
+
+ watch->token = kstrdup(token, GFP_KERNEL);
+ if (watch->token == NULL)
+ goto out_free;
+
+ return watch;
+
+out_free:
+ free_watch_adapter(watch);
+
+out_fail:
+ return NULL;
+}
+
+static void watch_fired(struct xenbus_watch *watch,
+ const char **vec,
+ unsigned int len)
+{
+ struct watch_adapter *adap;
+ struct xsd_sockmsg hdr;
+ const char *path, *token;
+ int path_len, tok_len, body_len, data_len = 0;
+ int ret;
+ LIST_HEAD(staging_q);
+
+ adap = container_of(watch, struct watch_adapter, watch);
+
+ path = vec[XS_WATCH_PATH];
+ token = adap->token;
+
+ path_len = strlen(path) + 1;
+ tok_len = strlen(token) + 1;
+ if (len > 2)
+ data_len = vec[len] - vec[2] + 1;
+ body_len = path_len + tok_len + data_len;
+
+ hdr.type = XS_WATCH_EVENT;
+ hdr.len = body_len;
+
+ mutex_lock(&adap->dev_data->reply_mutex);
+
+ ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+ if (!ret)
+ ret = queue_reply(&staging_q, path, path_len);
+ if (!ret)
+ ret = queue_reply(&staging_q, token, tok_len);
+ if (!ret && len > 2)
+ ret = queue_reply(&staging_q, vec[2], data_len);
+
+ if (!ret) {
+ /* success: pass reply list onto watcher */
+ list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+ wake_up(&adap->dev_data->read_waitq);
+ } else
+ queue_cleanup(&staging_q);
+
+ mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+static int xenbus_write_transaction(unsigned msg_type,
+ struct xenbus_file_priv *u)
+{
+ int rc;
+ void *reply;
+ struct xenbus_transaction_holder *trans = NULL;
+ LIST_HEAD(staging_q);
+
+ if (msg_type == XS_TRANSACTION_START) {
+ trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+ if (!trans) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ reply = xenbus_dev_request_and_reply(&u->u.msg);
+ if (IS_ERR(reply)) {
+ kfree(trans);
+ rc = PTR_ERR(reply);
+ goto out;
+ }
+
+ if (msg_type == XS_TRANSACTION_START) {
+ trans->handle.id = simple_strtoul(reply, NULL, 0);
+
+ list_add(&trans->list, &u->transactions);
+ } else if (msg_type == XS_TRANSACTION_END) {
+ list_for_each_entry(trans, &u->transactions, list)
+ if (trans->handle.id == u->u.msg.tx_id)
+ break;
+ BUG_ON(&trans->list == &u->transactions);
+ list_del(&trans->list);
+
+ kfree(trans);
+ }
+
+ mutex_lock(&u->reply_mutex);
+ rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
+ if (!rc)
+ rc = queue_reply(&staging_q, reply, u->u.msg.len);
+ if (!rc) {
+ list_splice_tail(&staging_q, &u->read_buffers);
+ wake_up(&u->read_waitq);
+ } else {
+ queue_cleanup(&staging_q);
+ }
+ mutex_unlock(&u->reply_mutex);
+
+ kfree(reply);
+
+out:
+ return rc;
+}
+
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+ struct watch_adapter *watch, *tmp_watch;
+ char *path, *token;
+ int err, rc;
+ LIST_HEAD(staging_q);
+
+ path = u->u.buffer + sizeof(u->u.msg);
+ token = memchr(path, 0, u->u.msg.len);
+ if (token == NULL) {
+ rc = -EILSEQ;
+ goto out;
+ }
+ token++;
+
+ if (msg_type == XS_WATCH) {
+ watch = alloc_watch_adapter(path, token);
+ if (watch == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ watch->watch.callback = watch_fired;
+ watch->dev_data = u;
+
+ err = register_xenbus_watch(&watch->watch);
+ if (err) {
+ free_watch_adapter(watch);
+ rc = err;
+ goto out;
+ }
+ list_add(&watch->list, &u->watches);
+ } else {
+ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+ if (!strcmp(watch->token, token) &&
+ !strcmp(watch->watch.node, path)) {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ break;
+ }
+ }
+ }
+
+ /* Success. Synthesize a reply to say all is OK. */
+ {
+ struct {
+ struct xsd_sockmsg hdr;
+ char body[3];
+ } __packed reply = {
+ {
+ .type = msg_type,
+ .len = sizeof(reply.body)
+ },
+ "OK"
+ };
+
+ mutex_lock(&u->reply_mutex);
+ rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+ wake_up(&u->read_waitq);
+ mutex_unlock(&u->reply_mutex);
+ }
+
+out:
+ return rc;
+}
+
+static ssize_t xenbus_file_write(struct file *filp,
+ const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ uint32_t msg_type;
+ int rc = len;
+ int ret;
+ LIST_HEAD(staging_q);
+
+ /*
+ * We're expecting usermode to be writing properly formed
+ * xenbus messages. If they write an incomplete message we
+ * buffer it up. Once it is complete, we act on it.
+ */
+
+ /*
+ * Make sure concurrent writers can't stomp all over each
+ * other's messages and make a mess of our partial message
+ * buffer. We don't make any attemppt to stop multiple
+ * writers from making a mess of each other's incomplete
+ * messages; we're just trying to guarantee our own internal
+ * consistency and make sure that single writes are handled
+ * atomically.
+ */
+ mutex_lock(&u->msgbuffer_mutex);
+
+ /* Get this out of the way early to avoid confusion */
+ if (len == 0)
+ goto out;
+
+ /* Can't write a xenbus message larger we can buffer */
+ if ((len + u->len) > sizeof(u->u.buffer)) {
+ /* On error, dump existing buffer */
+ u->len = 0;
+ rc = -EINVAL;
+ goto out;
+ }
+
+ ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
+
+ if (ret != 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ /* Deal with a partial copy. */
+ len -= ret;
+ rc = len;
+
+ u->len += len;
+
+ /* Return if we haven't got a full message yet */
+ if (u->len < sizeof(u->u.msg))
+ goto out; /* not even the header yet */
+
+ /* If we're expecting a message that's larger than we can
+ possibly send, dump what we have and return an error. */
+ if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
+ rc = -E2BIG;
+ u->len = 0;
+ goto out;
+ }
+
+ if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+ goto out; /* incomplete data portion */
+
+ /*
+ * OK, now we have a complete message. Do something with it.
+ */
+
+ msg_type = u->u.msg.type;
+
+ switch (msg_type) {
+ case XS_WATCH:
+ case XS_UNWATCH:
+ /* (Un)Ask for some path to be watched for changes */
+ ret = xenbus_write_watch(msg_type, u);
+ break;
+
+ default:
+ /* Send out a transaction */
+ ret = xenbus_write_transaction(msg_type, u);
+ break;
+ }
+ if (ret != 0)
+ rc = ret;
+
+ /* Buffered message consumed */
+ u->len = 0;
+
+ out:
+ mutex_unlock(&u->msgbuffer_mutex);
+ return rc;
+}
+
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+ struct xenbus_file_priv *u;
+
+ if (xen_store_evtchn == 0)
+ return -ENOENT;
+
+ nonseekable_open(inode, filp);
+
+ u = kzalloc(sizeof(*u), GFP_KERNEL);
+ if (u == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&u->transactions);
+ INIT_LIST_HEAD(&u->watches);
+ INIT_LIST_HEAD(&u->read_buffers);
+ init_waitqueue_head(&u->read_waitq);
+
+ mutex_init(&u->reply_mutex);
+ mutex_init(&u->msgbuffer_mutex);
+
+ filp->private_data = u;
+
+ return 0;
+}
+
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ struct xenbus_transaction_holder *trans, *tmp;
+ struct watch_adapter *watch, *tmp_watch;
+ struct read_buffer *rb, *tmp_rb;
+
+ /*
+ * No need for locking here because there are no other users,
+ * by definition.
+ */
+
+ list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
+ xenbus_transaction_end(trans->handle, 1);
+ list_del(&trans->list);
+ kfree(trans);
+ }
+
+ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ }
+
+ list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
+ list_del(&rb->list);
+ kfree(rb);
+ }
+ kfree(u);
+
+ return 0;
+}
+
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+ struct xenbus_file_priv *u = file->private_data;
+
+ poll_wait(file, &u->read_waitq, wait);
+ if (!list_empty(&u->read_buffers))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+const struct file_operations xen_xenbus_fops = {
+ .read = xenbus_file_read,
+ .write = xenbus_file_write,
+ .open = xenbus_file_open,
+ .release = xenbus_file_release,
+ .poll = xenbus_file_poll,
+ .llseek = no_llseek,
+};
+EXPORT_SYMBOL_GPL(xen_xenbus_fops);
+
+static struct miscdevice xenbus_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/xenbus",
+ .fops = &xen_xenbus_fops,
+};
+
+static int __init xenbus_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&xenbus_dev);
+ if (err)
+ printk(KERN_ERR "Could not register xenbus frontend device\n");
+ return err;
+}
+
+static void __exit xenbus_exit(void)
+{
+ misc_deregister(&xenbus_dev);
+}
+
+module_init(xenbus_init);
+module_exit(xenbus_exit);
obj-$(CONFIG_XENFS) += xenfs.o
-xenfs-y = super.o xenbus.o privcmd.o
+xenfs-y = super.o
xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
+++ /dev/null
-/******************************************************************************
- * privcmd.c
- *
- * Interface to privileged domain-0 commands.
- *
- * Copyright (c) 2002-2004, K A Fraser, B Dragovic
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/uaccess.h>
-#include <linux/swap.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/seq_file.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/tlb.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/xen.h>
-#include <xen/privcmd.h>
-#include <xen/interface/xen.h>
-#include <xen/features.h>
-#include <xen/page.h>
-#include <xen/xen-ops.h>
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
-#endif
-
-static long privcmd_ioctl_hypercall(void __user *udata)
-{
- struct privcmd_hypercall hypercall;
- long ret;
-
- if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
- return -EFAULT;
-
- ret = privcmd_call(hypercall.op,
- hypercall.arg[0], hypercall.arg[1],
- hypercall.arg[2], hypercall.arg[3],
- hypercall.arg[4]);
-
- return ret;
-}
-
-static void free_page_list(struct list_head *pages)
-{
- struct page *p, *n;
-
- list_for_each_entry_safe(p, n, pages, lru)
- __free_page(p);
-
- INIT_LIST_HEAD(pages);
-}
-
-/*
- * Given an array of items in userspace, return a list of pages
- * containing the data. If copying fails, either because of memory
- * allocation failure or a problem reading user memory, return an
- * error code; its up to the caller to dispose of any partial list.
- */
-static int gather_array(struct list_head *pagelist,
- unsigned nelem, size_t size,
- void __user *data)
-{
- unsigned pageidx;
- void *pagedata;
- int ret;
-
- if (size > PAGE_SIZE)
- return 0;
-
- pageidx = PAGE_SIZE;
- pagedata = NULL; /* quiet, gcc */
- while (nelem--) {
- if (pageidx > PAGE_SIZE-size) {
- struct page *page = alloc_page(GFP_KERNEL);
-
- ret = -ENOMEM;
- if (page == NULL)
- goto fail;
-
- pagedata = page_address(page);
-
- list_add_tail(&page->lru, pagelist);
- pageidx = 0;
- }
-
- ret = -EFAULT;
- if (copy_from_user(pagedata + pageidx, data, size))
- goto fail;
-
- data += size;
- pageidx += size;
- }
-
- ret = 0;
-
-fail:
- return ret;
-}
-
-/*
- * Call function "fn" on each element of the array fragmented
- * over a list of pages.
- */
-static int traverse_pages(unsigned nelem, size_t size,
- struct list_head *pos,
- int (*fn)(void *data, void *state),
- void *state)
-{
- void *pagedata;
- unsigned pageidx;
- int ret = 0;
-
- BUG_ON(size > PAGE_SIZE);
-
- pageidx = PAGE_SIZE;
- pagedata = NULL; /* hush, gcc */
-
- while (nelem--) {
- if (pageidx > PAGE_SIZE-size) {
- struct page *page;
- pos = pos->next;
- page = list_entry(pos, struct page, lru);
- pagedata = page_address(page);
- pageidx = 0;
- }
-
- ret = (*fn)(pagedata + pageidx, state);
- if (ret)
- break;
- pageidx += size;
- }
-
- return ret;
-}
-
-struct mmap_mfn_state {
- unsigned long va;
- struct vm_area_struct *vma;
- domid_t domain;
-};
-
-static int mmap_mfn_range(void *data, void *state)
-{
- struct privcmd_mmap_entry *msg = data;
- struct mmap_mfn_state *st = state;
- struct vm_area_struct *vma = st->vma;
- int rc;
-
- /* Do not allow range to wrap the address space. */
- if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
- ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
- return -EINVAL;
-
- /* Range chunks must be contiguous in va space. */
- if ((msg->va != st->va) ||
- ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
- return -EINVAL;
-
- rc = xen_remap_domain_mfn_range(vma,
- msg->va & PAGE_MASK,
- msg->mfn, msg->npages,
- vma->vm_page_prot,
- st->domain);
- if (rc < 0)
- return rc;
-
- st->va += msg->npages << PAGE_SHIFT;
-
- return 0;
-}
-
-static long privcmd_ioctl_mmap(void __user *udata)
-{
- struct privcmd_mmap mmapcmd;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int rc;
- LIST_HEAD(pagelist);
- struct mmap_mfn_state state;
-
- if (!xen_initial_domain())
- return -EPERM;
-
- if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
- return -EFAULT;
-
- rc = gather_array(&pagelist,
- mmapcmd.num, sizeof(struct privcmd_mmap_entry),
- mmapcmd.entry);
-
- if (rc || list_empty(&pagelist))
- goto out;
-
- down_write(&mm->mmap_sem);
-
- {
- struct page *page = list_first_entry(&pagelist,
- struct page, lru);
- struct privcmd_mmap_entry *msg = page_address(page);
-
- vma = find_vma(mm, msg->va);
- rc = -EINVAL;
-
- if (!vma || (msg->va != vma->vm_start) ||
- !privcmd_enforce_singleshot_mapping(vma))
- goto out_up;
- }
-
- state.va = vma->vm_start;
- state.vma = vma;
- state.domain = mmapcmd.dom;
-
- rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
- &pagelist,
- mmap_mfn_range, &state);
-
-
-out_up:
- up_write(&mm->mmap_sem);
-
-out:
- free_page_list(&pagelist);
-
- return rc;
-}
-
-struct mmap_batch_state {
- domid_t domain;
- unsigned long va;
- struct vm_area_struct *vma;
- int err;
-
- xen_pfn_t __user *user;
-};
-
-static int mmap_batch_fn(void *data, void *state)
-{
- xen_pfn_t *mfnp = data;
- struct mmap_batch_state *st = state;
-
- if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
- st->vma->vm_page_prot, st->domain) < 0) {
- *mfnp |= 0xf0000000U;
- st->err++;
- }
- st->va += PAGE_SIZE;
-
- return 0;
-}
-
-static int mmap_return_errors(void *data, void *state)
-{
- xen_pfn_t *mfnp = data;
- struct mmap_batch_state *st = state;
-
- return put_user(*mfnp, st->user++);
-}
-
-static struct vm_operations_struct privcmd_vm_ops;
-
-static long privcmd_ioctl_mmap_batch(void __user *udata)
-{
- int ret;
- struct privcmd_mmapbatch m;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long nr_pages;
- LIST_HEAD(pagelist);
- struct mmap_batch_state state;
-
- if (!xen_initial_domain())
- return -EPERM;
-
- if (copy_from_user(&m, udata, sizeof(m)))
- return -EFAULT;
-
- nr_pages = m.num;
- if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
- return -EINVAL;
-
- ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
- m.arr);
-
- if (ret || list_empty(&pagelist))
- goto out;
-
- down_write(&mm->mmap_sem);
-
- vma = find_vma(mm, m.addr);
- ret = -EINVAL;
- if (!vma ||
- vma->vm_ops != &privcmd_vm_ops ||
- (m.addr != vma->vm_start) ||
- ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
- !privcmd_enforce_singleshot_mapping(vma)) {
- up_write(&mm->mmap_sem);
- goto out;
- }
-
- state.domain = m.dom;
- state.vma = vma;
- state.va = m.addr;
- state.err = 0;
-
- ret = traverse_pages(m.num, sizeof(xen_pfn_t),
- &pagelist, mmap_batch_fn, &state);
-
- up_write(&mm->mmap_sem);
-
- if (state.err > 0) {
- state.user = m.arr;
- ret = traverse_pages(m.num, sizeof(xen_pfn_t),
- &pagelist,
- mmap_return_errors, &state);
- }
-
-out:
- free_page_list(&pagelist);
-
- return ret;
-}
-
-static long privcmd_ioctl(struct file *file,
- unsigned int cmd, unsigned long data)
-{
- int ret = -ENOSYS;
- void __user *udata = (void __user *) data;
-
- switch (cmd) {
- case IOCTL_PRIVCMD_HYPERCALL:
- ret = privcmd_ioctl_hypercall(udata);
- break;
-
- case IOCTL_PRIVCMD_MMAP:
- ret = privcmd_ioctl_mmap(udata);
- break;
-
- case IOCTL_PRIVCMD_MMAPBATCH:
- ret = privcmd_ioctl_mmap_batch(udata);
- break;
-
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
- vma, vma->vm_start, vma->vm_end,
- vmf->pgoff, vmf->virtual_address);
-
- return VM_FAULT_SIGBUS;
-}
-
-static struct vm_operations_struct privcmd_vm_ops = {
- .fault = privcmd_fault
-};
-
-static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /* Unsupported for auto-translate guests. */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -ENOSYS;
-
- /* DONTCOPY is essential for Xen because copy_page_range doesn't know
- * how to recreate these mappings */
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
- vma->vm_ops = &privcmd_vm_ops;
- vma->vm_private_data = NULL;
-
- return 0;
-}
-
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
-{
- return (xchg(&vma->vm_private_data, (void *)1) == NULL);
-}
-#endif
-
-const struct file_operations privcmd_file_ops = {
- .unlocked_ioctl = privcmd_ioctl,
- .mmap = privcmd_mmap,
-};
#include <xen/xen.h>
#include "xenfs.h"
+#include "../privcmd.h"
+#include "../xenbus/xenbus_comms.h"
#include <asm/xen/hypervisor.h>
{
static struct tree_descr xenfs_files[] = {
[1] = {},
- { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+ { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
{ "capabilities", &capabilities_file_ops, S_IRUGO },
- { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
+ { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
{""},
};
int rc;
+++ /dev/null
-/*
- * Driver giving user-space access to the kernel's xenbus connection
- * to xenstore.
- *
- * Copyright (c) 2005, Christian Limpach
- * Copyright (c) 2005, Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Changes:
- * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem
- * and /proc/xen compatibility mount point.
- * Turned xenfs into a loadable module.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/uio.h>
-#include <linux/notifier.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-
-#include "xenfs.h"
-#include "../xenbus/xenbus_comms.h"
-
-#include <xen/xenbus.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * An element of a list of outstanding transactions, for which we're
- * still waiting a reply.
- */
-struct xenbus_transaction_holder {
- struct list_head list;
- struct xenbus_transaction handle;
-};
-
-/*
- * A buffer of data on the queue.
- */
-struct read_buffer {
- struct list_head list;
- unsigned int cons;
- unsigned int len;
- char msg[];
-};
-
-struct xenbus_file_priv {
- /*
- * msgbuffer_mutex is held while partial requests are built up
- * and complete requests are acted on. It therefore protects
- * the "transactions" and "watches" lists, and the partial
- * request length and buffer.
- *
- * reply_mutex protects the reply being built up to return to
- * usermode. It nests inside msgbuffer_mutex but may be held
- * alone during a watch callback.
- */
- struct mutex msgbuffer_mutex;
-
- /* In-progress transactions */
- struct list_head transactions;
-
- /* Active watches. */
- struct list_head watches;
-
- /* Partial request. */
- unsigned int len;
- union {
- struct xsd_sockmsg msg;
- char buffer[PAGE_SIZE];
- } u;
-
- /* Response queue. */
- struct mutex reply_mutex;
- struct list_head read_buffers;
- wait_queue_head_t read_waitq;
-
-};
-
-/* Read out any raw xenbus messages queued up. */
-static ssize_t xenbus_file_read(struct file *filp,
- char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct xenbus_file_priv *u = filp->private_data;
- struct read_buffer *rb;
- unsigned i;
- int ret;
-
- mutex_lock(&u->reply_mutex);
-again:
- while (list_empty(&u->read_buffers)) {
- mutex_unlock(&u->reply_mutex);
- if (filp->f_flags & O_NONBLOCK)
- return -EAGAIN;
-
- ret = wait_event_interruptible(u->read_waitq,
- !list_empty(&u->read_buffers));
- if (ret)
- return ret;
- mutex_lock(&u->reply_mutex);
- }
-
- rb = list_entry(u->read_buffers.next, struct read_buffer, list);
- i = 0;
- while (i < len) {
- unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
-
- ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
-
- i += sz - ret;
- rb->cons += sz - ret;
-
- if (ret != 0) {
- if (i == 0)
- i = -EFAULT;
- goto out;
- }
-
- /* Clear out buffer if it has been consumed */
- if (rb->cons == rb->len) {
- list_del(&rb->list);
- kfree(rb);
- if (list_empty(&u->read_buffers))
- break;
- rb = list_entry(u->read_buffers.next,
- struct read_buffer, list);
- }
- }
- if (i == 0)
- goto again;
-
-out:
- mutex_unlock(&u->reply_mutex);
- return i;
-}
-
-/*
- * Add a buffer to the queue. Caller must hold the appropriate lock
- * if the queue is not local. (Commonly the caller will build up
- * multiple queued buffers on a temporary local list, and then add it
- * to the appropriate list under lock once all the buffers have een
- * successfully allocated.)
- */
-static int queue_reply(struct list_head *queue, const void *data, size_t len)
-{
- struct read_buffer *rb;
-
- if (len == 0)
- return 0;
-
- rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
- if (rb == NULL)
- return -ENOMEM;
-
- rb->cons = 0;
- rb->len = len;
-
- memcpy(rb->msg, data, len);
-
- list_add_tail(&rb->list, queue);
- return 0;
-}
-
-/*
- * Free all the read_buffer s on a list.
- * Caller must have sole reference to list.
- */
-static void queue_cleanup(struct list_head *list)
-{
- struct read_buffer *rb;
-
- while (!list_empty(list)) {
- rb = list_entry(list->next, struct read_buffer, list);
- list_del(list->next);
- kfree(rb);
- }
-}
-
-struct watch_adapter {
- struct list_head list;
- struct xenbus_watch watch;
- struct xenbus_file_priv *dev_data;
- char *token;
-};
-
-static void free_watch_adapter(struct watch_adapter *watch)
-{
- kfree(watch->watch.node);
- kfree(watch->token);
- kfree(watch);
-}
-
-static struct watch_adapter *alloc_watch_adapter(const char *path,
- const char *token)
-{
- struct watch_adapter *watch;
-
- watch = kzalloc(sizeof(*watch), GFP_KERNEL);
- if (watch == NULL)
- goto out_fail;
-
- watch->watch.node = kstrdup(path, GFP_KERNEL);
- if (watch->watch.node == NULL)
- goto out_free;
-
- watch->token = kstrdup(token, GFP_KERNEL);
- if (watch->token == NULL)
- goto out_free;
-
- return watch;
-
-out_free:
- free_watch_adapter(watch);
-
-out_fail:
- return NULL;
-}
-
-static void watch_fired(struct xenbus_watch *watch,
- const char **vec,
- unsigned int len)
-{
- struct watch_adapter *adap;
- struct xsd_sockmsg hdr;
- const char *path, *token;
- int path_len, tok_len, body_len, data_len = 0;
- int ret;
- LIST_HEAD(staging_q);
-
- adap = container_of(watch, struct watch_adapter, watch);
-
- path = vec[XS_WATCH_PATH];
- token = adap->token;
-
- path_len = strlen(path) + 1;
- tok_len = strlen(token) + 1;
- if (len > 2)
- data_len = vec[len] - vec[2] + 1;
- body_len = path_len + tok_len + data_len;
-
- hdr.type = XS_WATCH_EVENT;
- hdr.len = body_len;
-
- mutex_lock(&adap->dev_data->reply_mutex);
-
- ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
- if (!ret)
- ret = queue_reply(&staging_q, path, path_len);
- if (!ret)
- ret = queue_reply(&staging_q, token, tok_len);
- if (!ret && len > 2)
- ret = queue_reply(&staging_q, vec[2], data_len);
-
- if (!ret) {
- /* success: pass reply list onto watcher */
- list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
- wake_up(&adap->dev_data->read_waitq);
- } else
- queue_cleanup(&staging_q);
-
- mutex_unlock(&adap->dev_data->reply_mutex);
-}
-
-static int xenbus_write_transaction(unsigned msg_type,
- struct xenbus_file_priv *u)
-{
- int rc;
- void *reply;
- struct xenbus_transaction_holder *trans = NULL;
- LIST_HEAD(staging_q);
-
- if (msg_type == XS_TRANSACTION_START) {
- trans = kmalloc(sizeof(*trans), GFP_KERNEL);
- if (!trans) {
- rc = -ENOMEM;
- goto out;
- }
- }
-
- reply = xenbus_dev_request_and_reply(&u->u.msg);
- if (IS_ERR(reply)) {
- kfree(trans);
- rc = PTR_ERR(reply);
- goto out;
- }
-
- if (msg_type == XS_TRANSACTION_START) {
- trans->handle.id = simple_strtoul(reply, NULL, 0);
-
- list_add(&trans->list, &u->transactions);
- } else if (msg_type == XS_TRANSACTION_END) {
- list_for_each_entry(trans, &u->transactions, list)
- if (trans->handle.id == u->u.msg.tx_id)
- break;
- BUG_ON(&trans->list == &u->transactions);
- list_del(&trans->list);
-
- kfree(trans);
- }
-
- mutex_lock(&u->reply_mutex);
- rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
- if (!rc)
- rc = queue_reply(&staging_q, reply, u->u.msg.len);
- if (!rc) {
- list_splice_tail(&staging_q, &u->read_buffers);
- wake_up(&u->read_waitq);
- } else {
- queue_cleanup(&staging_q);
- }
- mutex_unlock(&u->reply_mutex);
-
- kfree(reply);
-
-out:
- return rc;
-}
-
-static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
-{
- struct watch_adapter *watch, *tmp_watch;
- char *path, *token;
- int err, rc;
- LIST_HEAD(staging_q);
-
- path = u->u.buffer + sizeof(u->u.msg);
- token = memchr(path, 0, u->u.msg.len);
- if (token == NULL) {
- rc = -EILSEQ;
- goto out;
- }
- token++;
-
- if (msg_type == XS_WATCH) {
- watch = alloc_watch_adapter(path, token);
- if (watch == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
- watch->watch.callback = watch_fired;
- watch->dev_data = u;
-
- err = register_xenbus_watch(&watch->watch);
- if (err) {
- free_watch_adapter(watch);
- rc = err;
- goto out;
- }
- list_add(&watch->list, &u->watches);
- } else {
- list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
- if (!strcmp(watch->token, token) &&
- !strcmp(watch->watch.node, path)) {
- unregister_xenbus_watch(&watch->watch);
- list_del(&watch->list);
- free_watch_adapter(watch);
- break;
- }
- }
- }
-
- /* Success. Synthesize a reply to say all is OK. */
- {
- struct {
- struct xsd_sockmsg hdr;
- char body[3];
- } __packed reply = {
- {
- .type = msg_type,
- .len = sizeof(reply.body)
- },
- "OK"
- };
-
- mutex_lock(&u->reply_mutex);
- rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
- wake_up(&u->read_waitq);
- mutex_unlock(&u->reply_mutex);
- }
-
-out:
- return rc;
-}
-
-static ssize_t xenbus_file_write(struct file *filp,
- const char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct xenbus_file_priv *u = filp->private_data;
- uint32_t msg_type;
- int rc = len;
- int ret;
- LIST_HEAD(staging_q);
-
- /*
- * We're expecting usermode to be writing properly formed
- * xenbus messages. If they write an incomplete message we
- * buffer it up. Once it is complete, we act on it.
- */
-
- /*
- * Make sure concurrent writers can't stomp all over each
- * other's messages and make a mess of our partial message
- * buffer. We don't make any attemppt to stop multiple
- * writers from making a mess of each other's incomplete
- * messages; we're just trying to guarantee our own internal
- * consistency and make sure that single writes are handled
- * atomically.
- */
- mutex_lock(&u->msgbuffer_mutex);
-
- /* Get this out of the way early to avoid confusion */
- if (len == 0)
- goto out;
-
- /* Can't write a xenbus message larger we can buffer */
- if ((len + u->len) > sizeof(u->u.buffer)) {
- /* On error, dump existing buffer */
- u->len = 0;
- rc = -EINVAL;
- goto out;
- }
-
- ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
-
- if (ret != 0) {
- rc = -EFAULT;
- goto out;
- }
-
- /* Deal with a partial copy. */
- len -= ret;
- rc = len;
-
- u->len += len;
-
- /* Return if we haven't got a full message yet */
- if (u->len < sizeof(u->u.msg))
- goto out; /* not even the header yet */
-
- /* If we're expecting a message that's larger than we can
- possibly send, dump what we have and return an error. */
- if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
- rc = -E2BIG;
- u->len = 0;
- goto out;
- }
-
- if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
- goto out; /* incomplete data portion */
-
- /*
- * OK, now we have a complete message. Do something with it.
- */
-
- msg_type = u->u.msg.type;
-
- switch (msg_type) {
- case XS_WATCH:
- case XS_UNWATCH:
- /* (Un)Ask for some path to be watched for changes */
- ret = xenbus_write_watch(msg_type, u);
- break;
-
- default:
- /* Send out a transaction */
- ret = xenbus_write_transaction(msg_type, u);
- break;
- }
- if (ret != 0)
- rc = ret;
-
- /* Buffered message consumed */
- u->len = 0;
-
- out:
- mutex_unlock(&u->msgbuffer_mutex);
- return rc;
-}
-
-static int xenbus_file_open(struct inode *inode, struct file *filp)
-{
- struct xenbus_file_priv *u;
-
- if (xen_store_evtchn == 0)
- return -ENOENT;
-
- nonseekable_open(inode, filp);
-
- u = kzalloc(sizeof(*u), GFP_KERNEL);
- if (u == NULL)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&u->transactions);
- INIT_LIST_HEAD(&u->watches);
- INIT_LIST_HEAD(&u->read_buffers);
- init_waitqueue_head(&u->read_waitq);
-
- mutex_init(&u->reply_mutex);
- mutex_init(&u->msgbuffer_mutex);
-
- filp->private_data = u;
-
- return 0;
-}
-
-static int xenbus_file_release(struct inode *inode, struct file *filp)
-{
- struct xenbus_file_priv *u = filp->private_data;
- struct xenbus_transaction_holder *trans, *tmp;
- struct watch_adapter *watch, *tmp_watch;
- struct read_buffer *rb, *tmp_rb;
-
- /*
- * No need for locking here because there are no other users,
- * by definition.
- */
-
- list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
- xenbus_transaction_end(trans->handle, 1);
- list_del(&trans->list);
- kfree(trans);
- }
-
- list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
- unregister_xenbus_watch(&watch->watch);
- list_del(&watch->list);
- free_watch_adapter(watch);
- }
-
- list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
- list_del(&rb->list);
- kfree(rb);
- }
- kfree(u);
-
- return 0;
-}
-
-static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
-{
- struct xenbus_file_priv *u = file->private_data;
-
- poll_wait(file, &u->read_waitq, wait);
- if (!list_empty(&u->read_buffers))
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-const struct file_operations xenbus_file_ops = {
- .read = xenbus_file_read,
- .write = xenbus_file_write,
- .open = xenbus_file_open,
- .release = xenbus_file_release,
- .poll = xenbus_file_poll,
- .llseek = no_llseek,
-};
#ifndef _XENFS_XENBUS_H
#define _XENFS_XENBUS_H
-extern const struct file_operations xenbus_file_ops;
-extern const struct file_operations privcmd_file_ops;
extern const struct file_operations xsd_kva_file_ops;
extern const struct file_operations xsd_port_file_ops;
*/
void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+/*
+ * Allow extra references to event channels exposed to userspace by evtchn
+ */
+int evtchn_make_refcounted(unsigned int evtchn);
+int evtchn_get(unsigned int evtchn);
+void evtchn_put(unsigned int evtchn);
+
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
int resend_irq_on_evtchn(unsigned int irq);
void rebind_evtchn_irq(int evtchn, int irq);
int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
int readonly);
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+ int flags, unsigned page_off,
+ unsigned length);
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+ domid_t trans_domid,
+ grant_ref_t trans_gref);
+
+/*
+ * Are sub-page grants available on this version of Xen? Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_subpage_grants_available(void);
+
+/*
+ * Are transitive grants available on this version of Xen? Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_trans_grants_available(void);
/*
* End access through the given grant reference, iff the grant entry is no
void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
unsigned long frame, int readonly);
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off,
+ unsigned length);
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+ int flags, domid_t trans_domid,
+ grant_ref_t trans_gref);
void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
unsigned long pfn);
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
- struct grant_entry **__shared);
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
- unsigned long nr_gframes);
+ void **__shared);
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared);
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes);
extern unsigned long xen_hvm_resume_frames;
unsigned int gnttab_max_grant_frames(void);
#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
- struct gnttab_map_grant_ref *kmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count);
int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
struct page **pages, unsigned int count);
* Use SMP-safe bit-setting instruction.
*/
+/*
+ * Reference to a grant entry in a specified domain's grant table.
+ */
+typedef uint32_t grant_ref_t;
+
/*
* A grant table comprises a packed array of grant entries in one or more
* page frames shared between Xen and a guest.
* [XEN]: This field is written by Xen and read by the sharing guest.
* [GST]: This field is written by the guest and read by Xen.
*/
-struct grant_entry {
+
+/*
+ * Version 1 of the grant table entry structure is maintained purely
+ * for backwards compatibility. New guests should use version 2.
+ */
+struct grant_entry_v1 {
/* GTF_xxx: various type and flag information. [XEN,GST] */
uint16_t flags;
/* The domain being granted foreign privileges. [GST] */
* GTF_permit_access: Allow @domid to map/access @frame.
* GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
* to this guest. Xen writes the page number to @frame.
+ * GTF_transitive: Allow @domid to transitively access a subrange of
+ * @trans_grant in @trans_domid. No mappings are allowed.
*/
#define GTF_invalid (0U<<0)
#define GTF_permit_access (1U<<0)
#define GTF_accept_transfer (2U<<0)
+#define GTF_transitive (3U<<0)
#define GTF_type_mask (3U<<0)
/*
* GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
* GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
* GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ * GTF_sub_page: Grant access to only a subrange of the page. @domid
+ * will only be allowed to copy from the grant, and not
+ * map it. [GST]
*/
#define _GTF_readonly (2)
#define GTF_readonly (1U<<_GTF_readonly)
#define GTF_reading (1U<<_GTF_reading)
#define _GTF_writing (4)
#define GTF_writing (1U<<_GTF_writing)
+#define _GTF_sub_page (8)
+#define GTF_sub_page (1U<<_GTF_sub_page)
/*
* Subflags for GTF_accept_transfer:
#define _GTF_transfer_completed (3)
#define GTF_transfer_completed (1U<<_GTF_transfer_completed)
+/*
+ * Version 2 grant table entries. These fulfil the same role as
+ * version 1 entries, but can represent more complicated operations.
+ * Any given domain will have either a version 1 or a version 2 table,
+ * and every entry in the table will be the same version.
+ *
+ * The interface by which domains use grant references does not depend
+ * on the grant table version in use by the other domain.
+ */
-/***********************************
- * GRANT TABLE QUERIES AND USES
+/*
+ * Version 1 and version 2 grant entries share a common prefix. The
+ * fields of the prefix are documented as part of struct
+ * grant_entry_v1.
*/
+struct grant_entry_header {
+ uint16_t flags;
+ domid_t domid;
+};
/*
- * Reference to a grant entry in a specified domain's grant table.
+ * Version 2 of the grant entry structure, here is an union because three
+ * different types are suppotted: full_page, sub_page and transitive.
+ */
+union grant_entry_v2 {
+ struct grant_entry_header hdr;
+
+ /*
+ * This member is used for V1-style full page grants, where either:
+ *
+ * -- hdr.type is GTF_accept_transfer, or
+ * -- hdr.type is GTF_permit_access and GTF_sub_page is not set.
+ *
+ * In that case, the frame field has the same semantics as the
+ * field of the same name in the V1 entry structure.
+ */
+ struct {
+ struct grant_entry_header hdr;
+ uint32_t pad0;
+ uint64_t frame;
+ } full_page;
+
+ /*
+ * If the grant type is GTF_grant_access and GTF_sub_page is set,
+ * @domid is allowed to access bytes [@page_off,@page_off+@length)
+ * in frame @frame.
+ */
+ struct {
+ struct grant_entry_header hdr;
+ uint16_t page_off;
+ uint16_t length;
+ uint64_t frame;
+ } sub_page;
+
+ /*
+ * If the grant is GTF_transitive, @domid is allowed to use the
+ * grant @gref in domain @trans_domid, as if it was the local
+ * domain. Obviously, the transitive access must be compatible
+ * with the original grant.
+ */
+ struct {
+ struct grant_entry_header hdr;
+ domid_t trans_domid;
+ uint16_t pad0;
+ grant_ref_t gref;
+ } transitive;
+
+ uint32_t __spacer[4]; /* Pad to a power of two */
+};
+
+typedef uint16_t grant_status_t;
+
+/***********************************
+ * GRANT TABLE QUERIES AND USES
*/
-typedef uint32_t grant_ref_t;
/*
* Handle to track a mapping created via a grant reference.
};
DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
+/*
+ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
+ * tracked by <handle> but atomically replace the page table entry with one
+ * pointing to the machine address under <new_addr>. <new_addr> will be
+ * redirected to the null entry.
+ * NOTES:
+ * 1. The call may fail in an undefined manner if either mapping is not
+ * tracked by <handle>.
+ * 2. After executing a batch of unmaps, it is guaranteed that no stale
+ * mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_and_replace 7
+struct gnttab_unmap_and_replace {
+ /* IN parameters. */
+ uint64_t host_addr;
+ uint64_t new_addr;
+ grant_handle_t handle;
+ /* OUT parameters. */
+ int16_t status; /* GNTST_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
+
+/*
+ * GNTTABOP_set_version: Request a particular version of the grant
+ * table shared table structure. This operation can only be performed
+ * once in any given domain. It must be performed before any grants
+ * are activated; otherwise, the domain will be stuck with version 1.
+ * The only defined versions are 1 and 2.
+ */
+#define GNTTABOP_set_version 8
+struct gnttab_set_version {
+ /* IN parameters */
+ uint32_t version;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version);
+
+/*
+ * GNTTABOP_get_status_frames: Get the list of frames used to store grant
+ * status for <dom>. In grant format version 2, the status is separated
+ * from the other shared grant fields to allow more efficient synchronization
+ * using barriers instead of atomic cmpexch operations.
+ * <nr_frames> specify the size of vector <frame_list>.
+ * The frame addresses are returned in the <frame_list>.
+ * Only <nr_frames> addresses are returned, even if the table is larger.
+ * NOTES:
+ * 1. <dom> may be specified as DOMID_SELF.
+ * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
+ */
+#define GNTTABOP_get_status_frames 9
+struct gnttab_get_status_frames {
+ /* IN parameters. */
+ uint32_t nr_frames;
+ domid_t dom;
+ /* OUT parameters. */
+ int16_t status; /* GNTST_* */
+ GUEST_HANDLE(uint64_t) frame_list;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames);
+
+/*
+ * GNTTABOP_get_version: Get the grant table version which is in
+ * effect for domain <dom>.
+ */
+#define GNTTABOP_get_version 10
+struct gnttab_get_version {
+ /* IN parameters */
+ domid_t dom;
+ uint16_t pad;
+ /* OUT parameters */
+ uint32_t version;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
+
/*
* Bitfield values for update_pin_status.flags.
*/
} u;
};
+DEFINE_GUEST_HANDLE(u64);
+
#else /* __ASSEMBLY__ */
/* In assembly code we cannot use C numeric constant suffixes. */
--- /dev/null
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/xenbus_backend.
+ *
+ * Copyright (c) 2011 Bastian Blank <waldi@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_XEN_XENBUS_DEV_H__
+#define __LINUX_XEN_XENBUS_DEV_H__
+
+#include <linux/ioctl.h>
+
+#define IOCTL_XENBUS_BACKEND_EVTCHN \
+ _IOC(_IOC_NONE, 'B', 0, 0)
+
+#endif /* __LINUX_XEN_XENBUS_DEV_H__ */