/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/
#include "fuse_i.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);

static kmem_cache_t *fuse_req_cachep;
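
/*
 * Look up the connection attached to a device file.  Returns NULL if
 * the filesystem is no longer mounted (checked under fuse_lock).
 */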
static inline struct fuse_conn *fuse_get_conn(struct file *file)
{
        struct fuse_conn *fc;

        spin_lock(&fuse_lock);
        fc = file->private_data;
        if (fc && !fc->mounted)
                fc = NULL;
        spin_unlock(&fuse_lock);
        return fc;
}
static inline void fuse_request_init(struct fuse_req *req)
{
        memset(req, 0, sizeof(*req));
        INIT_LIST_HEAD(&req->list);
        init_waitqueue_head(&req->waitq);
        atomic_set(&req->count, 1);
}
struct fuse_req *fuse_request_alloc(void)
{
        struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
        if (req)
                fuse_request_init(req);
        return req;
}
void fuse_request_free(struct fuse_req *req)
{
        kmem_cache_free(fuse_req_cachep, req);
}
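
/* Block all signals except SIGKILL for the current task */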
static inline void block_sigs(sigset_t *oldset)
{
        sigset_t mask;

        siginitsetinv(&mask, sigmask(SIGKILL));
        sigprocmask(SIG_BLOCK, &mask, oldset);
}
static inline void restore_sigs(sigset_t *oldset)
{
        sigprocmask(SIG_SETMASK, oldset, NULL);
}
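
/* Reinitialize a request for reuse, preserving only its preallocated flag */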
void fuse_reset_request(struct fuse_req *req)
{
        int preallocated = req->preallocated;
        BUG_ON(atomic_read(&req->count) != 1);
        fuse_request_init(req);
        req->preallocated = preallocated;
}
static void __fuse_get_request(struct fuse_req *req)
{
        atomic_inc(&req->count);
}

/* Must be called with > 1 refcount */
static void __fuse_put_request(struct fuse_req *req)
{
        BUG_ON(atomic_read(&req->count) < 2);
        atomic_dec(&req->count);
}
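
/*
 * Take a request off the unused list and fill in the credentials of
 * the calling task.  The caller must already have down()ed
 * outstanding_sem, which guarantees that the list is non-empty.
 */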
static struct fuse_req *do_get_request(struct fuse_conn *fc)
{
        struct fuse_req *req;

        spin_lock(&fuse_lock);
        BUG_ON(list_empty(&fc->unused_list));
        req = list_entry(fc->unused_list.next, struct fuse_req, list);
        list_del_init(&req->list);
        spin_unlock(&fuse_lock);
        fuse_request_init(req);
        req->preallocated = 1;
        req->in.h.uid = current->fsuid;
        req->in.h.gid = current->fsgid;
        req->in.h.pid = current->pid;
        return req;
}
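
/* Reserve a request; returns NULL if interrupted by a signal */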
struct fuse_req *fuse_get_request(struct fuse_conn *fc)
{
        if (down_interruptible(&fc->outstanding_sem))
                return NULL;
        return do_get_request(fc);
}
/*
 * Non-interruptible version of the above function is for operations
 * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
 * return NULL, but only in case the signal is SIGKILL.
 */
struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc)
{
        int intr;
        sigset_t oldset;

        block_sigs(&oldset);
        intr = down_interruptible(&fc->outstanding_sem);
        restore_sigs(&oldset);
        return intr ? NULL : do_get_request(fc);
}
static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
{
        spin_lock(&fuse_lock);
        if (req->preallocated)
                list_add(&req->list, &fc->unused_list);
        else
                fuse_request_free(req);

        /* If we are in debt decrease that first */
        if (fc->outstanding_debt)
                fc->outstanding_debt--;
        else
                up(&fc->outstanding_sem);
        spin_unlock(&fuse_lock);
}
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
        if (atomic_dec_and_test(&req->count))
                fuse_putback_request(fc, req);
}
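
/*
 * Drop the inode and file references taken by background_request()
 * and unlink the request from fc->background.
 */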
void fuse_release_background(struct fuse_req *req)
{
        iput(req->inode);
        iput(req->inode2);
        if (req->file)
                fput(req->file);
        spin_lock(&fuse_lock);
        list_del(&req->bg_entry);
        spin_unlock(&fuse_lock);
}
/*
 * This function is called when a request is finished.  Either a reply
 * has arrived or it was interrupted (and not yet sent) or some error
 * occurred during communication with userspace, or the device file was
 * closed.  It decreases the reference count for the request.  In case
 * of a background request the references to the stored objects are
 * released.  The requester thread is woken up (if still waiting), and
 * finally the request is either freed or put on the unused_list.
 *
 * Called with fuse_lock, unlocks it
 */
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
{
        int putback;
        req->finished = 1;
        putback = atomic_dec_and_test(&req->count);
        spin_unlock(&fuse_lock);
        if (req->background) {
                down_read(&fc->sbput_sem);
                if (fc->mounted)
                        fuse_release_background(req);
                up_read(&fc->sbput_sem);
        }
        wake_up(&req->waitq);
        if (req->in.h.opcode == FUSE_INIT) {
                int i;

                if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
                        fc->conn_error = 1;

                /* After INIT reply is received other requests can go
                   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
                   up()s on outstanding_sem.  The last up() is done in
                   fuse_putback_request() */
                for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
                        up(&fc->outstanding_sem);
        }
        if (putback)
                fuse_putback_request(fc, req);
}
/*
 * Unfortunately request interruption not just solves the deadlock
 * problem, it causes problems too.  These stem from the fact that an
 * interrupted request continues to be processed in userspace, while
 * all the locks and object references (inode and file) held during
 * the operation have been released.
 *
 * Releasing the locks is exactly why the request needs to be
 * interruptible, so there's not a lot that can be done about this,
 * except introduce additional locking in userspace.
 *
 * More important is to keep inode and file references until userspace
 * has replied, otherwise FORGET and RELEASE could be sent while the
 * inode/file is still used by the filesystem.
 *
 * For this reason the concept of "background" request is introduced.
 * An interrupted request is backgrounded if it has already been sent
 * to userspace.  Backgrounding involves getting an extra reference to
 * the inode(s) or file used in the request, and adding the request to
 * the fc->background list.  When a reply is received for a background
 * request, the object references are released, and the request is
 * removed from the list.  If the filesystem is unmounted while there
 * are still background requests, the list is walked and references
 * are released as if a reply was received.
 *
 * There's one more use for a background request.  The RELEASE message is
 * always sent as background, since it doesn't return an error or
 * inode.
 */
static void background_request(struct fuse_conn *fc, struct fuse_req *req)
{
        req->background = 1;
        list_add(&req->bg_entry, &fc->background);
        if (req->inode)
                req->inode = igrab(req->inode);
        if (req->inode2)
                req->inode2 = igrab(req->inode2);
        if (req->file)
                get_file(req->file);
}
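
/* Wait for the request to finish, allowing only SIGKILL to interrupt */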
static int request_wait_answer_nonint(struct fuse_req *req)
{
        int err;
        sigset_t oldset;

        block_sigs(&oldset);
        err = wait_event_interruptible(req->waitq, req->finished);
        restore_sigs(&oldset);
        return err;
}
/* Called with fuse_lock held.  Releases, and then reacquires it. */
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req,
                                int interruptible)
{
        int intr;

        spin_unlock(&fuse_lock);
        if (interruptible)
                intr = wait_event_interruptible(req->waitq, req->finished);
        else
                intr = request_wait_answer_nonint(req);
        spin_lock(&fuse_lock);
        if (intr && interruptible && req->sent) {
                /* If request is already in userspace, only allow KILL
                   signal to interrupt */
                spin_unlock(&fuse_lock);
                intr = request_wait_answer_nonint(req);
                spin_lock(&fuse_lock);
        }
        if (!intr)
                return;

        if (!interruptible || req->sent)
                req->out.h.error = -EINTR;
        else
                req->out.h.error = -ERESTARTNOINTR;

        req->interrupted = 1;
        if (req->locked) {
                /* This is uninterruptible sleep, because data is
                   being copied to/from the buffers of req.  During
                   locked state, there mustn't be any filesystem
                   operation (e.g. page fault), since that could lead
                   to deadlock */
                spin_unlock(&fuse_lock);
                wait_event(req->waitq, !req->locked);
                spin_lock(&fuse_lock);
        }
        if (!req->sent && !list_empty(&req->list)) {
                list_del(&req->list);
                __fuse_put_request(req);
        } else if (!req->finished && req->sent)
                background_request(fc, req);
}
static unsigned len_args(unsigned numargs, struct fuse_arg *args)
{
        unsigned nbytes = 0;
        unsigned i;

        for (i = 0; i < numargs; i++)
                nbytes += args[i].size;

        return nbytes;
}
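
/*
 * Assign the request a unique ID, account it against outstanding_sem
 * and put it on the pending list, waking up any reader of the device.
 */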
static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
{
        fc->reqctr++;
        /* zero is special */
        if (fc->reqctr == 0)
                fc->reqctr = 1;
        req->in.h.unique = fc->reqctr;
        req->in.h.len = sizeof(struct fuse_in_header) +
                len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
        if (!req->preallocated) {
                /* If request is not preallocated (either FORGET or
                   RELEASE), then still decrease outstanding_sem, so
                   user can't open infinite number of files while not
                   processing the RELEASE requests.  However for
                   efficiency do it without blocking, so if down()
                   would block, just increase the debt instead */
                if (down_trylock(&fc->outstanding_sem))
                        fc->outstanding_debt++;
        }
        list_add_tail(&req->list, &fc->pending);
        wake_up(&fc->waitq);
}
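
/*
 * Queue the request and wait for the answer.  If the connection is
 * down or in error, the failure is reported in the reply header
 * (-ENOTCONN or -ECONNREFUSED) without queuing anything.
 */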
static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
                              int interruptible)
{
        req->isreply = 1;
        spin_lock(&fuse_lock);
        if (!fc->connected)
                req->out.h.error = -ENOTCONN;
        else if (fc->conn_error)
                req->out.h.error = -ECONNREFUSED;
        else {
                queue_request(fc, req);
                /* acquire extra reference, since request is still needed
                   after request_end() */
                __fuse_get_request(req);

                request_wait_answer(fc, req, interruptible);
        }
        spin_unlock(&fuse_lock);
}
void request_send(struct fuse_conn *fc, struct fuse_req *req)
{
        request_send_wait(fc, req, 1);
}
/*
 * Non-interruptible version of the above function is for operations
 * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
 * be interrupted but only with SIGKILL.
 */
void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
{
        request_send_wait(fc, req, 0);
}
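
/* Queue a request without waiting for the answer */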
static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
        spin_lock(&fuse_lock);
        if (fc->connected) {
                queue_request(fc, req);
                spin_unlock(&fuse_lock);
        } else {
                req->out.h.error = -ENOTCONN;
                request_end(fc, req);
        }
}
void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
{
        req->isreply = 0;
        request_send_nowait(fc, req);
}
void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
        req->isreply = 1;
        spin_lock(&fuse_lock);
        background_request(fc, req);
        spin_unlock(&fuse_lock);
        request_send_nowait(fc, req);
}
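
/*
 * Send the INIT request, which negotiates the protocol version with
 * userspace.  The reply is checked in request_end(): a major version
 * mismatch puts the connection into the error state.
 */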
void fuse_send_init(struct fuse_conn *fc)
{
        /* This is called from fuse_read_super() so there's guaranteed
           to be a request available */
        struct fuse_req *req = do_get_request(fc);
        struct fuse_init_in_out *arg = &req->misc.init_in_out;
        arg->major = FUSE_KERNEL_VERSION;
        arg->minor = FUSE_KERNEL_MINOR_VERSION;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
        req->in.args[0].value = arg;
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(*arg);
        req->out.args[0].value = arg;
        request_send_background(fc, req);
}
/*
 * Lock the request.  Up to the next unlock_request() there mustn't be
 * anything that could cause a page-fault.  If the request was already
 * interrupted bail out.
 */
static inline int lock_request(struct fuse_req *req)
{
        int err = 0;
        if (req) {
                spin_lock(&fuse_lock);
                if (req->interrupted)
                        err = -ENOENT;
                else
                        req->locked = 1;
                spin_unlock(&fuse_lock);
        }
        return err;
}
/*
 * Unlock request.  If it was interrupted during being locked, the
 * requester thread is currently waiting for it to be unlocked, so
 * wake it up.
 */
static inline void unlock_request(struct fuse_req *req)
{
        if (req) {
                spin_lock(&fuse_lock);
                req->locked = 0;
                if (req->interrupted)
                        wake_up(&req->waitq);
                spin_unlock(&fuse_lock);
        }
}
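
/*
 * State for copying between the request structure and a userspace
 * iovec, one mapped page at a time (refilled by fuse_copy_fill()).
 */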
struct fuse_copy_state {
        int write;
        struct fuse_req *req;
        const struct iovec *iov;
        unsigned long nr_segs;
        unsigned long seglen;
        unsigned long addr;
        struct page *pg;
        void *mapaddr;
        void *buf;
        unsigned len;
};
static void fuse_copy_init(struct fuse_copy_state *cs, int write,
                           struct fuse_req *req, const struct iovec *iov,
                           unsigned long nr_segs)
{
        memset(cs, 0, sizeof(*cs));
        cs->write = write;
        cs->req = req;
        cs->iov = iov;
        cs->nr_segs = nr_segs;
}
/* Unmap and put previous page of userspace buffer */
static inline void fuse_copy_finish(struct fuse_copy_state *cs)
{
        if (cs->mapaddr) {
                kunmap_atomic(cs->mapaddr, KM_USER0);
                if (cs->write) {
                        flush_dcache_page(cs->pg);
                        set_page_dirty_lock(cs->pg);
                }
                put_page(cs->pg);
                cs->mapaddr = NULL;
        }
}
/*
 * Get another pagefull of userspace buffer, and map it to kernel
 * address space, and lock request
 */
static int fuse_copy_fill(struct fuse_copy_state *cs)
{
        unsigned long offset;
        int err;

        unlock_request(cs->req);
        fuse_copy_finish(cs);
        if (!cs->seglen) {
                BUG_ON(!cs->nr_segs);
                cs->seglen = cs->iov[0].iov_len;
                cs->addr = (unsigned long) cs->iov[0].iov_base;
                cs->iov++;
                cs->nr_segs--;
        }
        down_read(&current->mm->mmap_sem);
        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
                             &cs->pg, NULL);
        up_read(&current->mm->mmap_sem);
        if (err < 0)
                return err;
        BUG_ON(err != 1);
        offset = cs->addr % PAGE_SIZE;
        cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
        cs->buf = cs->mapaddr + offset;
        cs->len = min(PAGE_SIZE - offset, cs->seglen);
        cs->seglen -= cs->len;
        cs->addr += cs->len;

        return lock_request(cs->req);
}
/* Do as much copy to/from userspace buffer as we can */
static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
                               unsigned *size)
{
        unsigned ncpy = min(*size, cs->len);
        if (val) {
                if (cs->write)
                        memcpy(cs->buf, *val, ncpy);
                else
                        memcpy(*val, cs->buf, ncpy);
                *val += ncpy;
        }
        *size -= ncpy;
        cs->len -= ncpy;
        cs->buf += ncpy;
        return ncpy;
}
/*
 * Copy a page in the request to/from the userspace buffer.  Must be
 * done atomically
 */
static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
                                 unsigned offset, unsigned count, int zeroing)
{
        if (page && zeroing && count < PAGE_SIZE) {
                void *mapaddr = kmap_atomic(page, KM_USER1);
                memset(mapaddr, 0, PAGE_SIZE);
                kunmap_atomic(mapaddr, KM_USER1);
        }
        while (count) {
                int err;
                if (!cs->len && (err = fuse_copy_fill(cs)))
                        return err;
                if (page) {
                        void *mapaddr = kmap_atomic(page, KM_USER1);
                        void *buf = mapaddr + offset;
                        offset += fuse_copy_do(cs, &buf, &count);
                        kunmap_atomic(mapaddr, KM_USER1);
                } else
                        offset += fuse_copy_do(cs, NULL, &count);
        }
        if (page && !cs->write)
                flush_dcache_page(page);
        return 0;
}
/* Copy pages in the request to/from userspace buffer */
static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
                           int zeroing)
{
        unsigned i;
        struct fuse_req *req = cs->req;
        unsigned offset = req->page_offset;
        unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);

        for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
                struct page *page = req->pages[i];
                int err = fuse_copy_page(cs, page, offset, count, zeroing);
                if (err)
                        return err;

                nbytes -= count;
                count = min(nbytes, (unsigned) PAGE_SIZE);
                offset = 0;
        }
        return 0;
}
/* Copy a single argument in the request to/from userspace buffer */
static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
{
        while (size) {
                int err;
                if (!cs->len && (err = fuse_copy_fill(cs)))
                        return err;
                fuse_copy_do(cs, &val, &size);
        }
        return 0;
}
/* Copy request arguments to/from userspace buffer */
static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
                          unsigned argpages, struct fuse_arg *args,
                          int zeroing)
{
        int err = 0;
        unsigned i;

        for (i = 0; !err && i < numargs; i++) {
                struct fuse_arg *arg = &args[i];
                if (i == numargs - 1 && argpages)
                        err = fuse_copy_pages(cs, arg->size, zeroing);
                else
                        err = fuse_copy_one(cs, arg->value, arg->size);
        }
        return err;
}
/* Wait until a request is available on the pending list */
static void request_wait(struct fuse_conn *fc)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue_exclusive(&fc->waitq, &wait);
        while (fc->mounted && list_empty(&fc->pending)) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (signal_pending(current))
                        break;

                spin_unlock(&fuse_lock);
                schedule();
                spin_lock(&fuse_lock);
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&fc->waitq, &wait);
}
/*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
 * the pending list and copies request data to the userspace buffer.  If
 * no reply is needed (FORGET) or the request has been interrupted or
 * there was an error during the copying then it's finished by calling
 * request_end().  Otherwise add it to the processing list, and set
 * the 'sent' flag.
 */
static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
                              unsigned long nr_segs, loff_t *off)
{
        int err;
        struct fuse_conn *fc;
        struct fuse_req *req;
        struct fuse_in *in;
        struct fuse_copy_state cs;
        unsigned reqsize;

        spin_lock(&fuse_lock);
        fc = file->private_data;
        err = -EPERM;
        if (!fc)
                goto err_unlock;
        request_wait(fc);
        err = -ENODEV;
        if (!fc->mounted)
                goto err_unlock;
        err = -ERESTARTSYS;
        if (list_empty(&fc->pending))
                goto err_unlock;

        req = list_entry(fc->pending.next, struct fuse_req, list);
        list_del_init(&req->list);
        spin_unlock(&fuse_lock);

        in = &req->in;
        reqsize = req->in.h.len;
        fuse_copy_init(&cs, 1, req, iov, nr_segs);
        err = -EINVAL;
        if (iov_length(iov, nr_segs) >= reqsize) {
                err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
                if (!err)
                        err = fuse_copy_args(&cs, in->numargs, in->argpages,
                                             (struct fuse_arg *) in->args, 0);
        }
        fuse_copy_finish(&cs);

        spin_lock(&fuse_lock);
        req->locked = 0;
        if (!err && req->interrupted)
                err = -ENOENT;
        if (err) {
                if (!req->interrupted)
                        req->out.h.error = -EIO;
                request_end(fc, req);
                return err;
        }
        if (!req->isreply)
                request_end(fc, req);
        else {
                req->sent = 1;
                list_add_tail(&req->list, &fc->processing);
                spin_unlock(&fuse_lock);
        }
        return reqsize;

 err_unlock:
        spin_unlock(&fuse_lock);
        return err;
}
static ssize_t fuse_dev_read(struct file *file, char __user *buf,
                             size_t nbytes, loff_t *off)
{
        struct iovec iov;
        iov.iov_len = nbytes;
        iov.iov_base = buf;
        return fuse_dev_readv(file, &iov, 1, off);
}
/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
{
        struct list_head *entry;

        list_for_each(entry, &fc->processing) {
                struct fuse_req *req;
                req = list_entry(entry, struct fuse_req, list);
                if (req->in.h.unique == unique)
                        return req;
        }
        return NULL;
}
static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
                         unsigned nbytes)
{
        unsigned reqsize = sizeof(struct fuse_out_header);

        if (out->h.error)
                return nbytes != reqsize ? -EINVAL : 0;

        reqsize += len_args(out->numargs, out->args);

        if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
                return -EINVAL;
        else if (reqsize > nbytes) {
                struct fuse_arg *lastarg = &out->args[out->numargs-1];
                unsigned diffsize = reqsize - nbytes;
                if (diffsize > lastarg->size)
                        return -EINVAL;
                lastarg->size -= diffsize;
        }
        return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
                              out->page_zeroing);
}
/*
 * Write a single reply to a request.  First the header is copied from
 * the write buffer.  The request is then searched on the processing
 * list by the unique ID found in the header.  If found, then remove
 * it from the list and copy the rest of the buffer to the request.
 * The request is finished by calling request_end()
 */
static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
                               unsigned long nr_segs, loff_t *off)
{
        int err;
        unsigned nbytes = iov_length(iov, nr_segs);
        struct fuse_req *req;
        struct fuse_out_header oh;
        struct fuse_copy_state cs;
        struct fuse_conn *fc = fuse_get_conn(file);
        if (!fc)
                return -ENODEV;

        fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
        if (nbytes < sizeof(struct fuse_out_header))
                return -EINVAL;

        err = fuse_copy_one(&cs, &oh, sizeof(oh));
        if (err)
                goto err_finish;
        err = -EINVAL;
        if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
            oh.len != nbytes)
                goto err_finish;

        spin_lock(&fuse_lock);
        req = request_find(fc, oh.unique);
        err = -EINVAL;
        if (!req)
                goto err_unlock;

        list_del_init(&req->list);
        if (req->interrupted) {
                request_end(fc, req);
                fuse_copy_finish(&cs);
                return -ENOENT;
        }
        req->out.h = oh;
        req->locked = 1;
        cs.req = req;
        spin_unlock(&fuse_lock);

        err = copy_out_args(&cs, &req->out, nbytes);
        fuse_copy_finish(&cs);

        spin_lock(&fuse_lock);
        req->locked = 0;
        if (!err) {
                if (req->interrupted)
                        err = -ENOENT;
        } else if (!req->interrupted)
                req->out.h.error = -EIO;
        request_end(fc, req);

        return err ? err : nbytes;

 err_unlock:
        spin_unlock(&fuse_lock);
 err_finish:
        fuse_copy_finish(&cs);
        return err;
}
static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
                              size_t nbytes, loff_t *off)
{
        struct iovec iov;
        iov.iov_len = nbytes;
        iov.iov_base = (char __user *) buf;
        return fuse_dev_writev(file, &iov, 1, off);
}
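
/*
 * A minimal sketch of the userspace side of this read/write protocol
 * (BUFSIZE and handle_request() are hypothetical; a real daemon would
 * use the FUSE library):
 *
 *      char buf[BUFSIZE], reply[BUFSIZE];
 *      for (;;) {
 *              ssize_t n = read(fd, buf, sizeof(buf));   // one request
 *              if (n < 0)
 *                      break;
 *              // handle_request() builds a reply starting with a
 *              // struct fuse_out_header whose 'unique' field matches
 *              // the received struct fuse_in_header
 *              size_t replylen = handle_request(buf, n, reply);
 *              write(fd, reply, replylen);               // one reply
 *      }
 */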
static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
{
        struct fuse_conn *fc = fuse_get_conn(file);
        unsigned mask = POLLOUT | POLLWRNORM;

        if (!fc)
                return -ENODEV;

        poll_wait(file, &fc->waitq, wait);

        spin_lock(&fuse_lock);
        if (!list_empty(&fc->pending))
                mask |= POLLIN | POLLRDNORM;
        spin_unlock(&fuse_lock);

        return mask;
}
/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
        while (!list_empty(head)) {
                struct fuse_req *req;
                req = list_entry(head->next, struct fuse_req, list);
                list_del_init(&req->list);
                req->out.h.error = -ECONNABORTED;
                request_end(fc, req);
                spin_lock(&fuse_lock);
        }
}
static int fuse_dev_release(struct inode *inode, struct file *file)
{
        struct fuse_conn *fc;

        spin_lock(&fuse_lock);
        fc = file->private_data;
        if (fc) {
                fc->connected = 0;
                end_requests(fc, &fc->pending);
                end_requests(fc, &fc->processing);
                fuse_release_conn(fc);
        }
        spin_unlock(&fuse_lock);
        return 0;
}
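
/* Operations for the 'fuse' character device (misc minor FUSE_MINOR) */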
struct file_operations fuse_dev_operations = {
        .owner          = THIS_MODULE,
        .llseek         = no_llseek,
        .read           = fuse_dev_read,
        .readv          = fuse_dev_readv,
        .write          = fuse_dev_write,
        .writev         = fuse_dev_writev,
        .poll           = fuse_dev_poll,
        .release        = fuse_dev_release,
};
static struct miscdevice fuse_miscdevice = {
        .minor = FUSE_MINOR,
        .name  = "fuse",
        .fops = &fuse_dev_operations,
};
int __init fuse_dev_init(void)
{
        int err = -ENOMEM;
        fuse_req_cachep = kmem_cache_create("fuse_request",
                                            sizeof(struct fuse_req),
                                            0, 0, NULL, NULL);
        if (!fuse_req_cachep)
                goto out;

        err = misc_register(&fuse_miscdevice);
        if (err)
                goto out_cache_clean;

        return 0;

 out_cache_clean:
        kmem_cache_destroy(fuse_req_cachep);
 out:
        return err;
}
void fuse_dev_cleanup(void)
{
        misc_deregister(&fuse_miscdevice);
        kmem_cache_destroy(fuse_req_cachep);
}